# Walk over markdown files in ../memory, extract links, and store in dataframe

In [2]:
import os
import pandas as pd
import re

# Define a function to extract tags and links from markdown file text
def extract_tags_links(text):
    links = re.findall(r"\[\[.*?\]\]", text)
    links = [link.strip("[[").strip("]]") for link in links]
    return links

# Create an empty list to store the results
data = []

# Loop through each markdown file in the memory folder
for filename in os.listdir("../data/memory"):
    if filename.endswith(".md"):
        # Read the file text
        with open(os.path.join("../data/memory", filename), "r") as f:
            text = f.read()

        # Extract the tags and links from the text
        links = extract_tags_links(text)

        # Add a dictionary to the list, removed md extension from filename
        data.append({"filename": filename[:-3], "text": text, "links": links})

# Convert the list of dictionaries to a dataframe
df = pd.DataFrame(data, columns=["filename", "text", "links"])

# Save the dataframe to a CSV file
df.to_csv("../data/memory_dataframe.csv", index=False)

# Print the resulting dataframe
df

Unnamed: 0,filename,text,links
0,Heretics of Dune,# Heretics of Dune\n\n![rw-book-cover](https:/...,"[Frank Herbert, Books, Psychedelics, SciFi]"
1,CSC 581,"\n---\n**Status::** #🗺️ \n**Tags::** [[MOC]], ...","[MOC, Winter 2023]"
2,Tensorflow Mac M1,## AAAh\n[Good post on SO](https://stackoverfl...,[Programming Notes]
3,Snowflake,## Snowflake\n\nHere's to you - You doubters o...,"[Poetry, My Writings]"
4,Ink and glass,[[What is Design?]]\n[[Strange Things (Slumber...,"[What is Design?, Strange Things (Slumber in T..."
...,...,...,...
410,Poetry,,[]
411,The Eve of Rosh Hashanah - Yehuda Amichai,[[Yehuda Amichai]]\n\n_The eve of Rosh Hashana...,"[Yehuda Amichai, Poetry]"
412,The Best I've Got,\nExplorative podcast where each episode an ep...,"[My Motivation is the Dream World, Important, ..."
413,Colonization is a bubble,"Colonization is in supply, singular ages, and ...","[Ideas, Biodiversity, Ecological Evolution]"


# Break up text into 512 token chunks and batches of length 96 - required by cohere

In [None]:
import cohere
import time
import configparser
import pandas as pd
from tqdm import tqdm


# create a ConfigParser object and read the configuration file
config = configparser.ConfigParser()
config.read('config.ini')

# get the API key from the configuration file
api_key = config.get('cohere', 'api_key')

def chunk_text(text, max_len=512):
    words = text.split()
    chunks = []
    chunk = []

    for word in words:
        if len(chunk) + len(word) + 1 <= max_len:
            chunk.append(word)
        else:
            chunks.append(' '.join(chunk))
            chunk = [word]

    if chunk:
        chunks.append(' '.join(chunk))

    return chunks

# Assuming df is the dataframe with columns 'filename' and 'text'
# and api_key is your cohere API key

co = cohere.Client(api_key)
embeddings = []

batch_size = 96
rate_limit_calls = 100
rate_limit_duration = 60

start_time = time.time()
call_count = 0

for i, row in tqdm(df.iterrows(), total=len(df)):
    text_chunks = chunk_text(row['text'])

    for chunk in text_chunks:
        embeddings.append({
            'filename': row['filename'],
            'index': i,
            'chunk_text': chunk,
            'embedding': None
        })

embedding_batches = [embeddings[i:i+batch_size] for i in range(0, len(embeddings), batch_size)]

# Embed all files and store in a csv (files broken into chunks have filename, index (1,2,n) in the column with a separate embedding for each)

In [None]:
for batch in embedding_batches:
    texts = [item['chunk_text'] for item in batch]
    response = co.embed(texts=texts, model='large', truncate='END')
    
    for i, embedding in enumerate(response.embeddings):
        batch[i]['embedding'] = embedding

    call_count += 1

    if call_count >= rate_limit_calls:
        elapsed_time = time.time() - start_time

        if elapsed_time < rate_limit_duration:
            time.sleep(rate_limit_duration - elapsed_time)

        start_time = time.time()
        call_count = 0

df_embeddings = pd.DataFrame(embeddings)

# adds links column
df_embeddings = df_embeddings.merge(df[['filename', 'links']], on='filename', how='left')
df_embeddings.to_csv('memory_embeddings.csv', index=False)