# P01: Process notes

## 1. Basic setup

In [1]:
import os
import json
import csv
import re

from openai import OpenAI
from dotenv import load_dotenv

##########
# Config #
##########

# Populate os.environ before the client is built.
# NOTE(review): presumably reads a local `.env` file via python-dotenv — confirm.
load_dotenv()
# Shared API client used by extract_vocab_from_notes; the key is read from the
# environment rather than being hard-coded in the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Path constants for the notebook.
# NOTE(review): presumably the notes folder and the vocab CSV; they hold the
# same values as `data_dir` / `vocab_csv` used by later cells — confirm.
DATA_FOLDER = "data"
VOCAB_OUTPUT = "vocab.csv"

###################
# Vocab extractor #
###################

def extract_vocab_from_notes(notes_text):
    """Ask the chat model to extract French vocab from raw lesson notes.

    Args:
        notes_text: Raw text of one notes file. It may contain no French at all.

    Returns:
        The model's reply as a string, requested to be one word/phrase per
        line with no extra text. Always a ``str``: returns "" when the notes
        are blank or when the model returns no content, so callers can split
        the result on newlines without a None check.
    """
    # Blank notes cannot contain vocab; skip the (paid, slow) API round-trip.
    if not notes_text or not notes_text.strip():
        return ""

    prompt = f"""
Here are my messy French notes from lessons. Note that there might not actually be any French in here! If it looks like there's no vocab, just return an empty string. Sometimes there isn't—some files aren't relevant.

But if there is, please extract each French word/concept/phrase.  
For each one, return JUST the word/concept/phrase—but if there's a specific or unusual English usage (for example an ambiguity), add that in brackets after the word/phrase.

Return a \\n-separated string. Each word/phrase on its own line. NO EXTRA TEXT. I'll process it later.

Here are my notes:
{notes_text}
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful French language assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    # `message.content` is optional in the SDK (e.g. on a refusal); normalise
    # None to "" so downstream `.split('\n')` never fails.
    return response.choices[0].message.content or ""


Example:

In [10]:
# Sample of raw lesson notes, used to sanity-check the extractor on one input.
text = '''souviens

proche de chez moi

C'était une vraie bénédiction
autrice / autrice

bonne note

confus - confusing

l'info de trop

c'est tres lent
ça va lentement

elle m'a demandé

on a eu

je ne la laisserai gagner
mechant

terminé

Don Juan

j'ai rien de prevu

ruisseau

abri - habitat

parmi - entre

rapide - fast
vite - quickly

brûlant - boiling

fragile / solide
sombre / illuminé / clair
pauvre / riche / aisé

cruel / charitable / empathique / bienveillant

rugueux / doux / douce

agité / calme

sale / propre
fermé / ouvert

metier - trades

architecte

institutrice / inst

mason - builder

- -> comptable'''

# Run the extractor on the sample (this calls the API) and display the raw reply.
content = extract_vocab_from_notes(text)
content

"souviens  \nproche de chez moi  \nC'était une vraie bénédiction  \nautrice  \nbonne note  \nconfus (confusing)  \nl'info de trop  \nc'est tres lent  \nça va lentement  \nelle m'a demandé  \non a eu  \nje ne la laisserai gagner  \nmechant  \nterminé  \nj'ai rien de prevu  \nruisseau  \nabri (habitat)  \nparmi (entre)  \nrapide (fast)  \nvite (quickly)  \nbrûlant (boiling)  \nfragile  \nsolide  \nsombre  \nilluminé  \nclair  \npauvre  \nriche  \naisé  \ncruel  \ncharitable  \nempathique  \nbienveillant  \nrugueux  \ndoux  \ndouce  \nagité  \ncalme  \nsale  \npropre  \nfermé  \nouvert  \nmetier (trades)  \narchitecte  \ninstitutrice  \ninst  \nmason (builder)  \ncomptable"

In [12]:
# Break the raw reply into one whitespace-trimmed entry per line.
items = list(map(str.strip, content.split('\n')))
items

['souviens',
 'proche de chez moi',
 "C'était une vraie bénédiction",
 'autrice',
 'bonne note',
 'confus (confusing)',
 "l'info de trop",
 "c'est tres lent",
 'ça va lentement',
 "elle m'a demandé",
 'on a eu',
 'je ne la laisserai gagner',
 'mechant',
 'terminé',
 "j'ai rien de prevu",
 'ruisseau',
 'abri (habitat)',
 'parmi (entre)',
 'rapide (fast)',
 'vite (quickly)',
 'brûlant (boiling)',
 'fragile',
 'solide',
 'sombre',
 'illuminé',
 'clair',
 'pauvre',
 'riche',
 'aisé',
 'cruel',
 'charitable',
 'empathique',
 'bienveillant',
 'rugueux',
 'doux',
 'douce',
 'agité',
 'calme',
 'sale',
 'propre',
 'fermé',
 'ouvert',
 'metier (trades)',
 'architecte',
 'institutrice',
 'inst',
 'mason (builder)',
 'comptable']

## 2. Read them all and save the results...

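Keep track of the files we've already processed (each result is cached in a `.processed.pkl` file next to the note), so they aren't sent to the API again...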

In [14]:
import glob
import pickle
import os

# Suffix of the per-file cache holding the extractor's reply for that note.
processed_suffix = '.processed.pkl'
# Reuse the configured notes folder instead of repeating the literal.
data_dir = DATA_FOLDER

all_vocab = []

# glob returns paths in arbitrary order; sorting keeps runs (and logs) reproducible.
for file in sorted(glob.glob(os.path.join(data_dir, '**/*'), recursive=True)):

    # Only note-like text files are processed; this also skips the .pkl caches.
    if not file.endswith(('.txt', '.md', '.csv')):
        continue

    processed_file = f'{file}{processed_suffix}'

    if not os.path.exists(processed_file):
        print(f'Reading {file}')
        # Explicit UTF-8: the locale default would mangle accented characters
        # on platforms where it is not UTF-8.
        with open(file, 'r', encoding='utf-8') as notes_file:
            content = notes_file.read()
        content_processed = extract_vocab_from_notes(content)
        print(content_processed)
        with open(processed_file, 'wb') as cache_file:
            pickle.dump(content_processed, cache_file)

    else:
        print(f'Loading {file}...')
        # SECURITY: pickle.load can execute arbitrary code. Only load caches
        # written by this notebook, never files from an untrusted source.
        with open(processed_file, 'rb') as cache_file:
            content_processed = pickle.load(cache_file)

    # A reply with no content may be cached as None; treat it as "no vocab"
    # instead of crashing on .split() on every subsequent run.
    content_processed = content_processed or ''

    vocab = [x.strip() for x in content_processed.split('\n')]
    all_vocab += vocab

Loading data/caroline-2025-07-08/Week 25 9ea18f08c5214151a7ec55b20f4f1102.md...
Loading data/caroline-2025-07-08/Week 27 21c33c653adb80ec9390fe7bda0a492a.md...
Loading data/caroline-2025-07-08/Week 44 13033c653adb80c8bbc3ed592a702628.md...
Loading data/caroline-2025-07-08/Week 32 cabcd9762cbe40f1be8b8c6a3ada84b1.md...
Loading data/caroline-2025-07-08/Week 45 13733c653adb80f589fcd78f4e998407.md...
Loading data/caroline-2025-07-08/Week 10 1b333c653adb8040be5de722982e9201.md...
Loading data/caroline-2025-07-08/Week 48 14933c653adb80de9783eeaf64a419e1.md...
Loading data/caroline-2025-07-08/Week 28 2adc4fb095ef44f6826005e294c427d4.md...
Loading data/caroline-2025-07-08/Week 3 17b33c653adb8061b488fc2196c77287.md...
Loading data/caroline-2025-07-08/Week 46 13c33c653adb80e69df6d47730617b37.md...
Loading data/caroline-2025-07-08/Week 10 d8f31d450a7e4c4f824aa1416b584375.md...
Loading data/caroline-2025-07-08/Week 17 1e433c653adb80d5b39ccc771ae89f7c.md...
Loading data/caroline-2025-07-08/Week 41 

Now merge with the existing vocab CSV...

In [16]:
import pandas as pd

def merge_vocab_list_to_csv(vocab_list, csv_file):
    """Merge new vocab entries into the vocab CSV, creating the file if needed.

    Args:
        vocab_list: Iterable of entries (words/phrases). Falsy items are
            skipped, surrounding whitespace is stripped, and duplicates are
            dropped while keeping first-seen order.
        csv_file: Path of the CSV to update. An existing file must have an
            'Entry' column; a new file gets the columns 'Entry', 'Notes' and
            'Mastery Score'.

    Side effects:
        Rewrites ``csv_file`` (UTF-8, no index column) and prints a summary.
        Existing rows are kept; new entries are appended with empty notes and
        a mastery score of 0.
    """
    # Load existing vocab if the file exists
    if os.path.exists(csv_file):
        df = pd.read_csv(csv_file)
    else:
        df = pd.DataFrame(columns=['Entry', 'Notes', 'Mastery Score'])

    # Normalize the incoming list. dict.fromkeys de-duplicates while preserving
    # first-seen order; a set would make the CSV row order vary between runs
    # (string hashing is randomised per process).
    stripped = (str(item).strip() for item in vocab_list if item)
    incoming = dict.fromkeys(word for word in stripped if word)
    existing_entries = set(df['Entry'].astype(str))

    # Find new words to add, in the order they were first seen
    new_words = [word for word in incoming if word not in existing_entries]
    new_rows = pd.DataFrame({
        'Entry': new_words,
        'Notes': '',
        'Mastery Score': 0
    })

    # Append and remove any accidental duplicates
    combined_df = pd.concat([df, new_rows], ignore_index=True).drop_duplicates(subset=['Entry'])

    # Save back to CSV
    combined_df.to_csv(csv_file, index=False, encoding='utf-8')
    print(f"Added {len(new_words)} new words. Total vocab size: {len(combined_df)}.")

# Reuse the configured output path instead of repeating the 'vocab.csv' literal.
vocab_csv = VOCAB_OUTPUT
merge_vocab_list_to_csv(all_vocab, vocab_csv)

Added 1307 new words. Total vocab size: 1307.


Now time for the chatbot!