In [5]:
import pandas as pd
import ollama
import re
from numpy import nan
import os


ollama_model = 'deepseek-r1:14b'

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

In [6]:
# Load the CSV file
vocab_df = pd.read_csv('extracted-vocab.csv')

# Sort the dataframe by 'number'
vocab_df = vocab_df.sort_values(by='number')

# Add an empty 'important' column that takes booleans
vocab_df['important'] = nan

# Display the first few rows of the dataframe
vocab_df.head()

Unnamed: 0,number,origin kanji,kanji,reading,particle_before,particle_after,meaning,usefulness,components,labels,description,important
0,1,一,もう一度,もういちど,,,one more time!,★★★★★,"<a class=""component"" href=""http://www.kanjidam...",,,
5,1,一,一番,いちばん,,,number one!,★★★★☆,"<a class=""component"" href=""http://www.kanjidam...","<a class=""label label-info"" href=""http://www.k...",<p>the best! Number one!</p>,
4,1,一,一日,ついたち,,,first day of the month,★★★★☆,"<a class=""component"" href=""http://www.kanjidam...","<a class=""label label-info"" href=""http://www.k...",,
2504,1,一,一*つ,ひとつ,,,one thing,★★★★☆,,,,
2,1,一,一人,ひとり,,で,"one person, alone",★★★★★,"<a class=""component"" href=""http://www.kanjidam...","<a class=""label label-info"" href=""http://www.k...",,


In [7]:
def process_answer(answer):
    """
    Processes the answer string ensuring it strictly follows the required format:
    
    Important: word1 word2 word3
    Not-important: word4 word5

    The answer may start with (or contain) a section enclosed with <think>...</think>, which should be ignored.

    Returns a tuple of two lists: (important_words, not_important_words).

    Raises:
        ValueError: If the answer does not strictly adhere to the format.
    """
    # Remove any <think>...</think> sections (they may span multiple lines).
    answer_cleaned = re.sub(r'<think>.*?</think>', '', answer, flags=re.DOTALL).strip()
    
    # Normalize the cleaned answer: split into non-empty lines.
    lines = [line for line in answer_cleaned.splitlines() if line.strip()]
    
    # Ensure there are exactly 2 non-empty lines after removing the <think> section.
    if len(lines) != 2:
        raise ValueError("The answer must contain exactly two lines (after removing any <think> section): one starting with 'Important:' and one with 'Not-important:'.")
    
    # Define expected prefixes.
    important_prefix = "Important:"
    not_important_prefix = "Not-important:"

    # Validate first line for "Important:" prefix.
    if not lines[0].startswith(important_prefix):
        raise ValueError("The first line must start with 'Important:'.")
    
    # Validate second line for "Not-important:" prefix.
    if not lines[1].startswith(not_important_prefix):
        raise ValueError("The second line must start with 'Not-important:'.")
    
    # Extract the words from each line.
    important_words = lines[0][len(important_prefix):].strip().split()
    not_important_words = lines[1][len(not_important_prefix):].strip().split()
    
    # Edge case: Ensure that at least one word is present in the Important list.
    if not important_words:
        raise ValueError("The Important list must contain at least one word.")
    
    # Check that there are no words that appear in both lists.
    overlap = set(important_words) & set(not_important_words)
    if overlap:
        raise ValueError(f"The following words appear in both lists: {', '.join(overlap)}")
    
    # (Optional) Additional validation: Ensure words only contain non-whitespace characters.
    word_pattern = re.compile(r"^\S+$")
    for word in important_words + not_important_words:
        if not word_pattern.match(word):
            raise ValueError(f"Word '{word}' is in an invalid format (contains whitespace or is empty).")
    
    return important_words, not_important_words

In [8]:
# Group the dataframe by the 'number' column
grouped = vocab_df.groupby('number')
max_nr = vocab_df['number'].max()

# Check if a progress file exists and load it
progress_file = 'progress.csv'
if os.path.exists(progress_file):
    vocab_df = pd.read_csv(progress_file)
else:
    vocab_df['important'] = nan

# Iterate through each group
for number, group in grouped:
    print(f"=== Number {number}/{max_nr}")
    word_dict = {}
    for index, row in group.iterrows():
        word_dict[index] = row['kanji'] + ' (' + row['reading'] + ', ' + str(row['usefulness'].count('★')) + ')'
    
    messages = [{
            'role': 'user',
            'content': f'''I am going to provide you with a list of japanese words and their readings and an importance score. All words contain a certain kanji.
I am going to create flashcards for the kanji and words. Your task is to sort the provided words.
Some of the words are going to be learned directly after the kanji, and others are going to be learned later.
The words that are learned directly after the kanji should only be words that are useful and words that represent important readings.
If a word is not useful or does not represent an important reading, it should be learned later.

All words may be classified as important. But for less important kanji it is acceptable to have less important words. Although at least one word should always be important to learn the most important reading of the word.

The words are: {', '.join(word_dict.values())}
The number represents an imporance score from 0 to 5. You may use this score or rely on your own judgement.
Respond strictly with this format:
Important: word1 word2 word3
Not-important: word4 word5

Where you return the kanji of the word. So if 一番 (いちばん, 5) is given to you and you judge it to be important you return 一番 in the important list. 
Make sure to respond with the word exactly like it was given to you and make sure to only respond with the two lists. This means including stars like * or ＊.
So if 一*つ (ひとつ, 4) is given to you and you judge it to be important you return 一*つ in the important list.'''
    }]
    
    response = ollama.chat(model=ollama_model, messages=messages)
    answer = response['message']['content']
    
    answer_cleaned = re.sub(r'<think>.*?</think>', '', answer, flags=re.DOTALL).strip()
    print(', '.join(word_dict.values()))
    
    print(answer_cleaned)
    
    try:
        important, not_important = process_answer(answer)
        print("Processed answer successfully:")
        print("Important words:", important)
        print("Not-important words:", not_important)
        
        for index, row in group.iterrows():
            if row['kanji'] in important:
                vocab_df.at[index, 'important'] = True
            elif row['kanji'] in not_important:
                vocab_df.at[index, 'important'] = False
            else:
                print(f"Error Word '{row['kanji']}' was not found in either list.")
        
        # Save progress to a file after processing each group
        vocab_df.to_csv(progress_file, index=False)
        
    except ValueError as e:
        print("Error processing answer:", e)
        print(answer)


=== Number 1/1759
もう一度 (もういちど, 5), 一番 (いちばん, 4), 一日 (ついたち, 4), 一*つ (ひとつ, 4), 一人 (ひとり, 5), 一緒 (いっしょ, 5), 一般的 (いっぱんてき, 5)
Important: もう一度 一番 一日 一人 一緒 一般的  
Not-important: 一＊つ
Processed answer successfully:
Important words: ['もう一度', '一番', '一日', '一人', '一緒', '一般的']
Not-important words: ['一＊つ']
Error Word '一*つ' was not found in either list.
=== Number 2/1759
二重 (にじゅう, 1), 二日 (ふつか, 4), 二*つ (ふたつ, 4), 二月 (にがつ, 5), 二人 (ふたり, 5), 二日酔い (ふつかよい, 4)
Important: 二重 二日 二人  
Not-important: 二*つ 二月 二日酔い
Processed answer successfully:
Important words: ['二重', '二日', '二人']
Not-important words: ['二*つ', '二月', '二日酔い']
=== Number 3/1759
三角 (さんかく, 2), 三*つ (みつ, 4), 三人 (さんにん, 5), 三月 (さんがつ, 5)
Important: 三*つ 三人  
Not-important: 三角 三月
Processed answer successfully:
Important words: ['三*つ', '三人']
Not-important words: ['三角', '三月']
=== Number 4/1759
完了 (かんりょう, 3), 了解 (りょうかい, 3), 終了 (しゅうりょう, 0)
Important: 完了 了解  
Not-important: 終了
Processed answer successfully:
Important words: ['完了', '了解']
Not-important words: ['終了']
=== N

In [9]:
vocab_df.to_csv('sorted-vocab.csv', index=False)