In [4]:
import pandas as pd
from pathlib import Path
import requests
from IPython.display import display, Markdown
from config import column_names

# load markdown from github repository https://github.com/moritzvitt/moritzProjekt/blob/markdown/system_japanese.md
# class BaseAnkiDeckGenerator:
#     def __init__(self, df: pd.DataFrame):
#         self.ai_input_df = self.df[['short_phrase', 'short_translation', 'word']]


target_language = "ger"
    # @log_io

def load_markdown():
    url1 = "https://raw.githubusercontent.com/moritzvitt/moritzProjekt/markdown/prompts/_general_prompt.md"

    examples_url = f"https://raw.githubusercontent.com/moritzvitt/moritzProjekt/markdown/prompts/{target_language}_examples.md"
    # furigana_url = f"https://raw.githubusercontent.com/moritzvitt/moritzProjekt/markdown/prompts/{target_language}_add_furigana.md"
    url2 = examples_url
    # url3 = furigana_url
    # url2 = "https://raw.githubusercontent.com/moritzvitt/moritzProjekt/markdown/add_furigana.md"
    
    response1 = requests.get(url1)
    markdown1 = response1.text

    response2 = requests.get(url2)
    markdown2 = response2.text

    # response3 = requests.get(url3)
    # markdown3 = response3.text

    merged_markdown = markdown1 + "\n" + markdown2 + "\n" 
    # + markdown3
    return merged_markdown

# load csv from test_dataframes 

# load dataframe from csv
df = pd.read_csv('../test_dataframes/japanese_items/items.csv', delimiter='\t', encoding='utf-8')
# print(df.head())
# print(df.shape)

# load column_names from config.py
df.columns = column_names

df = df[[
        "Word", 
        "Context",
        "Context machine translation",
        "Context human translation",
        ]]


from datetime import datetime

current_time = datetime.now()
formatted_time = current_time.strftime("%Y-%m-%d_%H-%M")
print(formatted_time)

# run 
def run():
    markdown = load_markdown()
    display(Markdown(markdown))
    # also safe the markdown to a new file
    # create final.md name with random number at the end
    # add the date in an f string



    with open(f"final{target_language}{formatted_time}.md", "w") as file:
        file.write(markdown+'\n'+'\nThis is the table with the word sentence pairs: \n\n'+df.to_csv(sep='\t', encoding='utf-8', index=False))


if __name__ == "__main__":
    run()

2024-06-19_14-08


# TODO write a part the processes the audio images and cloze using regex. 

# IDENTITY and PURPOSE

You are a professional language teacher.. Your task is to provide concise, relevant information for sentence-word Anki flashcards, ensuring the student can effectively study vocabulary.

# TOOLS

You rely mainly on your capability as an LLM to predict the next string of characters. You don't need to analyze the table or anything.

# INPUT:

You will be given a csv containing sentence-word pairs from the Google extension 'LanguageReactor', containing the following columns:

- 'Word'
- 'Context'
- 'Context machine translation'
- ('Context human translation')

The 'Context' column contains a sentence in Spanish, the target language. The 'Word' column contains one word that appears in the 'Context' sentence.

! The delimiter in the csv is probably tab!

# Steps

1. ### Clean the data and check for parsing errors

Have a look at the table I provided you with. Don't use code for that, just rely on your prediction of characters as LLM.

- Remove unnecessary characters and correct weird formatting from 'Word' and 'Context'. However, pay attention that 'Word' always appears in 'Context'.
- Check each row to ensure the 'Context' sentence is correctly parsed. The 'Word' should include the entire vocabulary word, not just a fragment. Sometimes parsers may miss the whole verb or expression. Also, check the 'Context machine translation' to see if the 'Word' makes sense in its 'Context'. If there is a parsing error and 'Word' is incomplete, adjust 'Word' to match the vocabulary in 'Context'. Ensure 'Word' is formatted exactly as it appears in 'Context' (including capitalization, grammar, punctuation, and spelling errors if present).

2. ### Generate flashcard information

   To assist the student, generate a table containing following information for each row:


   1. Two or more synonyms for 'Word' based on its 'Context'.
   2. Two or more translations for 'Word' based on its 'Context'.
   3. A simple 'Example sentence' using 'Word'.
   4. The translation of the 'Example sentence'.
   5. A brief explanation of 'Word' in its 'Context'.
   6. A short explanation of the grammar

   When generating this information, stick to the following principles:

   - Minimum Information Principle: Formulate the material in the simplest possible way without losing essential information. That means you can safely omit conjunctions like 'or', 'and' and you don't need to say: 現実 means 'reality' or 'actuality'. Instead just say: 現実: reality, actuality.
   - Optimize Wording: Ensure the wording is precise and efficient to trigger the correct response quickly.
  
  The table should contain 7 columns with following column names:
   - 'Word'
   - 'Context'
   - 'Synonyms'
   - 'Translations'
   - 'Example'
   - 'Example translation'
   - 'Explanation'


# Output

# TODO output should be csv so that gpt can

Output the generated information as a Markdown table, including the column names as headers.  
- Do not include warnings or notes in the output—only the requested sections.
- Do not include additional information like 'here is the markdown table' or anything else. The only thing I want is the markdown table.

404: Not Found


In [3]:
import pandas as pd
from pathlib import Path
import requests
from IPython.display import display, Markdown
from config import column_names

# Base class for common functionalities
class BaseAnkiDeckGenerator:
    def __init__(self, csv_path: str, delimiter: str = '\t', encoding: str = 'utf-8'):
        self.df = pd.read_csv(csv_path, delimiter=delimiter, encoding=encoding)
        self.df.columns = column_names
        self.df = self.df[[
            "Word", 
            "Context",
            "Context machine translation",
            "Context human translation",
        ]]
        
    def display_markdown(self, markdown: str):
        display(Markdown(markdown))
        
    def save_markdown(self, markdown: str, file_name: str = "final.md"):
        with open(file_name, "w") as file:
            file.write(markdown + '\n\nThis is the table with the word sentence pairs:\n\n' + self.df.to_csv(sep='\t', encoding='utf-8', index=False))


# Japanese specific AnkiDeckGe nerator
class JapaneseAnkiDeckGenerator(BaseAnkiDeckGenerator):
    def __init__(self, csv_path: str):
        super().__init__(csv_path)
        self.markdown_urls = [
            "https://raw.githubusercontent.com/moritzvitt/moritzProjekt/markdown/prompts/_general_prompt.md",
            "https://raw.githubusercontent.com/moritzvitt/moritzProjekt/markdown/prompts/jn_examples.md"
        ]
        
    def load_markdown(self):
        markdowns = [requests.get(url).text for url in self.markdown_urls]
        return "\n".join(markdowns)

# Define other language-specific classes similarly if needed
# class FrenchAnkiDeckGenerator(BaseAnkiDeckGenerator):
#     ...

# Run the process
def run():
    # Path to the Japanese CSV file
    csv_path = '../test_dataframes/japanese_items/items.csv'
    japanese_generator = JapaneseAnkiDeckGenerator(csv_path)
    
    markdown = japanese_generator.load_markdown()
    japanese_generator.display_markdown(markdown)
    japanese_generator.save_markdown(markdown)

if __name__ == "__main__":
    run()


    WORD|胡椒|ja  Word        最後に塩と胡椒で味を整えてください  \
0   WORD|程度|ja  Word              最小限つなぎ程度にして   
1  WORD|最小限|ja  Word              最小限つなぎ程度にして   
2   WORD|刻む|ja  Word         まずゆで卵を適度に刻んでください   
3   WORD|適度|ja  Word         まずゆで卵を適度に刻んでください   
4  WORD|さすが|ja  Word  さすがにまだ卵サンドは食べられないみたいだけど   

                  Finally, add some salt and pepper.   胡椒 胡椒.1  Noun  \
0  We're going to make sure that we have the leas...   程度   程度  Noun   
1  We're going to make sure that we have the leas...  最小限  最小限  Noun   
2  First of all, you need to mark the eggs in mod...   刻ん   刻む  Verb   
3  First of all, you need to mark the eggs in mod...   適度   適度  Noun   
4           I can't seem to eat an egg sandwich yet.  さすが  さすが   Adv   

   Unnamed: 7                         pepper, black pepper  Netflix  ...  216  \
0         NaN                       degree, extent, amount  Netflix  ...  215   
1         NaN  minimum, minimum limit, minimum requirement  Netflix  ...  215   
2         NaN                

In [None]:



@log_io
def generate_anki_deck(df: pd.DataFrame) -> genanki.Package:
    """Generates an Anki deck from a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing card data.

    Returns:
        genanki.Package: The generated Anki package.
    """
    
    with open('templates/anki_card.html', 'r', encoding='utf-8') as content_file:
        content = content_file.read()

    # Splitting HTML content
    html_sections = content.split('<!-- html -->')

    # Assigning sections to qfmt, afmt, and css
    qfmt_html = html_sections[1]
    afmt_html = html_sections[2]

    with open('static/css/anki_card.css', 'r', encoding='utf-8') as content_file:
        css_code = content_file.read()

    # Ensure all columns are strings
    df = df.astype(str)

    # Define the Anki model
    model_id = 1607392319
    model = genanki.Model(
        model_id,
        'Language Learning with Netflix Model',
        fields = fields_config["fields"],
        templates=[
            {
                'name': 'Card 1',
                'qfmt': qfmt_html,
                'afmt': afmt_html,
            },
        ],
        css=css_code
    )

    # Create an Anki deck
    deck_id = model_id + 1  # Ensure deck_id is different from model_id
    deck = genanki.Deck(deck_id, 'lln_anki_deck')

    # Add cards to the deck
    for index, row in df.iterrows():
        my_note = genanki.Note(
            model=model,
            fields=[row['ID'], row['cloze'], row['hint'], row['definition'], row['notes'], row['image'], row['audio']],
        )
        deck.add_note(my_note)

    apkg_package = genanki.Package(deck)
    return apkg_package

@log_io
def export_df(df: pd.DataFrame, package: genanki.Package, native_language: str, output_file_path: str, encoding: str = 'utf-8') -> Tuple[str, str]:
    """Exports an Anki package and a cleaned DataFrame to CSV.

    Args:
        df (pd.DataFrame): The DataFrame to export.
        package (genanki.Package): The Anki package to save.
        native_language (str): The native language of the data.
        output_file_path (str): The path to save the files.
        encoding (str, optional): The encoding for the CSV file. Defaults to 'utf-8'.

    Returns:
        Tuple[str, str]: A tuple containing the paths to the exported Anki package and CSV file.
    """
    current_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
    package_path = os.path.join(output_file_path, f'{native_language}_LLN_{current_time}.apkg')
    package.write_to_file(package_path)

    csv_file_path = os.path.join(output_file_path, f'{native_language}_LLN_{current_time}.csv')
    df.to_csv(csv_file_path, index=False, sep='\t', encoding=encoding)

    return package_path, csv_file_path
