In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before
# each cell executes, so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
from typing import Generator, Any
import pandas as pd
import json
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os
from rich import print as rprint
# from mbay_dict.core import domain as d
# from mbay_dict.core.models import new_object_id
from datasets import load_dataset, Dataset

# Load variables from a local .env file (if one is found) into the process
# environment; MONGODB_URI is read from os.environ further down.
load_dotenv()

False

In [2]:
# from huggingface_hub import notebook_login

# notebook_login()

In [3]:
# The connection string is taken from the environment so that no credentials
# are stored in the notebook itself.
uri = os.environ["MONGODB_URI"]

# Build the client against version "1" of the MongoDB server API.
client = MongoClient(uri, server_api=ServerApi("1"))

# Round-trip a ping to check that the deployment is reachable. A failure is
# printed instead of raised.
try:
    client.admin.command("ping")
except Exception as ping_error:
    print(ping_error)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


In [6]:
# Read every document of the "entries-prod" collection in the "dictionary"
# database and construct a `d.Entry` from each one.
# NOTE(review): `d` is expected to be `mbay_dict.core.domain`, but that import
# is commented out in the imports cell — confirm it is in scope before running.
entries_collection = client.get_database("dictionary").get_collection("entries-prod")
entries = [d.Entry(**document) for document in entries_collection.find()]

In [7]:
entries[0]

Entry(id=ObjectId('64fc50f7286b18ef7de0db22'), created_at=datetime.datetime(2023, 9, 9, 11, 3, 19, tzinfo=TzInfo(UTC)), updated_at=datetime.datetime(2023, 9, 9, 11, 3, 19, tzinfo=TzInfo(UTC)), headword='màn̄-kò̰o̰', part_of_speech='NI', sound_filename='Tape13MbayBB232.mp3', french=Translation(translation='inondation', key='i'), english=Translation(translation='flooding', key='f'), related_word=None, grammatical_note=None, examples=[Example(id=ObjectId('64fc50f7286b18ef7de0db23'), created_at=datetime.datetime(2023, 9, 9, 11, 3, 19, tzinfo=TzInfo(UTC)), updated_at=datetime.datetime(2023, 9, 9, 11, 3, 19, tzinfo=TzInfo(UTC)), parent_id=ParentId(id=ObjectId('64fc50f7286b18ef7de0db22'), type='entry'), mbay='Dà-nā̰a̰ màn̄-kò̰o̰ tɨ́', english=Translation(translation='during the season of flooding', key='d'), french=Translation(translation='pendant la saison des inondations', key='p'), sound_filename='Tape13MbayBB234.mp3')], expressions=[])

In [11]:
from typing import Literal, TypedDict


class Record(TypedDict):
    """One flattened translation row: an Mbay text with its French and English translations."""
    # Origin of the row: the entry headword, one of its examples, or one of its expressions.
    type: Literal["entry", "example", "expression"]
    # Mbay text (the headword for "entry" rows).
    mbay: str
    # French translation of the Mbay text.
    french: str
    # English translation of the Mbay text.
    english: str

In [14]:
entries[0]


def entry_to_records(entry: d.Entry) -> Generator[Record, Any, None]:
    """Flatten one dictionary entry into translation records.

    Yields the headword record first, then one record per example, then one
    record per expression. Every record carries the Mbay text together with
    its French and English translations.
    """
    yield Record(
        type="entry",
        mbay=entry.headword,
        french=entry.french.translation,
        english=entry.english.translation,
    )

    yield from (
        Record(
            type="example",
            mbay=sample.mbay,
            french=sample.french.translation,
            english=sample.english.translation,
        )
        for sample in entry.examples
    )

    yield from (
        Record(
            type="expression",
            mbay=phrase.mbay,
            french=phrase.french.translation,
            english=phrase.english.translation,
        )
        for phrase in entry.expressions
    )


list(entry_to_records(entries[0]))

[{'type': 'entry',
  'mbay': 'màn̄-kò̰o̰',
  'french': 'inondation',
  'english': 'flooding'},
 {'type': 'example',
  'mbay': 'Dà-nā̰a̰ màn̄-kò̰o̰ tɨ́',
  'french': 'pendant la saison des inondations',
  'english': 'during the season of flooding'}]

In [15]:
# Flatten all entries into a single list of records, keeping entry order.
records: list[Record] = []
for entry in entries:
    records += entry_to_records(entry)

# Peek at the first few flattened records.
records[:5]

[{'type': 'entry',
  'mbay': 'màn̄-kò̰o̰',
  'french': 'inondation',
  'english': 'flooding'},
 {'type': 'example',
  'mbay': 'Dà-nā̰a̰ màn̄-kò̰o̰ tɨ́',
  'french': 'pendant la saison des inondations',
  'english': 'during the season of flooding'},
 {'type': 'entry',
  'mbay': 'màkɨ̀m',
  'french': "grands [d'activité]",
  'english': 'great ones [of activity]'},
 {'type': 'example',
  'mbay': 'ɓōo tɨ́ dá màkɨ̀m yá̰a̰-kɨ́-sà-gɨ̄ à òy-n̄ ngá̰y.',
  'french': 'Pendant les périodes de famine, les grands mangeurs meurent comme des mouches.',
  'english': 'During periods of famine, the great eaters die like flies.'},
 {'type': 'expression',
  'mbay': 'màkɨ̀m dèē',
  'french': 'un grand mangeur',
  'english': 'a great eater'}]

In [18]:
len(records) / len(entries)

2.0945046586803575

In [19]:
# Tabulate the flattened records and write them to "records.csv" without the
# index column; the file is read back as a dataset in a later cell.
df = pd.DataFrame(records)
df.to_csv("records.csv", index=False)

In [21]:
# Load the "records.csv" file as a `datasets.Dataset`.
dst = Dataset.from_csv("records.csv")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [24]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; its tokenizer encodes the translation pairs below.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [35]:
tokenizer(["test"], text_target=["wow"])

{'input_ids': [[794, 1]], 'attention_mask': [[1, 1]], 'labels': [[2275, 210, 1]]}

In [56]:
from typing import Iterable

# Names of the language columns a record batch is indexed by.
Lang = Literal["mbay", "french", "english"]

# Example prefix and direction. `prepare_pair` takes its own values as
# arguments and does not read these module-level names.
prefix = "Translate English to Mbay: "
source_lang: Lang = "english"
target_lang: Lang = "mbay"


def prepare_pair(examples, prefix: str, source_lang: Lang, target_lang: Lang):
    """Build the prompt and target lists for one translation direction.

    Args:
        examples: mapping from language column name to a sequence of texts.
        prefix: task prefix prepended to every source text.
        source_lang: column holding the texts to translate from.
        target_lang: column holding the texts to translate to.

    Returns:
        A ``(inputs, targets)`` tuple of new lists: the prefixed source texts
        and a copy of the target texts.
    """
    prompts = [prefix + source_text for source_text in examples[source_lang]]
    references = list(examples[target_lang])
    return prompts, references


def preprocess_records(examples):
    """Tokenize a batch of records in all four translation directions.

    For every record in the batch, prompt/target pairs are produced for
    English->Mbay, Mbay->English, French->Mbay and Mbay->French (in that
    order, one whole direction after another), so the result has four times
    as many rows as the input batch.

    Returns:
        The tokenizer output for the combined prompts, with the targets
        passed as ``text_target``, truncated to at most 128 tokens.
    """
    directions = (
        ("Translate English to Mbay: ", "english", "mbay"),
        ("Translate Mbay to English: ", "mbay", "english"),
        ("Translate French to Mbay: ", "french", "mbay"),
        ("Translate Mbay to French: ", "mbay", "french"),
    )

    inputs: list[str] = []
    targets: list[str] = []
    for task_prefix, src, tgt in directions:
        pair_inputs, pair_targets = prepare_pair(examples, task_prefix, src, tgt)
        inputs += pair_inputs
        targets += pair_targets

    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

In [57]:
# Batched map: preprocess_records emits four prompt/target pairs per input
# record, so the original columns are removed and only the tokenizer output
# is kept in the resulting dataset.
tokenized_translations = dst.map(
    preprocess_records, batched=True, remove_columns=dst.column_names
)

Map:   0%|          | 0/11015 [00:00<?, ? examples/s]

In [58]:
tokenized_translations

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 44060
})

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used to assemble the tokenized pairs into seq2seq batches.
# NOTE(review): `model` receives the checkpoint name string here, not a loaded
# model object — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
import evaluate

# SacreBLEU metric; its "score" field is reported as "bleu" by compute_metrics.
metric = evaluate.load("sacrebleu")

In [None]:
import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Each label is also wrapped in a one-element list, giving the
    list-of-references shape that is handed to ``metric.compute``.

    Returns:
        ``(preds, labels)`` as two new lists.
    """
    stripped_preds = [text.strip() for text in preds]
    wrapped_labels = [[text.strip()] for text in labels]
    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, its first element is used.

    Returns:
        A dict with ``"bleu"`` (the SacreBLEU score) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding sentinel, not a vocabulary id. It can appear in the
    # predictions as well as in the labels, so replace it in both before
    # decoding; otherwise decoding fails and -100 is counted as a generated
    # token in gen_len.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of tokens in each prediction that are not padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result