In [4]:
%reload_ext autoreload


In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import numpy as np
from typing import Generator, Any
import pandas as pd
import json
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os
from rich import print as rprint
from mbay_nmt.utils import domain as d
from mbay_nmt.utils.models import new_object_id
from datasets import load_dataset, Dataset, DatasetDict
from rich import print as rprint

load_dotenv()

True

In [7]:
# from huggingface_hub import notebook_login

# notebook_login()

## Hugging Face dataset creation


In [8]:
from typing import Literal, TypedDict


# Allowed values for ``Record.type``.
RecordKind = Literal["entry", "example", "expression"]


class Record(TypedDict):
    """One row of the flattened translation table.

    The keys match the dataset features shown in this notebook
    (``id``, ``entry_id``, ``type``, ``mbay``, ``french``, ``english``);
    every value is a string.
    """

    id: str
    entry_id: str
    type: RecordKind
    mbay: str
    french: str
    english: str

In [25]:
# Every dataset artifact lives under one directory, relative to this notebook.
DATASETS_ROOT = "../../datasets"

# Gzip-compressed CSV holding the flattened translation records.
CSV_DATASET_PATH = f"{DATASETS_ROOT}/mbay-translations-flattened.csv.gzip"
# Directory the train/test/validation DatasetDict is loaded from.
SPLIT_DATASET_PATH = f"{DATASETS_ROOT}/mbay-translations/"
# Directory the tokenized DatasetDict is saved to.
TOKENIZED_DATASET_PATH = f"{DATASETS_ROOT}/mbay-translations-tokenized/byt5/"

In [12]:
# Read every column as text and switch off pandas' default NA parsing.
# With the defaults, cells such as "NA", "null", "None" or "nan" -- all
# plausible dictionary/translation text -- and empty ``entry_id`` cells are
# silently turned into missing values instead of staying strings.
df = pd.read_csv(
    CSV_DATASET_PATH,
    compression="gzip",
    dtype=str,
    keep_default_na=False,
)

In [6]:
# Convert the pandas frame into a Hugging Face ``Dataset``.
dst = Dataset.from_pandas(df)
# dst = dst.remove_columns(["__index_level_0__"])
# Bare last expression: display the dataset summary (features and row count).
dst

Dataset({
    features: ['id', 'entry_id', 'type', 'mbay', 'french', 'english'],
    num_rows: 10726
})

In [19]:
from transformers import AutoTokenizer
from mbay_nmt.fine_tune_mt5.utils import preprocess_records, format_prompt_output_pairs

# Tokenizer for the "google/mt5-large" checkpoint, used by the check cells below.
# NOTE(review): ``t5_tokenizer`` is rebound to "google/byt5-large" in a later
# cell, so which tokenizer is active depends on which cell ran last.
t5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-large")



### Final check


In [None]:
dst[10]

{'id': '64eca312f6197fd20d76098e',
 'entry_id': '64eca312f6197fd20d762d04',
 'type': 'example',
 'mbay': 'ī-ɗāa àngérì nà̰ wétɨ́ ī-sō hólēe tɨ́ nò.',
 'french': 'Faites attention de ne pas tomber dans le trou.',
 'english': 'Be careful lest you fall into the hole.'}

In [8]:
format_prompt_output_pairs(dst[10:11])

NameError: name 'format_prompt_output_pairs' is not defined

In [11]:
# Turn each batch of records into prompt/target text pairs and drop the
# original record columns so only the helper's output columns remain.
record_columns = dst.column_names
dst_pairs = dst.map(
    function=format_prompt_output_pairs,
    batched=True,
    remove_columns=record_columns,
)

Map:   0%|          | 0/10726 [00:00<?, ? examples/s]

In [13]:
import random

In [14]:
random.choices(range(len(dst_pairs)), k=10)

[37644, 8011, 18099, 31062, 41935, 11592, 37806, 20706, 23798, 12510]

In [15]:
# Draw distinct row indices from a seeded generator so the spot check shows
# the same rows on every run and never repeats a row. (``random.choices``
# samples with replacement and was unseeded.) The same ``sample_ids`` are
# reused further down to compare against the detokenized inputs.
sample_rng = random.Random(0)
sample_ids = sample_rng.sample(range(len(dst_pairs)), k=min(10, len(dst_pairs)))
dst_pairs[sample_ids]

{'inputs': ['Translate English to Mbay: soon, in a little while',
  'Translate English to Mbay: thoroughly (shrink, thin).',
  "Translate English to Mbay: go announce [s.o.'s] death",
  'Translate English to Mbay: maternal cousin',
  'Translate Mbay to French: kàdɨ̄',
  'Translate English to Mbay: A tradition is a custom of the people.',
  'Translate Mbay to French: hōŕ gìndɨ̄',
  'Translate Mbay to French: ngè-ndò̰o̰ à èl-m̄ tà kɨ́-dà mò̰y tɨ́ lò-ḿ.',
  'Translate French to Mbay: Ils ont mis des entraves aux chevilles du cheval.',
  'Translate Mbay to French: Dèē-kɨ́-dḛ̀ḛ́ kɨ́ ngàw kòy ndūr pútɨ́-pútɨ́.'],
 'targets': ['sḭ́ḭ-kɨ́-nòó-tɨ́',
  'mbḭ́yá̰',
  'àw̄ kɨ̀là-yòo lò [dèē]',
  'ngōn-nān',
  'donner',
  'Yá̰a̰ kōo-wēe ì yá̰a̰ gír lò dèē.',
  'morceau de tuyau en métal',
  'La voyante va me parler de ma maladie (expliquer sa cause).',
  'Lā-n̄ dálā kɔ́ɔ́-njà síndá tɨ́.',
  'La femme dont le mari est décédé se roula énergiquement dans le chagrin.']}

In [21]:
from functools import partial

# Pre-bind the tokenizer as the first positional argument so ``map`` only has
# to supply the batch; the raw record columns are dropped from the result.
tokenize_batch = partial(preprocess_records, t5_tokenizer)
tokenized_dst_check = dst.map(
    function=tokenize_batch,
    batched=True,
    remove_columns=dst.column_names,
)

Map:   0%|          | 0/10726 [00:00<?, ? examples/s]

In [27]:
tokenized_dst_check

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 42904
})

In [66]:
def detokenize_batch(tokenizer: "AutoTokenizer", batch: dict[str, list[list[int]]]):
    """Decode a batch of token ids back into text.

    Any ``-100`` entry (conventionally the "ignore" marker used for padded
    label positions) is replaced by the tokenizer's pad id before decoding.
    The replacement is done per token, so it works for plain Python lists and
    for sequences of different lengths. The previous ``np.where`` version
    compared the whole list to ``-100``, which yields a single ``True`` and
    therefore never masked anything.

    Args:
        tokenizer: Object exposing ``pad_token_id`` and
            ``batch_decode(ids, skip_special_tokens=...)``.
        batch: Mapping with an ``"input_ids"`` entry holding one sequence of
            token ids per example.

    Returns:
        A dict ``{"inputs": [...]}`` with one decoded string per sequence.
        Labels are not decoded.
    """
    pad_id = tokenizer.pad_token_id
    cleaned_ids = [
        [pad_id if int(token) == -100 else int(token) for token in sequence]
        for sequence in batch["input_ids"]
    ]
    return {
        "inputs": tokenizer.batch_decode(cleaned_ids, skip_special_tokens=True),
    }

In [67]:
# Pre-bind the tokenizer so ``map`` calls the helper with each batch only.
decode_batch = partial(detokenize_batch, t5_tokenizer)
# No columns are removed here: the returned "inputs" column is added to the
# existing tokenized columns.
detokenized_dst_check = tokenized_dst_check.map(function=decode_batch, batched=True)

Map:   0%|          | 0/42904 [00:00<?, ? examples/s]

In [69]:
detokenized_dst_check[sample_ids]["inputs"]

['Translate English to Mbay: soon, in a little while',
 'Translate English to Mbay: thoroughly (shrink, thin).',
 "Translate English to Mbay: go announce [s.o.'s] death",
 'Translate English to Mbay: maternal cousin',
 'Translate Mbay to French: kàdɨ̄',
 'Translate English to Mbay: A tradition is a custom of the people.',
 'Translate Mbay to French: hōŕ gìndɨ̄',
 'Translate Mbay to French: ngè-ndò̰o̰ à èl-m̄ tà kɨ́-dà mò̰y tɨ́ lò-ḿ.',
 'Translate French to Mbay: Ils ont mis des entraves aux chevilles du cheval.',
 'Translate Mbay to French: Dèē-kɨ́-dḛ̀ḛ́ kɨ́ ngàw kòy ndūr pútɨ́-pútɨ́.']

### Final dataset


In [43]:
# Fixed seed so the split assigns the same rows to train/test/validation on
# every run; without it each re-run reshuffles and the held-out sets change.
SPLIT_SEED = 42

# First hold out 20% of the rows, then cut that hold-out in half to obtain
# equally sized test and validation sets (80/10/10 overall).
train_test = dst.train_test_split(test_size=0.2, seed=SPLIT_SEED)
test_valid = train_test["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

train_test_valid_dst = DatasetDict(
    {
        "train": train_test["train"],
        "test": test_valid["test"],
        "validation": test_valid["train"],
    }
)
train_test_valid_dst

DatasetDict({
    train: Dataset({
        features: ['id', 'entry_id', 'type', 'mbay', 'french', 'english'],
        num_rows: 8580
    })
    test: Dataset({
        features: ['id', 'entry_id', 'type', 'mbay', 'french', 'english'],
        num_rows: 1073
    })
    validation: Dataset({
        features: ['id', 'entry_id', 'type', 'mbay', 'french', 'english'],
        num_rows: 1073
    })
})

In [44]:
train_test_valid_dst["test"][2]

{'id': '64fc50f7286b18ef7de0bfb5',
 'entry_id': '',
 'type': 'entry',
 'mbay': 'ɓà-kùm-dàm-á',
 'french': 'personne aux grands yeux',
 'english': 'person with big eyes'}

In [21]:
# Load the train/test/validation split stored at SPLIT_DATASET_PATH. This
# rebinds ``train_test_valid_dst``, replacing the in-memory split built above.
# NOTE(review): no visible cell writes SPLIT_DATASET_PATH -- confirm where the
# split on disk is produced.
train_test_valid_dst = DatasetDict.load_from_disk(SPLIT_DATASET_PATH)

In [22]:
train_test_valid_dst

DatasetDict({
    train: Dataset({
        features: ['id', 'entry_id', 'type', 'mbay', 'french', 'english'],
        num_rows: 8580
    })
    test: Dataset({
        features: ['id', 'entry_id', 'type', 'mbay', 'french', 'english'],
        num_rows: 1073
    })
    validation: Dataset({
        features: ['id', 'entry_id', 'type', 'mbay', 'french', 'english'],
        num_rows: 1073
    })
})

In [15]:
from transformers import AutoTokenizer

# Rebinds ``t5_tokenizer`` (earlier bound to the "google/mt5-large" tokenizer)
# to the ByT5 tokenizer; cells executed after this one tokenize with ByT5.
t5_tokenizer = AutoTokenizer.from_pretrained("google/byt5-large")

Downloading tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

In [16]:
train_test_valid_dst.column_names["train"]

['input_ids', 'attention_mask', 'labels']

In [23]:
from functools import partial
from mbay_nmt.fine_tune_mt5.utils import preprocess_records

# Tokenize every split with the currently bound tokenizer. The columns listed
# for the "train" split are removed from each split's output.
split_columns = train_test_valid_dst["train"].column_names
final_dst = train_test_valid_dst.map(
    function=partial(preprocess_records, t5_tokenizer),
    batched=True,
    remove_columns=split_columns,
)

Map:   0%|          | 0/8580 [00:00<?, ? examples/s]

Map:   0%|          | 0/1073 [00:00<?, ? examples/s]

Map:   0%|          | 0/1073 [00:00<?, ? examples/s]

In [26]:
# Persist the tokenized splits to TOKENIZED_DATASET_PATH.
final_dst.save_to_disk(TOKENIZED_DATASET_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/34320 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4292 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4292 [00:00<?, ? examples/s]