In [1]:
pip install -q trl evaluate sacrebleu

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import T5ForConditionalGeneration,AutoTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import yaml
from datasets import load_from_disk
from transformers import TrainingArguments
from trl import SFTTrainer
import wandb
import numpy as np
from datasets import Dataset
from datasets import concatenate_datasets
import pandas as pd
import seaborn as sns
from datasets import load_dataset, concatenate_datasets
from transformers import DataCollatorForSeq2Seq
import evaluate

def load_model_and_tokenizer(model_name,tokenizer_name,device_map:str='auto'):

    model = T5ForConditionalGeneration.from_pretrained(
        model_name,
        )
    model.config.use_cache = False

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,trust_remote_code=True)

    tokenizer.padding_side='right'

    return model, tokenizer

2024-08-13 11:39:57.409515: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 11:39:57.409600: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 11:39:57.411098: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
model_name = 't5-base'
model, tokenizer = load_model_and_tokenizer(model_name,model_name)

In [4]:
train = pd.read_csv("/kaggle/input/melio-dataset/final_train_df.csv")
test = pd.read_csv("/kaggle/input/melio-dataset/final_val_df.csv")
train.head()

Unnamed: 0,dyu,fr
0,A bi ji min na,Il boit de l’eau.
1,A le dalakolontɛ lon bɛ.,Il se plaint toujours.
2,Mun? Fɛn dɔ.,Quoi ? Quelque chose.
3,O bɛ bi bɔra fo Gubeta.,Tous sortent excepté Gubetta.
4,A ale lo bi da bugɔ la!,Ah ! c’est lui… il sonne…


In [5]:
tokenizer.decode(tokenizer.encode('Å'))

'<unk></s>'

In [6]:
unknown_chars = [
    'Å', 'ɔ', 'Ê', 'ŋ', 'α', 'Á', 'Ô', 'ā', '̀', 'ú', '̂', 'Â', 'í', 'ò',
    '̧', 'Š', 'œ', 'Ō', 'ɲ', 'ë', 'ł', 'Ɛ', 'ñ', 'ū', 'ň', '́', 'ễ', 'Ɔ',
    'ʻ', 'Ç', 'ō', 'ï', 'Ɲ', 'ɛ', 'Č', 'À', 'ã'
]

mapper = {char: f'<extra_id_{i}>' for i, char in enumerate(unknown_chars)}
unmapper = {v: k for k, v in mapper.items()}

In [7]:
def unmap_unknown_chars(text, unmapper):
    """Unmap characters in a string based on the reverse mapper dictionary."""
    for extra_id, char in unmapper.items():
        text = text.replace(extra_id, char)
    return text

def map_unknown_chars(text, mapper):
    """Map unknown characters in the text to their corresponding extra_id tokens."""
    mapped_text = []
    for char in text:
        if char in mapper:
            mapped_text.append(mapper[char])
        else:
            mapped_text.append(char)
    return ''.join(mapped_text)

In [8]:
train['dyu'] = train['dyu'].apply(lambda x: map_unknown_chars(x, mapper))
train['fr'] = train['fr'].apply(lambda x: map_unknown_chars(x, mapper))

In [9]:
train.head()

Unnamed: 0,dyu,fr
0,A bi ji min na,Il boit de l’eau.
1,A le dalakolont<extra_id_33> lon b<extra_id_33>.,Il se plaint toujours.
2,Mun? F<extra_id_33>n d<extra_id_1>.,Quoi ? Quelque chose.
3,O b<extra_id_33> bi b<extra_id_1>ra fo Gubeta.,Tous sortent excepté Gubetta.
4,A ale lo bi da bug<extra_id_1> la!,Ah ! c’est lui… il sonne…


In [10]:
# load the dataset, we'll use the opus_books dataset
dataset = Dataset.from_pandas(train)
dataset = dataset.train_test_split(test_size=0.2)
dataset


DatasetDict({
    train: Dataset({
        features: ['dyu', 'fr'],
        num_rows: 6452
    })
    test: Dataset({
        features: ['dyu', 'fr'],
        num_rows: 1613
    })
})

In [11]:
# write the appropriate format before feeding to t5 model, also perform tokenization

source_lang = "dyu"
target_lang = "fr"
prefix = "translate from Dyula to French: "

def preprocess_function(examples):
    inputs = [prefix + example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=300, truncation=True)
    return model_inputs

In [12]:
# Formate and Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/6452 [00:00<?, ? examples/s]

Map:   0%|          | 0/1613 [00:00<?, ? examples/s]

In [13]:
# collate the dataset
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model = model)

In [14]:
# This is our metrics for calculating bleu score
metric = evaluate.load("sacrebleu")

In [15]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [16]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import numpy as np

# Calculate the number of training steps per epoch
epoch_steps = int(np.ceil(len(tokenized_dataset['train']) / 16))

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="dyu_fr_model",
    learning_rate=3e-5,  # Initial learning rate
    eval_steps=epoch_steps // 2,
    save_steps=epoch_steps,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=3,
    evaluation_strategy="steps",
    logging_steps=epoch_steps // 4,
    logging_strategy="steps",
    save_strategy="steps",
    save_total_limit=3,
    greater_is_better=True,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    warmup_steps=epoch_steps // 10,
    lr_scheduler_type='cosine',
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    label_smoothing_factor=0.1,
    optim="adafactor",

)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]  # Stop if no improvement in 10 evaluations
)




In [17]:
# perform training
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [18]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmusamuhammadtukur127[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Bleu,Gen Len
202,4.4623,4.234174,0.4083,16.1978
404,4.341,4.122054,0.6362,16.1854
606,4.2228,4.067868,0.6501,16.3844
808,4.2113,4.035388,0.7832,16.3738
1010,4.1457,4.026258,0.6295,16.323


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1209, training_loss=4.292055494436259, metrics={'train_runtime': 1040.6486, 'train_samples_per_second': 18.6, 'train_steps_per_second': 1.162, 'total_flos': 996183746703360.0, 'train_loss': 4.292055494436259, 'epoch': 2.996282527881041})

In [19]:
LOCAL_SAVE_DIR = "dyu_to_fr_model"

In [20]:
trainer.tokenizer.save_pretrained(LOCAL_SAVE_DIR)
trainer.model.save_pretrained(LOCAL_SAVE_DIR)

In [None]:
prefix

In [None]:
text = train['dyu'].iloc[10]
text

In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch

model = T5ForConditionalGeneration.from_pretrained(LOCAL_SAVE_DIR)
tokenizer = AutoTokenizer.from_pretrained(LOCAL_SAVE_DIR)

inputs = tokenizer(prefix + text, max_length=300, truncation=True, return_tensors="pt")
outputs = model.generate(inputs.input_ids, max_new_tokens=40, do_sample=True, top_k=20, top_p=0.7)
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

In [None]:
train['fr'].iloc[10]