In [2]:
import pandas as pd
import re
import string
import torch
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('./data/Train.csv' )

In [4]:
df.head()

Unnamed: 0,ID,Yoruba,English
0,ID_AAJEQLCz,A ṣètò Ìgbìmọ̀ Tó Ń Ṣètò Ìrànwọ́ Nígbà Àjálù l...,A Disaster Relief Committee was formed to orga...
1,ID_AASNedba,"Ìrọ̀lẹ́ May 22, 2018 ni wọ́n fàṣẹ ọba mú Arákù...",Brother Solovyev was arrested on the evening o...
2,ID_AAeQrhMq,Iléeṣẹ́ Creative Commons náà,Creative Commons the Organization
3,ID_AAxlMgPP,"Pè̩lú Egypt, Morocco àti Tunisia tí wó̩n ti lo...","With Egypt, Morocco and Tunisia out of the Wor..."
4,ID_ABKuMKSx,Adájọ́ àgbà lórílẹ̀ èdè Náíjíríà (Attorney Gen...,"The Attorney General of the Federation, Justic..."


In [5]:
# converting every letter to lower case
df['Yoruba'] = df['Yoruba'].apply(lambda x: str(x).lower())
df['English'] = df['English'].apply(lambda x: str(x).lower())

In [6]:
df.head()

Unnamed: 0,ID,Yoruba,English
0,ID_AAJEQLCz,a ṣètò ìgbìmọ̀ tó ń ṣètò ìrànwọ́ nígbà àjálù l...,a disaster relief committee was formed to orga...
1,ID_AASNedba,"ìrọ̀lẹ́ may 22, 2018 ni wọ́n fàṣẹ ọba mú arákù...",brother solovyev was arrested on the evening o...
2,ID_AAeQrhMq,iléeṣẹ́ creative commons náà,creative commons the organization
3,ID_AAxlMgPP,"pè̩lú egypt, morocco àti tunisia tí wó̩n ti lo...","with egypt, morocco and tunisia out of the wor..."
4,ID_ABKuMKSx,adájọ́ àgbà lórílẹ̀ èdè náíjíríà (attorney gen...,"the attorney general of the federation, justic..."


In [7]:
# removing apostrophe from the sentences
df['Yoruba'] = df['Yoruba'].apply(lambda x: re.sub("'","",x))
df['English'] = df['English'].apply(lambda x: re.sub("'","",x))
exclude = set(string.punctuation)

In [8]:
df.head()

Unnamed: 0,ID,Yoruba,English
0,ID_AAJEQLCz,a ṣètò ìgbìmọ̀ tó ń ṣètò ìrànwọ́ nígbà àjálù l...,a disaster relief committee was formed to orga...
1,ID_AASNedba,"ìrọ̀lẹ́ may 22, 2018 ni wọ́n fàṣẹ ọba mú arákù...",brother solovyev was arrested on the evening o...
2,ID_AAeQrhMq,iléeṣẹ́ creative commons náà,creative commons the organization
3,ID_AAxlMgPP,"pè̩lú egypt, morocco àti tunisia tí wó̩n ti lo...","with egypt, morocco and tunisia out of the wor..."
4,ID_ABKuMKSx,adájọ́ àgbà lórílẹ̀ èdè náíjíríà (attorney gen...,"the attorney general of the federation, justic..."


In [9]:
# removing digits from the sentences
digit = str.maketrans('','',string.digits)
df['Yoruba'] = df['Yoruba'].apply(lambda x: x.translate(digit))
df['English'] = df['English'].apply(lambda x: x.translate(digit))

In [10]:
df.head()

Unnamed: 0,ID,Yoruba,English
0,ID_AAJEQLCz,a ṣètò ìgbìmọ̀ tó ń ṣètò ìrànwọ́ nígbà àjálù l...,a disaster relief committee was formed to orga...
1,ID_AASNedba,"ìrọ̀lẹ́ may , ni wọ́n fàṣẹ ọba mú arákùnrin s...",brother solovyev was arrested on the evening o...
2,ID_AAeQrhMq,iléeṣẹ́ creative commons náà,creative commons the organization
3,ID_AAxlMgPP,"pè̩lú egypt, morocco àti tunisia tí wó̩n ti lo...","with egypt, morocco and tunisia out of the wor..."
4,ID_ABKuMKSx,adájọ́ àgbà lórílẹ̀ èdè náíjíríà (attorney gen...,"the attorney general of the federation, justic..."


In [11]:
# using pretrained model and then finetuning it on our dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-mul-en")

In [12]:
# optimize model for training, set learning rate to 0.0001
optimizer = torch.optim.AdamW(model.parameters(),lr=0.0001)

In [13]:
def train_model():
    model.train()
    losses = 0
    X = df['Yoruba']
    y = df['English']
    max_epochs = 27
    n_batches = 32
    for epoch in tqdm(range(max_epochs)):
        for i in tqdm(range(125)):
            # making batches 
            local_X, local_y = X[i*n_batches:(i+1)*n_batches,], y[i*n_batches:(i+1)*n_batches,]
            # preparing the data according to the model input
            batch = tokenizer.prepare_seq2seq_batch(list(local_X),list(local_y),return_tensors='pt')
            output = model(**batch)
            # loss can be taken directly from the model output
            loss = output.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses = losses+loss
    average = losses/len(df)
    print('Loss: ' + str(average) )
    
    return model


In [None]:
model = train_model()

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]