# Fine-Tuning

## prepare data

In [3]:
from torch.utils.data import Dataset
from datasets import load_dataset

In [5]:
""" 
construct dataset
"""

# we don't really need this dataset class, here is only for demonstration
# in case the data are not from hugging face but othere sources

# For HF data sets, we can just load the data using load_dataset which will
# return the structure data.
# The loaded data can then be filtered, mapped using dataset methods
# For details, see "hg_transformers_datasets.ipynb"

class TranslationDataset(Dataset) :

    def __init__(self) :

        super().__init__()
        self.data = load_dataset("opus_books", 'en-fr', split='train')
        
    def __len__(self) :
        return len(self.data)
    
    def __getitem__(self, index) :

        return self.data[index]["translation"]['en'], self.data[index]["translation"]['fr']


In [6]:
# show some data in dataset

dataset = TranslationDataset()
for i in range(4) :
    print(dataset[i])

('The Wanderer', 'Le grand Meaulnes')
('Alain-Fournier', 'Alain-Fournier')
('First Part', 'PREMIÈRE PARTIE')
('I', 'CHAPITRE PREMIER')


In [7]:
"""
split dataset into train and valid sets
"""

from torch.utils.data import random_split

train_set, valid_set = random_split(dataset, lengths=[0.8, 0.2])

In [8]:
# 
len(train_set), len(valid_set)

(101668, 25417)

In [None]:
# we can also use the dataset function to split the dataset

# dataset = dataset.train_test_split(test_size=0.2)

In [16]:
"""
construct data loader
"""

from torch.utils.data import DataLoader

train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=1, shuffle=False)

In [10]:
# to show batch data in data loader

next(iter(train_loader))

[('"Nevertheless, Lord Glenarvan kept the promise which he had given.',
  "'I will work harder, then,' says I, 'and you shall have it all.'",
  'Mr. Bingley was unaffectedly civil in his answer, and forced his younger sister to be civil also, and say what the occasion required.',
  'After my broken and imperfect prayer was over, I drank the rum in which I had steeped the tobacco, which was so strong and rank of the tobacco that I could scarcely get it down; immediately upon this I went to bed.',
  'Depuis que la Fausta avait témoigné le désir d’un rendez-vous, toute cette chasse semblait bien longue à Fabrice.',
  'All the fires must be extinguished, so that nothing may betray the presence of men on the island."',
  'Bouteloup didn\'t even wait until the husband had gone!"',
  'To prevent my being known, I pulled off my blue apron, and wrapped the bundle in it, which before was made up in a piece of painted calico, and very remarkable; I also wrapped up my straw hat in it, and so put t

In [47]:
"""
preprocess data
"""

# to be more efficient, it is better to process the data in batch instead of one by one
# So, instead of adding the processing in the dataset construction, we use pass a processing function
# to the dataloader, which allows to process the data in batch

# return_tensor allows to transform the tokenized data into the format we'd like to use to train or do inference

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

def preprocess(batch) :

    prefix = "translate English to French: "
    inputs = [prefix + example[0] for example in batch]
    targets = [example[1] for example in batch]
    model_inputs = tokenizer(inputs, text_target=targets, padding="max_length", max_length=500, truncation=True, return_tensors="pt")
    return model_inputs
    

In [48]:
# data loader with preprocessing function

train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=preprocess)
valid_loader = DataLoader(valid_set, batch_size=1, shuffle=False, collate_fn=preprocess)

In [49]:
# show some data

next(iter(train_loader))

{'input_ids': tensor([[13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0],
        ...,
        [13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[14976,    30,  1903,  ...,     0,     0,     0],
        [  283,     5,  4004,  ...,     0,     0,     0],
        [    3,   104,  3307,  ...,     0,     0,     0],
        ...,
        [    3,   104, 27889,  ...,     0,     0,     0],
        [24470,    15, 26182,  ...,     0,     0,     0],
        [  695,  4154,     6,  ...,     0,     0,     0]])}

## Prepare Model

In [53]:
# get model

from transformers import T5ForConditionalGeneration
import torch

model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

# use gpu is available
if torch.cuda.is_available():
    model = model.cuda(1) # 1 is optional, I used 1 since my cuda0 is busy

In [54]:
# get optimizer

from torch.optim import Adam

optimizer = Adam(model.parameters(), lr = 1e-4)

In [50]:
# training

def train(epochs=2, log_steps=100) :

    gSteps = 0

    for e in range(epochs) :
        model.train()

        for batch in train_loader :

            if torch.cuda.is_available():
                
                batch = {k: v.cuda(1) for k, v in batch.items()}

            output = model(**batch)

            optimizer.zero_grad()

            output.loss.backward()

            optimizer.step()

            if gSteps % log_steps == 0:

                print(f"epoch: {e}, steps: {gSteps}, loss: {output.loss.item()}")
            
            gSteps += 1

In [55]:
train()

epoch: 0, steps: 0, loss: 15.056625366210938
epoch: 0, steps: 100, loss: 0.3115119934082031
epoch: 0, steps: 200, loss: 0.2316114604473114
epoch: 0, steps: 300, loss: 0.1704372614622116
epoch: 0, steps: 400, loss: 0.15944263339042664
epoch: 0, steps: 500, loss: 0.17312923073768616
epoch: 0, steps: 600, loss: 0.12698674201965332
epoch: 0, steps: 700, loss: 0.23214466869831085
epoch: 0, steps: 800, loss: 0.15984416007995605
epoch: 0, steps: 900, loss: 0.18747538328170776
epoch: 0, steps: 1000, loss: 0.18290716409683228
epoch: 0, steps: 1100, loss: 0.16001208126544952
epoch: 0, steps: 1200, loss: 0.19968031346797943
epoch: 0, steps: 1300, loss: 0.19356799125671387
epoch: 0, steps: 1400, loss: 0.17190241813659668
epoch: 0, steps: 1500, loss: 0.17707660794258118
epoch: 0, steps: 1600, loss: 0.11719249933958054
epoch: 0, steps: 1700, loss: 0.17830918729305267
epoch: 0, steps: 1800, loss: 0.11448365449905396
epoch: 0, steps: 1900, loss: 0.20359036326408386
epoch: 0, steps: 2000, loss: 0.19085

## Inference

In [57]:
# For inference, we should do a iteration for search, which is out of scope here.
# The different techniques of search will be explored later.

# So for simplicity, we use pipeline for inference

from transformers import pipeline

pipe = pipeline("translation_xx_to_yy", model=model, tokenizer=tokenizer, device=1)

In [60]:
text = "translate English to French: Building a neural network model from scratch in PyTorch is easier than it sounds."
pipe(text)

Your input_length: 25 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)




[{'translation_text': 'Il est plus facile de construire un modèle de réseau neural à zéro dans PyT'}]