In [1]:
!pip install -q datasets transformers sentencepiece evaluate accelerate wandb

In [2]:
import random
from functools import partial

import numpy as np
import torch
import wandb
from datasets import load_dataset, Dataset
from evaluate import evaluator
from torch import autocast
from torch.cuda.amp import GradScaler
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import T5TokenizerFast, T5ForConditionalGeneration

In [3]:
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [4]:
%env WANDB_PROJECT=t5_translate_en_it

env: WANDB_PROJECT=t5_translate_en_it


In [5]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmeraxes[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
def load_opus_dataset(src, tgt, tokenizer):
    dataset = load_dataset("opus_euconst", f"{src}-{tgt}")

    dataset = dataset.shuffle(seed=42)

    dataset["validation"] = Dataset.from_dict(dataset["train"][:int(len(dataset["train"]) / 10)],
                                              features=dataset["train"].features)

    dataset["train"] = Dataset.from_dict(dataset["train"][int(len(dataset["train"]) / 10):],
                                         features=dataset["train"].features)

    def tokenization(sample):
        # TODO do not truncate validation
        # TODO add truncated tokens as new samples
        model_inputs = tokenizer(sample["translation"]["en"], padding=True, 
                                 truncation=True, max_length=100)

        labels = tokenizer(text_target=sample["translation"]["it"], padding=True, 
                           truncation=True, max_length=100)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    dataset = dataset.map(tokenization, batched=False, batch_size=None, remove_columns=["translation"])

    return dataset

In [8]:
def predict(model, tokenizer, sentence):
    temp = tokenizer.encode(sentence, return_tensors="pt").to(DEVICE)

    model.eval()
    with torch.no_grad():
        out = model.generate(temp)

    return tokenizer.decode(out[0], skip_special_tokens=True)

In [9]:
def evaluate(model, tokenizer):
    with open("../data/dataset/newssyscomb2009.en", "r") as file:
        data_en = file.read()

    with open("../data/dataset/newssyscomb2009.it", "r") as file:
        data_it = file.read()

    test_dataset = Dataset.from_dict({"text": data_en.split("\n"), "label": data_it.split("\n")})

    task_evaluator = evaluator("translation")

    results = task_evaluator.compute(
        model_or_pipeline=model,
        data=test_dataset,
        tokenizer=tokenizer,
        metric="bleu")
    return results

In [10]:
tokenizer = T5TokenizerFast.from_pretrained("t5-base")

model = T5ForConditionalGeneration.from_pretrained("t5-base").to(DEVICE)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [11]:
dataset = load_opus_dataset("en", "it", tokenizer)



  0%|          | 0/1 [00:00<?, ?it/s]



Map:   0%|          | 0/9066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1007 [00:00<?, ? examples/s]

In [12]:
config = {
    "lr": 5e-05,
    "epochs": 25,
    "batch_size": 32,
    "warmup_ratio": 0.2
}

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id
)

In [13]:
train_loader = DataLoader(
    dataset["train"],
    batch_size=config["batch_size"],
    collate_fn=data_collator,
    drop_last=False,
    num_workers=0,
    pin_memory=True
)

optimizer = AdamW(model.parameters(), lr=config["lr"], betas=(0.9, 0.999), eps=1e-08)

num_training_steps = float(len(train_loader) * config["epochs"])

num_warmup_steps = num_training_steps* config["warmup_ratio"]

def lr_lambda(x: float, warmup: float, total: float):
    return (x+1) / warmup if x < warmup else (total - x) / (total - warmup)

lr_scheduler = LambdaLR(optimizer, partial(lr_lambda, warmup=num_warmup_steps, 
                                            total=num_training_steps))

scaler = GradScaler()

In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    args=Seq2SeqTrainingArguments(output_dir="dummy_dir"),
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
)

In [15]:
def train_epoch(model, optimizer, scaler, lr_scheduler, train_loader):
    model.train()
    for step, inputs in tqdm(enumerate(train_loader), total=len(train_loader)):
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        with autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(**inputs)
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

        scaler.scale(outputs.loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        model.zero_grad()
    return outputs.loss.detach().cpu().item(), lr_scheduler.get_last_lr()

In [None]:
wandb.init(project="t5_translate_en_it", config=config)

model.zero_grad()
for epoch in range(config["epochs"]):
    train_loss, last_lr = train_epoch(model, optimizer, scaler, lr_scheduler, train_loader)
    eval_results = trainer.evaluate()
    log_dict = {"eval/loss": eval_results['eval_loss'],
                "train/loss": train_loss}
    print(log_dict)
    wandb.log(log_dict)
wandb.finish()

  0%|          | 0/284 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval/loss': 2.31345534324646, 'train/loss': 3.059263229370117}


  0%|          | 0/284 [00:00<?, ?it/s]

{'eval/loss': 1.0316932201385498, 'train/loss': 1.2824958562850952}


  0%|          | 0/284 [00:00<?, ?it/s]

{'eval/loss': 0.810147225856781, 'train/loss': 1.0639967918395996}


  0%|          | 0/284 [00:00<?, ?it/s]

{'eval/loss': 0.6570963859558105, 'train/loss': 0.9114643335342407}


  0%|          | 0/284 [00:00<?, ?it/s]

{'eval/loss': 0.5518368482589722, 'train/loss': 0.7564525604248047}


  0%|          | 0/284 [00:00<?, ?it/s]

{'eval/loss': 0.482863187789917, 'train/loss': 0.6933639645576477}


  0%|          | 0/284 [00:00<?, ?it/s]

{'eval/loss': 0.43730199337005615, 'train/loss': 0.5975403189659119}


  0%|          | 0/284 [00:00<?, ?it/s]

{'eval/loss': 0.4067593216896057, 'train/loss': 0.5226086378097534}


  0%|          | 0/284 [00:00<?, ?it/s]