In [None]:
%%capture
!git clone https://github.com/paolo-gajo/food.git
!pip install datasets
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
from google.colab import userdata
token = userdata.get('HF_TOKEN')

In [None]:
# Work from inside the cloned repository so relative paths resolve against it.
%cd ./food
import sys
# Make the modules under src/word_alignment importable
# (presumably where the `utils` module imported below lives - TODO confirm).
sys.path.append('./src/word_alignment')

In [1]:
import warnings
import os
import torch
from tqdm.auto import tqdm
from datasets import DatasetDict
from evaluate import load
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from utils import data_loader, SquadEvaluator, TASTEset, XLWADataset, save_local_model, push_card
from datetime import datetime
from huggingface_hub import HfApi
import pandas as pd

# Checkpoint name: used to load the tokenizer here and passed to the dataset
# builder as `tokenizer_name` in the next cell. Swap the comment to try mDeBERTa.
model_name = 'bert-base-multilingual-cased'
# model_name = 'microsoft/mdeberta-v3-base'
# Timestamp of this run (not referenced again in the visible cells).
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d_%H-%M-%S")

# Language codes selected for this run; in the visible cells they are only
# used to build `lang_id`. Uncomment entries to add more.
languages = [
    # 'ru',
    # 'nl',
    'it',
    # 'pt',
    # 'et',
    # 'es',
    # 'hu',
    # 'da',
    # 'bg',
    # 'sl',
]

# Hyphen-joined language codes, used to name the results directory.
lang_id = '-'.join(languages)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sanity check: size of the tokenizer vocabulary.
print(tokenizer.vocab_size)

  from .autonotebook import tqdm as notebook_tqdm


119547


In [2]:
# NOTE(review): both paths are absolute and specific to one machine; they will
# not exist in a fresh clone - consider making them relative to the repository.
data_path = '/home/pgajo/working/food/data/EW-TASTE_en-it_DEEPL.json'
results_path = f'/home/pgajo/working/food/results/tasteset/{lang_id}'

# Build the dataset object from the JSON dump using the project loader.
data = TASTEset.from_json(
        data_path,
        tokenizer_name = model_name,
        shuffle_languages=['it'],
        src_lang = 'en',
        dev_size = 0.2,       # presumably the fraction held out for the dev split
        shuffled_size = 0,
        unshuffled_size = 1,
        # drop_duplicates = False,
        # debug_dump = True,
        n_rows=200,           # small subset of rows for quick experiments
        )

# Wrap the data with the project helper; the result is indexed by split name
# ('train', 'dev') in the cells below.
batch_size = 8
dataset = data_loader(data,
                    batch_size,
                    )

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on the first `.to(device)` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Creating samples (unshuffled)...: 100%|██████████| 200/200 [00:01<00:00, 184.75it/s]
Map: 100%|██████████| 712/712 [00:00<00:00, 5115.66 examples/s]
Map: 100%|██████████| 179/179 [00:00<00:00, 4583.18 examples/s]


In [3]:
# Peek at the raw 'query' field of the first training sample.
data.raw_data['train'][0]['query']

'2 ounces gin (Plymouth);0.5 ounces cointreau;0.5 ounces lillet blanc (or • Cocchi Americano •);3 dashes absinthe'

In [12]:
from torch import nn
class NeuralNetwork(nn.Module):
    """Token-wise feed-forward baseline.

    Every token id is embedded and then passed through a small MLP that emits
    `num_labels` logits for that token. The layers act on the last dimension
    only, so positions are processed independently of each other.

    Args:
        vocab_size: Number of rows of the embedding table. When ``None``
            (default) the vocabulary size of the notebook-level ``tokenizer``
            is used, which is the original behaviour.
        hidden_size: Width of the embedding and of the hidden layers.
        num_labels: Number of logits produced per token.
    """

    def __init__(self, vocab_size=None, hidden_size=512, num_labels=2):
        super().__init__()
        if vocab_size is None:
            # Backward-compatible default: size the table after the global tokenizer.
            vocab_size = tokenizer.vocab_size
        # Flattens everything after the batch dimension; input that is already
        # 2-D (batch, sequence) passes through unchanged.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Embedding(vocab_size, hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_labels),
        )

    def forward(self, **batch):
        """Return per-token logits for ``batch['input_ids']``.

        Only the ``input_ids`` entry is read; any other keyword argument
        (e.g. masks or labels) is accepted and ignored.
        """
        token_ids = self.flatten(batch['input_ids'])
        return self.linear_relu_stack(token_ids)

# Instantiate the model and move its parameters to the selected device.
model = NeuralNetwork().to(device)
# Print the module tree as a quick check of the layer layout.
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Embedding(119547, 512)
    (1): Linear(in_features=512, out_features=512, bias=True)
    (2): ReLU()
    (3): Linear(in_features=512, out_features=512, bias=True)
    (4): ReLU()
    (5): Linear(in_features=512, out_features=2, bias=True)
  )
)


In [14]:
# Smoke test: push only the first training batch through the model.
for batch in dataset['train']:
    # Tensor.to() returns the moved tensor, so it is stored back into the batch.
    for key in batch.keys():
        batch[key] = batch[key].to(device)
    outputs = model(**batch)
    # Show the raw outputs together with their shape.
    print(outputs, outputs.shape)
    break  # one batch is enough for this check

tensor([[[ 0.0503, -0.0801],
         [ 0.0946, -0.0594],
         [ 0.0005, -0.0655],
         ...,
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056]],

        [[ 0.0503, -0.0801],
         [ 0.0251, -0.0750],
         [ 0.0291, -0.0503],
         ...,
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056]],

        [[ 0.0503, -0.0801],
         [-0.0083, -0.2299],
         [ 0.1184, -0.1218],
         ...,
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056]],

        ...,

        [[ 0.0503, -0.0801],
         [ 0.0598, -0.1037],
         [ 0.1109, -0.1295],
         ...,
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056]],

        [[ 0.0503, -0.0801],
         [ 0.0946, -0.0594],
         [-0.0312, -0.1064],
         ...,
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056],
         [ 0.0511, -0.1056]],

        [[ 0.0503, -0.0801],
       

In [16]:
# Inspect the shape of the `input_ids` tensor of the current batch.
batch['input_ids'].shape

torch.Size([8, 262])

In [18]:
# Stand-alone embedding table (one 512-dim vector per vocabulary entry),
# created outside the model for experimentation.
embedding = nn.Embedding(tokenizer.vocab_size,512)
embedding

Embedding(119547, 512)

In [34]:
# Tokenize a sample sentence; squeeze() drops the size-1 batch dimension that
# comes with return_tensors='pt'.
tokens = tokenizer('hello my name is paolo', return_tensors='pt')['input_ids'].squeeze()
print(tokens)
# Decode each id separately to see the individual word pieces.
print([tokenizer.decode(token) for token in tokens])
# The embedding lookup yields one row per token.
embedding(tokens).shape

tensor([  101, 61694, 10133, 15127, 11324, 10124, 10931, 19139,   102])
['[CLS]', 'hell', '##o', 'my', 'name', 'is', 'pa', '##olo', '[SEP]']


torch.Size([9, 512])

In [None]:
# Optimizer hyper-parameters. They stay as notebook-level names because the
# model card assembled further down in this cell reports them.
lr = 3e-5
eps = 1e-8
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=eps)

# Project evaluator, given the tokenizer, the model and the "squad_v2" metric.
evaluator = SquadEvaluator(tokenizer, model, load("squad_v2"))

# Train for `epochs` epochs; after each training pass run an evaluation pass
# on the dev split and let the evaluator decide about early stopping.
epochs = 1
for epoch in range(epochs):
    # ---- train ----
    epoch_train_loss = 0
    model.train()
    split = 'train'
    progbar = tqdm(enumerate(dataset[split]),
                            total=len(dataset[split]),
                            desc=f"{split} - epoch {epoch + 1}")
    for i, batch in progbar:
        # Tensor.to() is not in-place: the moved tensor must be stored back,
        # otherwise the batch stays on its original device.
        for key in batch.keys():
            batch[key] = batch[key].to(device)
        outputs = model(**batch) # ['loss', 'start_logits', 'end_logits']
        # NOTE(review): the comment above suggests `outputs` was expected to
        # expose a loss at index 0. The NeuralNetwork defined earlier in this
        # notebook returns a bare logits tensor, for which outputs[0] is the
        # logits of the first sample - confirm this is the intended signal.
        loss = outputs[0].mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Running mean of the loss shown in the progress bar.
        epoch_train_loss += loss.item()
        loss_tmp = round(epoch_train_loss / (i + 1), 4)
        progbar.set_postfix({'Loss': loss_tmp})

        evaluator.get_eval_batch(outputs, batch, split)

    evaluator.evaluate(model, split, epoch)
    epoch_train_loss /= len(dataset[split])
    evaluator.epoch_metrics[f'{split}_loss'] = epoch_train_loss

    evaluator.print_metrics(current_epoch = epoch, current_split = split)

    # ---- eval on dev ----
    epoch_dev_loss = 0
    model.eval()
    split = 'dev'
    progbar = tqdm(enumerate(dataset[split]),
                            total=len(dataset[split]),
                            desc=f"{split} - epoch {epoch + 1}")
    for i, batch in progbar:
        # Dev batches need the same device placement as the training ones.
        for key in batch.keys():
            batch[key] = batch[key].to(device)
        # No gradients are needed for evaluation.
        with torch.inference_mode():
            outputs = model(**batch)
        loss = outputs[0].mean()
        epoch_dev_loss += loss.item()
        loss_tmp = round(epoch_dev_loss / (i + 1), 4)
        progbar.set_postfix({'Loss': loss_tmp})

        evaluator.get_eval_batch(outputs, batch, split)

    evaluator.evaluate(model, split, epoch)
    epoch_dev_loss /= len(dataset[split])
    evaluator.epoch_metrics[f'{split}_loss'] = epoch_dev_loss

    evaluator.print_metrics(current_epoch = epoch, current_split = split)

    evaluator.store_metrics()

    if evaluator.stop_training:
        # Single literal: the former backslash continuation embedded the
        # source indentation (a run of spaces) in the printed message.
        print(f'Early stopping triggered on epoch {epoch}.'
              f'\nBest epoch: {evaluator.epoch_best}.')
        break

# Print the final metrics and persist them to the results folder.
evaluator.print_metrics()
# exist_ok=True replaces the separate isdir() check (no check-then-create gap).
os.makedirs(results_path, exist_ok=True)
evaluator.save_metrics_to_csv(results_path)

# Short aliases of the supported checkpoints, used in save and repo names.
model_dict = {
    'bert-base-multilingual-cased': 'mbert',
    'microsoft/mdeberta-v3-base': 'mdeberta',
}

# If the dataset object does not expose the ratios, fall back to the values
# that were passed to the loader (unshuffled 1, shuffled 0).
if not hasattr(data, 'unshuffled_size'):
    data.unshuffled_size = 1
if not hasattr(data, 'shuffled_size'):
    data.shuffled_size = 0

# model save folder
model_dir = './models'
# The name encodes model alias, dataset name, sampling ratios, best epoch,
# best dev exact-match score and the drop-duplicates flag.
save_name = f"{model_dict[model_name]}_{data.name}_U{data.unshuffled_size}_S{data.shuffled_size}_E{evaluator.epoch_best}_DEV{str(round(evaluator.exact_dev_best, ndigits=0))}_DROP{str(int(data.drop_duplicates))}"
save_name = save_name + "_test" # comment if not testing
model_save_dir = os.path.join(model_dir, f"{data.name}/{save_name}")
os.makedirs(model_save_dir, exist_ok=True)
# NOTE(review): the 'metrics' sub-path is not created here; presumably
# save_metrics_to_csv handles that itself - confirm.
evaluator.save_metrics_to_csv(os.path.join(model_save_dir, 'metrics'))
save_local_model(model_save_dir, model, tokenizer)

# Target repository on the Hugging Face Hub, named after the local save name.
repo_id = f"pgajo/{save_name}"
print('repo_id', repo_id)
api = HfApi(token = os.environ['HF_TOKEN'])
# exist_ok=True keeps the cell re-runnable: without it a second run fails
# because the repository was already created by the first one.
api.create_repo(repo_id, exist_ok=True)

# Per-epoch metrics table for the model card: rounded, with 1-based epoch index.
metrics_table = pd.DataFrame(evaluator.metrics).round(2)
metrics_table.index += 1
metrics_table.index.name = 'epoch'
# Keep `df_desc` as the markdown text, without first binding it to a DataFrame.
df_desc = metrics_table.to_markdown()

# Free-text description embedded in the model card.
model_description = f'''
Model: {model_dict[model_name]}\n
Dataset: {data.name}\n
Unshuffled ratio: {data.unshuffled_size}\n
Shuffled ratio: {data.shuffled_size}\n
Best exact match epoch: {evaluator.epoch_best}\n
Best exact match: {str(round(evaluator.exact_dev_best, ndigits=2))}\n
Drop duplicates: {data.drop_duplicates}\n
Optimizer lr = {lr}\n
Optimizer eps = {eps}\n
Batch size = {batch_size}\n
Metrics:\n
{df_desc}
'''
push_card(repo_id=repo_id,
        model_name=model_name,
        model_description=model_description,
        )

In [None]:
# Upload the contents of `model_save_dir` to the `repo_id` repository, using
# the `api` client, `repo_id` and `model_save_dir` defined in the previous cell.
api.upload_folder(repo_id=repo_id, folder_path=model_save_dir) 