# T5 for Cross-Language Plagiarism Detection

Author: João Phillipe Cardenuto

In this notebook we implement a model regarding the Detailed Analysis of the CLPD.

# Import Libraries

In [2]:
! pip install -q pytorch-lightning
! pip install -q transformers

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [1]:
# Comum libraries
import os
import random
from typing import Dict
from typing import List
import numpy as np
import pandas as pd
import re
from argparse import Namespace
from tqdm.notebook  import trange, tqdm_notebook

# Dataset
import sys
sys.path.insert(0, "/work/src/DataloaderCLPD/")
from LoadDataset import *

# Torch
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

# HugginFace
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Sklearn
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Tersorflow
import tensorboard
%load_ext tensorboard


# Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# Setup seeds
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)
if torch.cuda.is_available(): 
   dev = "cuda:0"
else: 
   dev = "cpu" 

device = torch.device(dev)
print("Device",dev)

if "cuda" in dev:
    print("GPU: ", torch.cuda.get_device_name(0))



Device cuda:0
GPU:  Quadro RTX 5000


In [2]:
# Loading T5 Tokenizer with portuguese chars
port_tokenizer = T5Tokenizer.from_pretrained('t5-base')
extra_tokens = ['Ç','ç','À' , 'È' , 'Ì' , 'Ò' , 'Ù' , 'à' , 'è' , 'ì' , 'ò' , 'ù' , 'Á' , 'É' , 'Í' , 'Ó' , 'Ú' , 'á' , 'é' , 'í' , 'ó' , 'ú' , 'Â' , 'Ê' , 'Î' , 'Ô' , 'Û' , 'â' , 'ê' , 'î' , 'ô' , 'û'  , 'Ã' ,'Ẽ', 'Õ'  , 'ã', 'ẽ','õ' , 'Ë', 'ä' , 'ë' , 'ï' , 'ö' , 'ü']
new_tokens = {}
for i in extra_tokens: print(f'({i},{port_tokenizer.decode(port_tokenizer.encode(i))})',end=", ")
print("\n","*-"*10,"New Tokens","*-"*10)
for i in extra_tokens: 
    # Add_tokens return 0 if token alredy exist, and 1 if It doesnt.
    if  port_tokenizer.add_tokens(i):
        print(f"{i},{port_tokenizer.encode(text=i,add_special_tokens=False,)}", end=" |")
        new_tokens[i] = port_tokenizer.encode(text=i,add_special_tokens=False)[0]

(Ç, ⁇ ), (ç,ç), (À, ⁇ ), (È, ⁇ ), (Ì, ⁇ ), (Ò, ⁇ ), (Ù, ⁇ ), (à,à), (è,è), (ì, ⁇ ), (ò, ⁇ ), (ù,ù), (Á, ⁇ ), (É,É), (Í, ⁇ ), (Ó, ⁇ ), (Ú, ⁇ ), (á,á), (é,é), (í, ⁇ ), (ó,ó), (ú, ⁇ ), (Â, ⁇ ), (Ê, ⁇ ), (Î,Î), (Ô, ⁇ ), (Û, ⁇ ), (â,â), (ê,ê), (î,î), (ô,ô), (û,û), (Ã, ⁇ ), (Ẽ, ⁇ ), (Õ, ⁇ ), (ã, ⁇ ), (ẽ, ⁇ ), (õ, ⁇ ), (Ë, ⁇ ), (ä,ä), (ë, ⁇ ), (ï, ⁇ ), (ö,ö), (ü,ü), 
 *-*-*-*-*-*-*-*-*-*- New Tokens *-*-*-*-*-*-*-*-*-*-
Ç,[32100] |À,[32101] |È,[32102] |Ì,[32103] |Ò,[32104] |Ù,[32105] |ì,[32106] |ò,[32107] |Á,[32108] |Í,[32109] |Ó,[32110] |Ú,[32111] |í,[32112] |ú,[32113] |Â,[32114] |Ê,[32115] |Ô,[32116] |Û,[32117] |Ã,[32118] |Ẽ,[32119] |Õ,[32120] |ã,[32121] |ẽ,[32122] |õ,[32123] |Ë,[32124] |ë,[32125] |ï,[32126] |

## Loading Data

Using LoadDataset module to load capes dataset

In [3]:
# We using sample size of x it represent x*(2*N_negatives + 2) = x*(ENG + PT + ENG_NEGATIVE_1 ... + ENG_NEGATIVE_N +
#                                                                            PT_NEGATIVE_1 + ... + PT_NEGATIVE_N)
capes_dataset = CLPDDataset(name='capes',data_type='train',sample_size=1000,val_size=0.2,n_negatives=1)


train_capes, val_capes = capes_dataset.get_organized_data(tokenizer=port_tokenizer,tokenizer_type='t5')

HBox(children=(FloatProgress(value=0.0, description='Processing CAPES TRAIN', max=800.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Processing CAPES VALIDATION', max=200.0, style=ProgressSt…




In [4]:
# Samples
print(train_capes[0])
print("Number of Samples:", len(train_capes.pairs))

(tensor([31447,  7142,   536,    10, 18913,   825,     3,   390,    30, 11252,
        13850,     7,    13,  3294,     6, 15290,    11,   827,    19,   261,
        14286,    12,     3,     2,     3,  2905,  6724,  1433,   825,     3,
           18,     3,     2,    21, 18913,  4210,    13,  5798,  5834,    46,
         7513,   539,  1164,    11,   258,     6,    28,     8,  3053,    13,
           46, 22820,     5,  7142,   357,    10,   561,   825,    32,     3,
         5058,    51,  2975,  1225,    32,  1247,     9,    26,    32,     3,
           29,     9,     7,     3,    15,  4960,  8970, 32123,     3,    15,
            7, 12205,  4572,     9,     7,    20,  3294,     9,     6, 13500,
        15644,    15,    20,     3,  7168,    23,   297,    32,     3,    15,
            3, 26491,     3,   154, 18187,    26,    32,     3,     9,   509,
         6042,    26,    32,     3,     9,    32,   825,    32,    20,     3,
         2905,  6724,  8202,    29,  4915,     3,     2,     3,

In [8]:
dataloader_debug = DataLoader(train_capes, batch_size=10, shuffle=True, 
                              num_workers=0)

token_ids, attention_mask, token_type_ids, labels, _ = next(iter(dataloader_debug))
print('token_ids:\n', token_ids)
print('token_type_ids:\n', token_type_ids)
print('attention_mask:\n', attention_mask)
print('labels:\n', labels)

print('token_ids.shape:', token_ids.shape)
print('token_type_ids.shape:', token_type_ids.shape)
print('attention_mask.shape:', attention_mask.shape)
print('labels.shape:', labels.shape)

token_ids:
 tensor([[31447,  7142,   536,  ...,     0,     0,     0],
        [31447,  7142,   536,  ...,     0,     0,     0],
        [31447,  7142,   536,  ...,     0,     0,     0],
        ...,
        [31447,  7142,   536,  ...,     0,     0,     0],
        [31447,  7142,   536,  ...,    41,  3647,     6],
        [31447,  7142,   536,  ...,     0,     0,     0]])
token_type_ids:
 tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
attention_mask:
 tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])
labels:
 tensor([[209,   1,   0,  ...,   0,   0,   0],
        [  3, 632,   1,  ...,   0,   0,   0],
        [209,   1,   0,  ...,   0,   0,   0],
        .

In [4]:
batch_size = 2
# train_dataloader = DataLoader(dataset_train, batch_size=batch_size,
#                               shuffle=True, num_workers=4)

val_dataloader = DataLoader(val_capes, batch_size=batch_size, shuffle=True, 
                            num_workers=4)

test_dataloader = DataLoader(val_capes, batch_size=batch_size, shuffle=False, 
                            num_workers=4)

# test_dataloader = DataLoader(dataset_test, batch_size=batch_size,
#                              shuffle=False, num_workers=4)

In [37]:
label

tensor([[  3, 632,   1,  ...,   0,   0,   0],
        [  3, 632,   1,  ...,   0,   0,   0]])

In [33]:
port_tokenizer.decode(a[0])

''

In [53]:
port_tokenizer.encode(f"{0} {port_tokenizer.eos_token}",max_length=3, pad_to_max_length=True)

[3, 632, 1]

In [36]:
[valid_prediction(a[index],label[index])
                             for index in range(len(a))]

[(1, 0), (1, 0)]

In [33]:
a = mode2l(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device), 
                    lm_labels=labels.to(device))[0]

NameError: name 'mode2l' is not defined

## T5-Model with Pytorch Lightning

In [3]:
def valid_prediction(pred,label):
    """
    Decode prediction and label. 
    
    Return ( prediction, label) if decode(pred) in {0,1},
            otherwise return (not label, label)
    """
    
    text_result = port_tokenizer.decode(pred)
    label = port_tokenizer.decode(label)
    
    # Check if string is numeric
    if text_result.replace('.','',1).isnumeric():
        value = float(text_result)
        if value == 1 or value == 0:
            return (int(value) , int(label))

    # Return a different number from the label
    return (int(not int(label)), int(label))


class T5Finetuner(pl.LightningModule):

    def __init__(self, hparams,train_dataloader,val_dataloader,test_dataloader):
        
        super(T5Finetuner, self).__init__()
        
        #Hiperparameters
        self.hparams = hparams
        
        self.experiment_name = f"{self.hparams.experiment_name}_{self.hparams.version}"

        # Dataloaders
        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader
        self._test_dataloader = test_dataloader
        # Learnning Rate and Loss Function
        self.learning_rate = hparams.learning_rate
        self.lossfunc = torch.nn.CrossEntropyLoss()
        # Optimizer
        self.optimizer = self.hparams.optimizer
        
        self.target_max_length = self.hparams.target_max_length

        # Retrieve model from Huggingface
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)
        

    def forward(self, input_ids, attention_mask, labels=None):
       
        # If labels are None, It will return a loss and a logit
        # Else it return the predicted logits for each sentence
        if self.training:

            # Ref https://huggingface.co/transformers/model_doc/t5.html#training
            loss =  self.model(input_ids=input_ids,
                          attention_mask=attention_mask,
                          lm_labels=labels)[0]
            return loss
        else:
            # REF https://huggingface.co/transformers/main_classes/model.html?highlight=generate#transformers.PreTrainedModel.generate
            predicted_token_ids = self.model.generate(
                                            input_ids=input_ids,
                                            max_length=self.target_max_length,
                                            do_sample=False,
                                            )
            return predicted_token_ids
        
    def training_step(self, batch, batch_nb):
        # batch
        input_ids, attention_mask, _, label,_ = batch
         
        # fwd
        loss = self(input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                    labels=label.to(device))
        
        # logs
        tensorboard_logs = {'train_loss': loss.item()}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_nb):
        # batch
        input_ids, attention_mask, _, labels,_ = batch
         
        # fwd
        predicted_token_ids = self(input_ids.to(device), attention_mask=None,)
        pred_true_decoded = [valid_prediction(predicted_token_ids[index],labels[index])
                             for index in range(len(predicted_token_ids))]
        
        y_pred = [y[0] for y in pred_true_decoded]
        y_true = [y[1] for y in pred_true_decoded]
        return {'y_pred': y_pred, 'y_true': y_true}

    def validation_epoch_end(self, outputs):
        
        y_true = np.array([ y for x in outputs for y in x['y_true'] ])
        y_pred = np.array([ y for x in outputs for y in x['y_pred'] ])
        
        val_f1 =  f1_score(y_pred=y_pred, y_true=y_true)
        
        val_f1 = torch.tensor(val_f1)
        
        tensorboard_logs = {'val_f1': val_f1 }
        
        return {'val_f1': val_f1,  
                'progress_bar': tensorboard_logs, "log": tensorboard_logs}

    def test_step(self, batch, batch_nb):
        input_ids, attention_mask, _ , labels, pairs = batch
        
        predicted_token_ids = self(input_ids.to(device), attention_mask=None)
        
        pred_true_decoded = [valid_prediction(predicted_token_ids[index],labels[index])
                             for index in range(len(predicted_token_ids))]        
        y_pred = [y[0] for y in pred_true_decoded]
        y_true = [y[1] for y in pred_true_decoded]

        return {'pairs': pairs, 'y_true': y_true, 'y_pred':y_pred }

    def test_epoch_end(self, outputs):
        
        
        pairs = [pair for x in outputs for pair in x['pairs']]
        y_true = np.array([ y for x in outputs for y in x['y_true'] ])
        y_pred = np.array([ y for x in outputs for y in x['y_pred'] ])
        
        # Write failure on file
        with open (f"FAILURE_{self.experiment_name}.txt", 'w') as file:
               for index,pair in enumerate(pairs):
                    if y_true[index] != y_pred[index]:
                        file.write("="*50+f"\n[Y_TRUE={y_true[index]} != Y_PRED={y_pred[index]}]\n"+pair \
                                  +'\n'+"="*50+'\n')
        
        print("CONFUSION MATRIX:")
        print(confusion_matrix(y_true=y_true, y_pred=y_pred))
        
        print("SKLEARN  REPORT")
        print(classification_report(y_true=y_true, y_pred=y_pred))
        
        
        test_f1 =  f1_score(y_pred=y_pred, y_true=y_true)
    
        tensorboard_logs = {'test_f1': test_f1}
        return {'test_f1': test_f1, 'log': tensorboard_logs,
                 'progress_bar': tensorboard_logs}
    
    def configure_optimizers(self):

        optimizer =  self.optimizer(
            [p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate)
        
        scheduler = StepLR(optimizer, step_size=self.hparams.steplr_epochs, gamma=self.hparams.scheduling_factor)

        return [optimizer], [scheduler]

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

    def test_dataloader(self):
        return self._test_dataloader

In [6]:

hyperparameters = {"experiment_name": "T5-DEBUG", 
                   "max_epochs": 2,
                   "patience": 4,
                   "optimizer": torch.optim.Adam,
                   "target_max_length": 10,
                   "scheduling_factor": 0.8, 
                   "learning_rate": 1e-5, 
                   "steplr_epochs":4,
                  }

In [7]:
model = T5Finetuner(hparams=Namespace(**hyperparameters),
                      train_dataloader=val_dataloader,
                      val_dataloader=test_dataloader,
                      test_dataloader=test_dataloader)

##  Number of Parameter 

In [8]:
sum([torch.tensor(x.size()).prod() for x in model.parameters() if x.requires_grad]) # trainable parameters

tensor(222903936)

## Fast dev run

In [9]:
trainer = pl.Trainer(gpus=1, 
                     logger=False,
                     checkpoint_callback=False,  # Disable checkpoint saving.
                     fast_dev_run=True,
                     amp_level='O2', use_amp=False
)
trainer.fit(model)
trainer.test(model)
del model 

Running in fast_dev_run mode: will run a full train, val and test loop using a single batch
GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | lossfunc                                                              | CrossEntropyLoss           | 0     
1   | model                                                                 | T5ForConditionalGeneration | 222 M 
2   | model.shared                                                          | Embedding                  | 24 M  
3   | model.encoder                                                         | T5Stack                    | 109 M 
4   | model.encoder.block                                                   | ModuleList                 | 8

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[0 0]
 [2 0]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.0}
--------------------------------------------------------------------------------



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Overfit on a Batch

We notice that easily the model can overfit on a batch

In [7]:
hyperparameters = {"experiment_name": "T5CLPD", 
                   "optimizer": torch.optim.Adam,
                   "target_max_length": 3,
                   "max_epochs": 5,
                   "patience": 4,
                    "steplr_epochs":5,
                    "scheduling_factor": 0.95,
                    "learning_rate": 6e-5,
                    "max_length": 200
                   }
trainer = pl.Trainer(gpus=1,
                     logger=False,
                     max_epochs=hyperparameters['max_epochs'],
                     check_val_every_n_epoch=5,
                     checkpoint_callback=False,  # Disable checkpoint saving
                     overfit_pct=0.5,
                     amp_level='O2', use_amp=False)

model = T5Finetuner(hparams=Namespace(**hyperparameters),
                      train_dataloader=val_dataloader,
                      val_dataloader=test_dataloader,
                      test_dataloader=test_dataloader)

trainer.fit(model)
trainer.test(model)


# del model  
# del trainer

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | lossfunc                                                              | CrossEntropyLoss           | 0     
1   | model                                                                 | T5ForConditionalGeneration | 222 M 
2   | model.shared                                                          | Embedding                  | 24 M  
3   | model.encoder                                                         | T5Stack                    | 109 M 
4   | model.encoder.block                                                   | ModuleList                 | 84 M  
5   | model.encoder.block.0                                                 | T5Block 

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[190  10]
 [  1 199]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       200
           1       0.95      0.99      0.97       200

    accuracy                           0.97       400
   macro avg       0.97      0.97      0.97       400
weighted avg       0.97      0.97      0.97       400

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.9731051344743276}
--------------------------------------------------------------------------------



In [9]:
train_capes[0]

(tensor([2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Training

In [4]:
# Training will perform a cross-dataset.
# Training on Capes testing on SciElo


max_length =  200
capes_dataset = CLPDDataset(name='capes',data_type='train',sample_size=100000,val_size=0.2,max_length=200,n_negatives=1)
capes_test = CLPDDataset(name='capes',data_type='test',n_negatives=1,max_length=200)

# T5 Tokenizer with portuguese chars
tokenizer = port_tokenizer

# Traning data 
train_capes, val_capes = capes_dataset.get_organized_data(tokenizer=tokenizer,tokenizer_type='t5')
test_capes = capes_test.get_organized_data(tokenizer=tokenizer,tokenizer_type='t5')


HBox(children=(FloatProgress(value=0.0, description='Processing CAPES TRAIN', max=80000.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='Processing CAPES VALIDATION', max=20000.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Processing CAPES TEST', max=7903.0, style=ProgressStyle(d…




In [5]:
len(test_capes)

47418

In [None]:
#------tester-----------#
#   DataLoaders     #
#-------------------#

batch_size = 32

train_dataloader = DataLoader(train_capes, batch_size=batch_size,
                              shuffle=True, num_workers=4)

val_dataloader = DataLoader(val_capes, batch_size=batch_size, shuffle=False, 
                            num_workers=4)

test_dataloader = DataLoader(test_capes, batch_size=batch_size,
                             shuffle=False, num_workers=4)



# Hiperparameters
hyperparameters = {"experiment_name": "T5-CAPES", 
                   "optimizer": torch.optim.Adam,
                   "target_max_length": 3,
                   "max_epochs": 3,
                   "patience": 4,
                    "steplr_epochs":1,
                    "scheduling_factor": 0.95,
                    "learning_rate": 6e-5,
                    "max_length": max_length,
                    'batch_size': batch_size
                   }
#------------------------------#
#       Checkpoints            #
#------------------------------#

log_path = 'logs'
ckpt_path = os.path.join(log_path, hyperparameters["experiment_name"], "-{epoch}-{val_f1:.2f}")  
checkpoint_callback = ModelCheckpoint(prefix=hyperparameters["experiment_name"],  # prefixo para nome do checkpoint
                                      filepath=ckpt_path,  # path onde será salvo o checkpoint
                                      monitor="val_f1", 
                                      mode="max",
                                      save_top_k=2)   
# Hard coded
# resume_from_checkpoint = '/content/drive/My Drive/P_IA376E_2020S1/Class-8 BERT/TASK/logs/Electra-400/Electra-400-epoch=37-val_loss=0.18.ckpt'
resume_from_checkpoint= False
# Configuração do Early Stop
early_stop = EarlyStopping(monitor="val_loss",  
                           patience=hyperparameters["patience"], 
                           verbose=False, 
                           mode='min'  
                           )
logger = TensorBoardLogger(hyperparameters["experiment_name"],name='T5' ,version="NEGATIVE_1")

# Lighting Trainer
trainer = pl.Trainer(gpus=1,
                     logger=logger,
                     max_epochs=hyperparameters["max_epochs"],
                     check_val_every_n_epoch=1,
                     accumulate_grad_batches=5,
                     checkpoint_callback=checkpoint_callback,
#                       resume_from_checkpoint=resume_from_checkpoint,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
model = T5Finetuner(hparams=hparams,train_dataloader=train_dataloader,val_dataloader=val_dataloader,test_dataloader=test_dataloader)

# Train
trainer.fit(model)

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | lossfunc                                                              | CrossEntropyLoss           | 0     
1   | model                                                                 | T5ForConditionalGeneration | 222 M 
2   | model.shared                                                          | Embedding                  | 24 M  
3   | model.encoder                                                         | T5Stack                    | 109 M 
4   | model.encoder.block                                                   | ModuleList                 | 84 M  
5   | model.encoder.block.0                                                 | T5Block 

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

## Test model on Capes dataset

In [6]:
trainer.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[30745   867]
 [  867 14939]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     31612
           1       0.95      0.95      0.95     15806

    accuracy                           0.96     47418
   macro avg       0.96      0.96      0.96     47418
weighted avg       0.96      0.96      0.96     47418

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.9451474123750474}
--------------------------------------------------------------------------------



-----

# Test model on Scielo dataset 

In [26]:

# T5 tokenizer with portuguese chars
tokenizer = port_tokenizer

max_length =  200
batch_size = 128

scielo_dataset = CLPDDataset(name='scielo',data_type='test',n_negatives=1,max_length=200)
scielo_dataset = scielo_dataset.get_organized_data(tokenizer=tokenizer,tokenizer_type='t5')
scielo_dataloader = DataLoader(scielo_dataset, batch_size=batch_size,
                              shuffle=False, num_workers=4)


HBox(children=(FloatProgress(value=0.0, description='Processing SCIELO TEST', max=9956.0, style=ProgressStyle(…




In [27]:
# Hiperparameters

hyperparameters = {"experiment_name":  "T5-CAPES",
                   "version": 'TEST-ON-SCIELO',
                   "optimizer": torch.optim.Adam,
                   "target_max_length": 3,
                   "max_epochs": 3,
                   "patience": 4,
                    "steplr_epochs":1,
                    "scheduling_factor": 0.95,
                    "learning_rate": 6e-5,
                    "max_length": max_length,
                    'batch_size': batch_size
                   }
#------------------------------#
#       Checkpoints            #
#------------------------------#

# Resume from checkpoint Hard coded 
resume_from_checkpoint= '/work/src/T5/logs/T5-CAPES/T5-CAPES-epoch=2-val_f1=0.97.ckpt'
# Logger
logger = TensorBoardLogger(hyperparameters["experiment_name"], name='T5' ,version=hyperparameters['version'])

# Lighting Tester
tester = pl.Trainer(gpus=1,
                     logger=logger,
                     resume_from_checkpoint=resume_from_checkpoint,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
model = T5Finetuner(hparams=hparams,train_dataloader=None,val_dataloader=None,test_dataloader=scielo_dataloader)


GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


In [28]:
tester.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[38196  1628]
 [  640 19272]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     39824
           1       0.92      0.97      0.94     19912

    accuracy                           0.96     59736
   macro avg       0.95      0.96      0.96     59736
weighted avg       0.96      0.96      0.96     59736

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.9444281093795943}
--------------------------------------------------------------------------------



-----

## Test on books dataset

In [4]:

# Bert Multilingual Tokenizer
tokenizer = port_tokenizer

max_length =  200
batch_size = 300

books_dataset = CLPDDataset(name='books',data_type='test')

books_dataset = books_dataset.get_organized_data(tokenizer=tokenizer,tokenizer_type='t5')
books_dataloader = DataLoader(books_dataset, batch_size=batch_size,
                              shuffle=False, num_workers=4)


HBox(children=(FloatProgress(value=0.0, description='Processing BOOKS TEST', max=600.0, style=ProgressStyle(de…




In [5]:
# Hiperparameters

hyperparameters = {"experiment_name":  "T5-CAPES",
                   "version": 'TEST-ON-BOOKS',
                   "optimizer": torch.optim.Adam,
                   "target_max_length": 3,
                   "max_epochs": 3,
                   "patience": 4,
                    "steplr_epochs":1,
                    "scheduling_factor": 0.95,
                    "learning_rate": 6e-5,
                    "max_length": max_length,
                    'batch_size': batch_size
                   }
#------------------------------#
#       Checkpoints            #
#------------------------------#

# Resume from checkpoint Hard coded 
resume_from_checkpoint= '/work/src/T5/logs/T5-CAPES/T5-CAPES-epoch=2-val_f1=0.97.ckpt'
# Logger
logger = TensorBoardLogger(hyperparameters["experiment_name"], name='T5' ,version=hyperparameters['version'])

# Lighting Tester
tester = pl.Trainer(gpus=1,
                     logger=logger,
                     resume_from_checkpoint=resume_from_checkpoint,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
model = T5Finetuner(hparams=hparams,train_dataloader=None,val_dataloader=None,test_dataloader=books_dataloader)


GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


In [6]:
tester.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[1472  928]
 [  35 2365]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       0.98      0.61      0.75      2400
           1       0.72      0.99      0.83      2400

    accuracy                           0.80      4800
   macro avg       0.85      0.80      0.79      4800
weighted avg       0.85      0.80      0.79      4800

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.8308448972422273}
--------------------------------------------------------------------------------

