# Bert for Cross-Language Plagiarism Detection

Author: João Phillipe Cardenuto

In this notebook we implement a model regarding the Detailed Analysis of the CLPD.

# Import Libraries

In [2]:
! pip install -q pytorch-lightning

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [None]:
# Comum libraries
import os
import random
from typing import Dict
from typing import List
import numpy as np
import pandas as pd
import re
from argparse import Namespace
from tqdm.notebook  import trange, tqdm_notebook

# Dataset
import sys
sys.path.insert(0, "/work/src/DataloaderCLPD/")
from LoadDataset import *

# Torch
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

# Sklearn
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Tersorflow
import tensorboard
%load_ext tensorboard


# stopWords
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

from string import punctuation

# Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# Setup seeds
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)
if torch.cuda.is_available(): 
   dev = "cuda:0"
else: 
   dev = "cpu" 

device = torch.device(dev)
print("Device",dev)

if "cuda" in dev:
    print("GPU: ", torch.cuda.get_device_name(0))



## Loading Data

Using LoadDataset module to load capes dataset

In [2]:
from torchtext.vocab import GloVe
# glove_vectors is a pytorch vocabulary
# https://github.com/pytorch/text/blob/master/torchtext/vocab.py
glove_vectors = GloVe(name='42B', dim=300, cache='./glove_dir')

glove_dict = glove_vectors.stoi
stopwords_eng = set(stopwords.words('english') + list(punctuation))
stopwords_pt =  set(stopwords.words('portuguese') + list(punctuation))

In [24]:

def generate_batch(batch):
    """
    # REF https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
    """
    label = [entry[2] for entry in batch]
    pairs = [entry[3] for entry in batch]
    sent1_embed = [entry[0] for entry in batch]
    sent2_embed = [entry[1] for entry in batch]
    
    
    offsets_1 = [0] + [len(entry) for entry in sent1_embed ]
    offsets_2 = [0] + [len(entry) for entry in sent2_embed ]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets_1 = torch.tensor(offsets_1[:-1]).cumsum(dim=0)
    offsets_2 = torch.tensor(offsets_2[:-1]).cumsum(dim=0)
    sent1_embed = torch.cat(sent1_embed)
    sent2_embed = torch.cat(sent2_embed)
    return sent1_embed, offsets_1, sent2_embed, offsets_2, label, pairs

### Testing data

In [4]:
# We using sample size of x it represent x*(2*N_negatives + 2) = x*(ENG + PT + ENG_NEGATIVE_1 ... + ENG_NEGATIVE_N +
#                                                                            PT_NEGATIVE_1 + ... + PT_NEGATIVE_N)
train_capes, val_capes = CLPDDataset(name='capes',data_type='train',sample_size=3000,val_size=0.2,n_negatives=1).get_word2vec_pairs(
                                                            glove_dict,stopwords_eng,stopwords_pt)



Loading CAPES TRAIN: 100%|██████████| 2400/2400 [00:00<00:00, 3142.71it/s]
100%|██████████| 7200/7200 [00:00<00:00, 113435.79it/s]
Loading CAPES VALIDATION: 100%|██████████| 600/600 [00:00<00:00, 3476.95it/s]
100%|██████████| 1800/1800 [00:00<00:00, 57665.55it/s]


In [5]:
train_capes[2]

(tensor([ 3143,   393,  4729,  2654,   554, 19627,   270,   131,   744,  1012,
          3750, 19627,   270,  1145,  5641,  3750,  9914]),
 tensor([ 288036,   91089,  291234, 1599399,  312181,  328994,  182834,   26664,
          226598,  691353,    9034,    4492]),
 tensor(0),
 'PT: o objetivo do presente estudo foi avaliar os efeitos do treinamento de força e da suplementação de tributirina sobre os parâmetros da caquexia em animais inoculados com células do tumor de walker 256.\nNEGATIVE_1_ENG: besides the results confirm the influence of education on adherence to health plans, this work shows an increase of 5% in the percentage of adherence to health plans if there is a decrease of 10% in the percentage of pe')

In [6]:
# Samples
print(train_capes[2],"\n", train_capes.pairs[2])
print("Number of Samples:", len(train_capes.pairs))

(tensor([ 3143,   393,  4729,  2654,   554, 19627,   270,   131,   744,  1012,
         3750, 19627,   270,  1145,  5641,  3750,  9914]), tensor([ 288036,   91089,  291234, 1599399,  312181,  328994,  182834,   26664,
         226598,  691353,    9034,    4492]), tensor(0), 'PT: o objetivo do presente estudo foi avaliar os efeitos do treinamento de força e da suplementação de tributirina sobre os parâmetros da caquexia em animais inoculados com células do tumor de walker 256.\nNEGATIVE_1_ENG: besides the results confirm the influence of education on adherence to health plans, this work shows an increase of 5% in the percentage of adherence to health plans if there is a decrease of 10% in the percentage of pe') 
 PT: o objetivo do presente estudo foi avaliar os efeitos do treinamento de força e da suplementação de tributirina sobre os parâmetros da caquexia em animais inoculados com células do tumor de walker 256.
NEGATIVE_1_ENG: besides the results confirm the influence of education on

# Model

In [7]:
class ModelGloVe(torch.nn.Module):

    def __init__(self,embed_dim, hidden_units1,hidden_units2,
                 embed_vector):
        """
        A CLPD baseline using word2vec approach
        """

        super(ModelGloVe, self).__init__()
        # Cria um EmbeddingBag que fara a soma de cada Bag usando os embedding
        # pretreinadso
        self.embedding = torch.nn.EmbeddingBag.from_pretrained(embed_vector,
                                                               freeze=True)
        self.layer1 = torch.nn.Linear(in_features=2*embed_dim, out_features=hidden_units1)   
        self.layer2 = torch.nn.Linear(in_features=hidden_units1, out_features=hidden_units2)
        self.layer3 = torch.nn.Linear(in_features=hidden_units2, out_features=2)
        self.device = "cpu"
    

    def to (self,device):
        """
        Insert all mode to device
        """
        self.device = device
        self = super(ModelGloVe,self).to(device)
        
        return self


    def forward(self, sent1, offset1, sent2, offset2 ):
                
        embedded_1 = self.embedding(sent1,offset1)
        embedded_2 = self.embedding(sent2,offset2)
        embedd_pair = torch.cat((embedded_1, embedded_2), axis=1)
        x = self.layer1(embedd_pair)
        x = F.relu(x)
        x = self.layer2(x)
        x = F.relu(x)
        x = self.layer3(x)
        
        return x


    

Doc BertModel:
https://huggingface.co/transformers/model_doc/bert.html#bertmodel

In [8]:
class GloveCLPD(pl.LightningModule):

    def __init__(self, hparams=None,train_dataloader=None,val_dataloader=None,test_dataloader=None):
        
        super(GloveCLPD, self).__init__()
        
        #Hiperparameters
        if hparams:
            self.hparams = hparams
             # Learnning Rate and Loss Function
            self.learning_rate = hparams.learning_rate
            self.lossfunc = torch.nn.CrossEntropyLoss()
            # Optimizer
            self.optimizer = eval(self.hparams.optimizer)

            # Glove Model
            self.model = ModelGloVe(glove_vectors.vectors.shape[1],
                        hparams.HIDDEN_UNITS_1,
                        hparams.HIDDEN_UNITS_2,
                        glove_vectors.vectors)

        # Dataloaders
        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader
       


    def forward(self, sent1, offsets_1, sent2, offsets_2):
       
        # If labels are None, It will return a loss and a logit
        # Else it return the predicted logits for each sentence
        return self.model(sent1, offsets_1, sent2, offsets_2)

    def training_step(self, batch, batch_nb):
        # batch
        sent1, offsets_1, sent2, offsets_2, labels,_ = batch
         
        # fwd
        y_hat = self(sent1.to(device), offsets_1.to(device), sent2.to(device), offsets_2.to(device))
        
        # loss
        loss = self.lossfunc(y_hat, labels.to(device)) # Using loss from the model
        
        # logs
        tensorboard_logs = {'train_loss': loss.item()}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_nb):
        # batch
        sent1, offsets_1, sent2, offsets_2, labels, _ = batch
         
        # fwd
        y_hat = self(sent1.to(device), offsets_1.to(device), sent2.to(device), offsets_2.to(device))
        
        # loss
        loss = self.lossfunc(y_hat, labels.to(device)) # Using loss from the model
        
        # F1 -score
        _, y_hat = torch.max(y_hat, dim=1)
        val_f1 = f1_score(y_pred=y_hat.cpu(), y_true=labels.cpu())
        val_f1 = torch.tensor(val_f1)
        
        return {'val_loss': loss, 'val_f1': val_f1}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_val_f1 = torch.stack([x['val_f1'] for x in outputs]).mean()

        tensorboard_logs = {'val_loss': avg_loss, 'val_f1': avg_val_f1}
        
        return {'val_loss': avg_loss.item(), 'val_f1': avg_val_f1.item(), 
                'progress_bar': tensorboard_logs, "log": tensorboard_logs}

    def test_step(self, batch, batch_nb):
        # batch
        sent1, offsets_1, sent2, offsets_2, labels, pairs = batch
         
        # fwd
        y_hat = self(sent1.to(device), offsets_1.to(device), sent2.to(device), offsets_2.to(device))
        _, y_hat = torch.max(y_hat, dim=1)
        

        return {'pairs': pairs, 'y_true': labels.cpu(), 'y_pred':y_hat.cpu() }

    def test_epoch_end(self, outputs):
        
        
        pairs = [pair for x in outputs for pair in x['pairs']]
        y_true = np.array([ y.item() for x in outputs for y in x['y_true'] ])
        y_pred = np.array([ y.item() for x in outputs for y in x['y_pred'] ])
        
        # Write failure on file
        with open (f"{self.log_path}/FAILURE_TESTSET_{self.testset_name}.txt", 'w') as file:
               for index,pair in enumerate(pairs):
                    if y_true[index] != y_pred[index]:
                        file.write("="*50+f"\n[Y_TRUE={y_true[index]} != Y_PRED={y_pred[index]}]\n"+pair \
                                  +'\n'+"="*50+'\n')
                        
        with open (f"{self.log_path}/METRICS_TESTSET_{self.testset_name}.txt", 'w') as file:
                file.write("="*50+"\n"+
                           "\t\t"+self.testset_name.upper()+"\n"+
                           "="*50+"\n\n\n"+
                           "-"*50+"\n"+
                           "CONFUSION MATRIX:\n"+
                           f'{confusion_matrix(y_true=y_true, y_pred=y_pred)}\n\n'+
                           "-"*50+"\n"+
                           "SKLEARN REPORT:\n"+
                           f'{classification_report(y_true=y_true, y_pred=y_pred)}\n\n'+
                           "-"*50+"\n"+
                           f"F1-SCORE: {f1_score(y_pred=y_pred, y_true=y_true)}\n\n"+
                           "="*50+"\n")
                           
        
        print("CONFUSION MATRIX:")
        print(confusion_matrix(y_true=y_true, y_pred=y_pred))
        
        print("SKLEARN  REPORT")
        print(classification_report(y_true=y_true, y_pred=y_pred))
        
        
        test_f1 =  f1_score(y_pred=y_pred, y_true=y_true)
    
        tensorboard_logs = {'test_f1': test_f1}
        return {'test_f1': test_f1, 'log': tensorboard_logs,
                 'progress_bar': tensorboard_logs}
    
    def configure_optimizers(self):

        optimizer =  self.optimizer(
            [p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate)
        
        scheduler = StepLR(optimizer, step_size=self.hparams.steplr_epochs, gamma=self.hparams.scheduling_factor)

        return [optimizer], [scheduler]

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

In [25]:
def get_all_dataloaders(train_dataset_name,
                        max_length,
                        val_size,
                        sample_size,
                        n_negatives,
                        batch_size,
                        ):
     
    train_clpd = CLPDDataset(name=train_dataset_name,
                            data_type='train',
                            sample_size=sample_size,
                            val_size=val_size,
                            max_length= max_length,
                            n_negatives=n_negatives)
    
    trainset , valset = train_clpd.get_word2vec_pairs(glove_dict,stopwords_eng,stopwords_pt)
    
    capes_testset = CLPDDataset(name='capes', data_type='test', max_length= max_length).get_word2vec_pairs(glove_dict,stopwords_eng,stopwords_pt)
    
    scielo_testset = CLPDDataset(name='scielo', data_type='test', max_length= max_length).get_word2vec_pairs(glove_dict,stopwords_eng,stopwords_pt)
    
    books_testset = CLPDDataset(name='books', data_type='test', max_length= max_length).get_word2vec_pairs(glove_dict,stopwords_eng,stopwords_pt)
    
    train_dataloader = DataLoader(trainset, batch_size=batch_size,
                                  shuffle=True, num_workers=4, collate_fn=generate_batch)
    
    val_dataloader = DataLoader(valset, batch_size=batch_size,
                                  shuffle=False, num_workers=4,collate_fn=generate_batch)
    
    capes_dataloader = DataLoader(capes_testset, batch_size=batch_size,
                                  shuffle=False, num_workers=4, collate_fn=generate_batch)
    
    scielo_dataloader = DataLoader(scielo_testset, batch_size=batch_size,
                                  shuffle=False, num_workers=4, collate_fn=generate_batch)
    
    books_dataloader = DataLoader(books_testset, batch_size=batch_size,
                                  shuffle=False, num_workers=4, collate_fn=generate_batch)
    
    
    return train_dataloader, val_dataloader , capes_dataloader , scielo_dataloader, books_dataloader


In [10]:
train_clpd = CLPDDataset(name='capes',
                            data_type='train',
                            sample_size=3000,
                            val_size=0.2,
                            max_length= 200,
                            n_negatives=1)
    
trainset , valset = train_clpd.get_word2vec_pairs(glove_dict,stopwords_eng,stopwords_pt)

Loading CAPES TRAIN: 100%|██████████| 2400/2400 [00:00<00:00, 2558.72it/s]
100%|██████████| 7200/7200 [00:00<00:00, 93892.40it/s]
Loading CAPES VALIDATION: 100%|██████████| 600/600 [00:00<00:00, 2771.49it/s]
100%|██████████| 1800/1800 [00:00<00:00, 72574.88it/s]


In [14]:
generate_batch( trainset[0:10])


(tensor([   3667,     656,    5858,    1122,    2035,     548,   22794, 1481177,
            5160,  159880,    1836,   68019,    9034,    2130,    4492,    9474,
            3667,     656,    5858,    1122,    2035,     548,   22794, 1481177,
            5160,  159880,    1836,   68019,    9034,    2130,    4492,    9474,
            3143,     393,    4729,    2654,     554,   19627,     270,     131,
             744,    1012,    3750,   19627,     270,    1145,    5641,    3750,
            9914,     131,    1299,     800,     962,    1347,     656,    4896,
            1228,    4686,  253728,     131,   55498,  762979,     388,  154725,
           34413,     473,    4686,   11487,     131,    1299,     800,     962,
            1347,     656,    4896,    1228,    4686,  253728,     131,   55498,
          762979,     388,  154725,   34413,     473,    4686,   11487,    3863,
            3072,     157,    8399,   23712,    2275,   11504,    2916,   23712,
           11504,    8989,  

In [63]:
for i in eng: print( len(i))

16
16


In [19]:
# sent1_embed, offsets_1, sent2_embed, offsets_2, labels, _ = a
# modelGlove(sent1_embed, offsets_1, sent2_embed, offsets_2)

# Fast test
HIDDEN_UNITS_1 = 256
HIDDEN_UNITS_2 = 64
model = ModelGloVe(glove_vectors.vectors.shape[1],
                        HIDDEN_UNITS_1,HIDDEN_UNITS_2,
                        glove_vectors.vectors)

In [45]:
capes_testset = CLPDDataset(name='capes', data_type='test', max_length= 200).get_word2vec_pairs(glove_dict,stopwords_eng,stopwords_pt)


Loading CAPES TEST: 100%|██████████| 7903/7903 [00:04<00:00, 1805.77it/s]


In [50]:
for i in capes_testset[0:2][0]:
    print(i.shape)

torch.Size([24])
torch.Size([24])


In [27]:
next(iter(val_loader))

(tensor([   1048,    2977,    2717,    4146,   13974,    4383,     401,    1921,
             449,    7098,  244555,   10928,     401,    1101,  411607,    1048,
            2977,    2717,    4146,   13974,    4383,     401,    1921,     449,
            7098,  244555,   10928,     401,    1101,  411607,    3773,     397,
             656,    6132,    2171,   13974,    5191,   19465,     655,     184,
             194,    6898,    1339,   73331,    4055,   18140,     856,   35867,
          497847,    1144,     425,   20888,     289,    2496,     376,  359469,
            5663,    1752,    3189,     863,     238,      50,     548,    1584,
          683489,    1948,    1559,    3248,     239,     554,     427,    5836,
            4025,    2990, 1405608,  359469,    5663,    1752,    3189,     863,
             238,      50,     548,    1584,  683489,    1948,    1559,    3248,
             239,     554,     427,    5836,    4025,    2990, 1405608,   22951,
             313,    6017,  

In [26]:
hyperparameters = {
                    "experiment_name": "CAPES-GloVe", 
                    "max_epochs": 2,
                    "optimizer": 'torch.optim.Adam',
                    "patience": 1,
                    "steplr_epochs":1,
                    "scheduling_factor": 0.9,
                    "learning_rate": 1e-5,
                    "max_length":200,
                    "batch_size":10,
                    "HIDDEN_UNITS_1": 512,
                    "HIDDEN_UNITS_2": 64,
                    'gpu': 0,
                    'trainset': 'capes',
                    'trainset_len': 1000,
                    'val_size': 0.2,
                    'freeze': True
                   }

n_negatives = hyperparameters['n_negatives'] = 1
experiment_name = hyperparameters['experiment_name'].replace("/",'_')
hyperparameters['experiment_name'] = f'{experiment_name}_GLOVE_N_{n_negatives}'


train_loader, val_loader , capes_loader , scielo_loader, books_loader = get_all_dataloaders(
                                                                    train_dataset_name=hyperparameters['trainset'] ,
                                                                    max_length=hyperparameters['max_length'],
                                                                    val_size=hyperparameters['val_size'],
                                                                    sample_size=hyperparameters['trainset_len'],
                                                                    n_negatives=hyperparameters['n_negatives'],
                                                                    batch_size=hyperparameters['batch_size'])
hparams = Namespace(**hyperparameters)
model = GloveCLPD(hparams=hparams,train_dataloader=train_loader,val_dataloader=val_loader, test_dataloader=None)


Loading CAPES TRAIN: 100%|██████████| 800/800 [00:00<00:00, 2450.29it/s]
100%|██████████| 2400/2400 [00:00<00:00, 59200.47it/s]
Loading CAPES VALIDATION: 100%|██████████| 200/200 [00:00<00:00, 2523.60it/s]
100%|██████████| 600/600 [00:00<00:00, 52552.52it/s]
Loading CAPES TEST: 100%|██████████| 7903/7903 [00:03<00:00, 2231.67it/s]
100%|██████████| 39515/39515 [00:00<00:00, 115675.52it/s]
Loading SCIELO TEST: 100%|██████████| 9956/9956 [00:05<00:00, 1952.16it/s]
100%|██████████| 49780/49780 [00:00<00:00, 121353.43it/s]
Processing BOOKS TEST: 100%|██████████| 600/600 [00:00<00:00, 1333.18it/s]
100%|██████████| 4200/4200 [00:00<00:00, 78124.92it/s]


##  Number of Parameter 

In [39]:
sum([torch.tensor(x.size()).prod() for x in model.parameters() if x.requires_grad]) # trainable parameters

tensor(340674)

## Fast dev run

In [28]:
trainer = pl.Trainer( gpus=None,
                     logger=False,
                     checkpoint_callback=False,  # Disable checkpoint saving.
                     fast_dev_run=True,
                     amp_level='O2', use_amp=False
)
trainer.fit(model)
trainer.test(model)
del model 

Running in fast_dev_run mode: will run a full train, val and test loop using a single batch
GPU available: True, used: False
No environment variable for node rank defined. Set as 0.

  | Name            | Type             | Params
-------------------------------------------------
0 | lossfunc        | CrossEntropyLoss | 0     
1 | model           | ModelGloVe       | 575 M 
2 | model.embedding | EmbeddingBag     | 575 M 
3 | model.layer1    | Linear           | 307 K 
4 | model.layer2    | Linear           | 32 K  
5 | model.layer3    | Linear           | 130   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

RuntimeError: Tensor for argument #1 'weight' is on CPU, but expected it to be on GPU (while checking arguments for embedding_bag_cuda)

In [None]:
hyperparameters = {
                    "experiment_name": "CAPES-GloVe", 
                    "max_epochs": 2,
                    "optimizer": 'torch.optim.Adam',
                    "patience": 1,
                    "steplr_epochs":1,
                    "scheduling_factor": 0.9,
                    "learning_rate": 1e-5,
                    "max_length":200,
                    "batch_size":500,
                    "HIDDEN_UNITS_1": 512,
                    "HIDDEN_UNITS_2": 64,
                    'gpu': 0,
                    'trainset': 'capes',
                    'trainset_len': 200000,
                    'val_size': 0.2,
                    'freeze': True
                   }

# N_negative First arg
# BertModel Second arg
n_negatives = int(sys.argv[1])
bert_model = sys.argv[2]
hyperparameters['model'] = bert_model
hyperparameters['n_negatives'] = n_negatives
# Bert  Tokenizer
tokenizer = BertTokenizerFast.from_pretrained(bert_model)


train_loader, val_loader , capes_loader , scielo_loader, books_loader = get_all_dataloaders(
                                                                    train_dataset_name=hyperparameters['trainset'] ,
                                                                    max_length=hyperparameters['max_length'],
                                                                    val_size=hyperparameters['val_size'],
                                                                    sample_size=hyperparameters['trainset_len'],
                                                                    n_negatives=hyperparameters['n_negatives'],
                                                                    batch_size=hyperparameters['batch_size'],
                                                                    tokenizer=tokenizer)

experiment_name = hyperparameters['experiment_name'].replace("/",'_')
hyperparameters['experiment_name'] = f'{experiment_name}_{bert_model}_N_{n_negatives}'


In [20]:
------------------------------#
#       Checkpoints / LOG      #
#------------------------------#

log_path = 'logs'
ckpt_path = os.path.join(log_path, hyperparameters["experiment_name"], "-{val_loss:.2f}")  
checkpoint_callback = ModelCheckpoint(prefix="checkpoint",  # prefixo para nome do checkpoint
                                      filepath=ckpt_path,  # path onde será salvo o checkpoint
                                      monitor="val_loss", 
                                      mode="min",
                                      save_top_k=1)   
# Hard coded
logger_path = os.path.join(log_path, hyperparameters["experiment_name"])
logger = TensorBoardLogger(logger_path,name='Tensorboard_logger')

# Lighting Trainer
trainer = pl.Trainer(gpus=[hyperparameters['gpu']],
                     logger=logger,
                     max_epochs=hyperparameters["max_epochs"],
                     check_val_every_n_epoch=1,
                     accumulate_grad_batches=2,
                     checkpoint_callback=checkpoint_callback,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
model = BertFinetuner(hparams=hparams,train_dataloader=train_loader,val_dataloader=val_loader, test_dataloader=None)

# Train
trainer.fit(model)




In [21]:
#------------------------------#
#            TEST              #
#------------------------------#

# Get Checkpoints path
checkpoint = glob(f'{trainer.weights_save_path}/checkpoint*')
checkpoint.sort()
checkpoint = checkpoint[0]

model.log_path = trainer.weights_save_path

# Books
model.testset_name = 'books'
tester_books =  pl.Trainer(gpus=[hyperparameters['gpu']],amp_level='O2', use_amp=False)
tester_books.test(model=model,test_dataloaders=books_loader)

# CAPES
model.testset_name = 'capes'
tester_capes =  pl.Trainer(gpus=[hyperparameters['gpu']],amp_level='O2', use_amp=False)
tester_capes.test(model=model,test_dataloaders=capes_loader)

# Scielo
model.testset_name = 'scielo'
tester_scielo =  pl.Trainer(gpus=[hyperparameters['gpu']],amp_level='O2', use_amp=False)
tester_scielo.test(model=model,test_dataloaders=scielo_loader)


## Overfit on a Batch

We notice that easily the model can overfit on a batch

In [24]:
hyperparameters = {"experiment_name": "BertCLPD", 
                   "optimizer": torch.optim.Adam,
                   "max_epochs": 5,
                   "patience": 4,
                    "steplr_epochs":5,
                    "scheduling_factor": 0.95,
                    "learning_rate": 6e-5,
                    "max_length": 400
                   }
trainer = pl.Trainer(gpus=1,
                     logger=False,
                     max_epochs=hyperparameters['max_epochs'],
                     check_val_every_n_epoch=5,
                     checkpoint_callback=False,  # Disable checkpoint saving
                     overfit_pct=0.5,
                     amp_level='O2', use_amp=False)

model = BertFinetuner(hparams=Namespace(**hyperparameters),
                      train_dataloader=val_dataloader,
                      val_dataloader=val_dataloader,
                      test_dataloader=val_dataloader)

trainer.fit(model)
trainer.test(model)


del model  
del trainer

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                   | Type                          | Params
-----------------------------------------------------------------------------------------------------
0   | lossfunc                                               | CrossEntropyLoss              | 0     
1   | model                                                  | BertForSequenceClassification | 177 M 
2   | model.bert                                             | BertModel                     | 177 M 
3   | model.bert.embeddings                                  | BertEmbeddings                | 92 M  
4   | model.bert.embeddings.word_embeddings                  | Embedding                     | 91 M  
5   | model.bert.embeddings.position_embeddings              | Embedding                     | 393 K 
6   | model.bert.embeddings.token_type_embeddings            | Embed

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Detected KeyboardInterrupt, attempting graceful shutdown...





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_f1': tensor(1., dtype=torch.float64)}
--------------------------------------------------------------------------------



## Training

In [7]:
# Training will perform a cross-dataset.
# Training on Capes testing on SciElo


max_length =  200
capes_dataset = CLPDDataset(name='capes',data_type='train',sample_size=40000,val_size=0.2,max_length=200,n_negatives=1)
capes_test = CLPDDataset(name='capes',data_type='test',n_negatives=1,max_length=200)

# Bert Multilingual Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# Traning data 
train_capes, val_capes = capes_dataset.get_organized_data(tokenizer=tokenizer)
test_capes = capes_test.get_organized_data(tokenizer=tokenizer)


HBox(children=(FloatProgress(value=0.0, description='Processing CAPES TRAIN', max=32000.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='Processing CAPES VALIDATION', max=8000.0, style=ProgressS…




HBox(children=(FloatProgress(value=0.0, description='Processing CAPES TEST', max=7903.0, style=ProgressStyle(d…




In [5]:
len(test_capes)

47418

In [8]:
#------tester-----------#
#   DataLoaders     #
#-------------------#

batch_size = 128

train_dataloader = DataLoader(train_capes, batch_size=batch_size,
                              shuffle=True, num_workers=4)

val_dataloader = DataLoader(val_capes, batch_size=batch_size, shuffle=False, 
                            num_workers=4)

test_dataloader = DataLoader(test_capes, batch_size=batch_size,
                             shuffle=False, num_workers=4)



# Hiperparameters
hyperparameters = {"experiment_name": "BERT-CAPES", 
                   "max_epochs": 2,
                    "optimizer": torch.optim.Adam,
                    "patience": 1,
                     "steplr_epochs":1,
                    "scheduling_factor": 0.9,
                    "learning_rate": 1e-5,
                   "max_length":max_length,
                   "batch_size":batch_size,
                   }
#------------------------------#
#       Checkpoints            #
#------------------------------#

log_path = 'logs'
ckpt_path = os.path.join(log_path, hyperparameters["experiment_name"], "-{epoch}-{val_loss:.2f}")  
checkpoint_callback = ModelCheckpoint(prefix=hyperparameters["experiment_name"],  # prefixo para nome do checkpoint
                                      filepath=ckpt_path,  # path onde será salvo o checkpoint
                                      monitor="val_loss", 
                                      mode="min",
                                      save_top_k=2)   
# Hard coded
# resume_from_checkpoint = '/content/drive/My Drive/P_IA376E_2020S1/Class-8 BERT/TASK/logs/Electra-400/Electra-400-epoch=37-val_loss=0.18.ckpt'
resume_from_checkpoint= False
# Configuração do Early Stop
early_stop = EarlyStopping(monitor="val_loss",  
                           patience=hyperparameters["patience"], 
                           verbose=False, 
                           mode='min'  
                           )
logger = TensorBoardLogger(hyperparameters["experiment_name"], name="BASELINE")

# Lighting Trainer
trainer = pl.Trainer(gpus=1,
                     logger=logger,
                     max_epochs=hyperparameters["max_epochs"],
                     check_val_every_n_epoch=1,
                     accumulate_grad_batches=2,
                     checkpoint_callback=checkpoint_callback,
#                       resume_from_checkpoint=resume_from_checkpoint,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
model = BertFinetuner(hparams=hparams,train_dataloader=train_dataloader,val_dataloader=val_dataloader,test_dataloader=test_dataloader)

# Train
trainer.fit(model)

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                   | Type                          | Params
-----------------------------------------------------------------------------------------------------
0   | lossfunc                                               | CrossEntropyLoss              | 0     
1   | model                                                  | BertForSequenceClassification | 177 M 
2   | model.bert                                             | BertModel                     | 177 M 
3   | model.bert.embeddings                                  | BertEmbeddings                | 92 M  
4   | model.bert.embeddings.word_embeddings                  | Embedding                     | 91 M  
5   | model.bert.embeddings.position_embeddings              | Embedding                     | 393 K 
6   | model.bert.embeddings.token_type_embeddings            | Embed

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…






1

## Test model on Capes dataset

In [9]:
trainer.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[31074   538]
 [  109 15697]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     31612
           1       0.97      0.99      0.98     15806

    accuracy                           0.99     47418
   macro avg       0.98      0.99      0.98     47418
weighted avg       0.99      0.99      0.99     47418

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.9798071221247776}
--------------------------------------------------------------------------------



-----

# Test model on Scielo dataset 

In [10]:

# Bert Multilingual Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

max_length =  200
batch_size = 128

scielo_dataset = CLPDDataset(name='scielo',data_type='test',n_negatives=1,max_length=200)
scielo_dataset = scielo_dataset.get_organized_data(tokenizer)
scielo_dataloader = DataLoader(scielo_dataset, batch_size=batch_size,
                              shuffle=False, num_workers=4)


HBox(children=(FloatProgress(value=0.0, description='Processing SCIELO TEST', max=9956.0, style=ProgressStyle(…




In [11]:
# Hiperparameters
hyperparameters = {"experiment_name": "BERT-CAPES-TEST-ON-SCIELO", 
                   "max_epochs": 1,
                    "optimizer": torch.optim.Adam,
                    "patience": 1,
                     "steplr_epochs":1,
                    "scheduling_factor": 0.9, # argumentos de otimização
                    "learning_rate": 1e-5,
                   "max_length":max_length,
                   "batch_size":batch_size,
                   }
#------------------------------#
#       Checkpoints            #
#------------------------------#

log_path = 'logs'
ckpt_path = os.path.join(log_path, hyperparameters["experiment_name"], "-{epoch}-{val_loss:.2f}")  
# Resume from checkpoint Hard coded 
resume_from_checkpoint= '/work/src/Bert-Baseline/logs/BERT-CAPES/BERT-CAPES-epoch=1-val_loss=0.01.ckpt'
# Logger
logger = TensorBoardLogger(hyperparameters["experiment_name"], name="BASELINE")

# Lighting Tester
tester = pl.Trainer(gpus=1,
                     logger=logger,
                     resume_from_checkpoint=resume_from_checkpoint,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
model = BertFinetuner(hparams=hparams,train_dataloader=None,val_dataloader=None,test_dataloader=scielo_dataloader)


GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


In [12]:
tester.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[35937  3887]
 [   10 19902]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       1.00      0.90      0.95     39824
           1       0.84      1.00      0.91     19912

    accuracy                           0.93     59736
   macro avg       0.92      0.95      0.93     59736
weighted avg       0.95      0.93      0.94     59736

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.910825839225647}
--------------------------------------------------------------------------------



-----

## Test on books dataset

In [13]:

# Bert Multilingual Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

max_length =  200
batch_size = 300

books_dataset = CLPDDataset(name='books',data_type='test')

books_dataset = books_dataset.get_organized_data(tokenizer)
books_dataloader = DataLoader(books_dataset, batch_size=batch_size,
                              shuffle=False, num_workers=4)


HBox(children=(FloatProgress(value=0.0, description='Processing BOOKS TEST', max=600.0, style=ProgressStyle(de…




In [14]:
# Hiperparameters
hyperparameters = {"experiment_name": "BERT-CAPES-TEST-ON-BOOKS", 
                   "max_epochs": 5,
                    "optimizer": torch.optim.Adam,
                    "patience": 1,
                     "steplr_epochs":1,
                    "scheduling_factor": 0.9, # argumentos de otimização
                    "learning_rate": 1e-5,
                   "max_length":max_length,
                   "batch_size":batch_size,
                   }
#------------------------------#
#       Checkpoints            #
#------------------------------#

log_path = 'logs'
ckpt_path = os.path.join(log_path, hyperparameters["experiment_name"], "-{epoch}-{val_loss:.2f}")  
# Resume from checkpoint Hard coded 
resume_from_checkpoint= '/work/src/Bert-Baseline/logs/BERT-CAPES/BERT-CAPES-epoch=1-val_loss=0.01.ckpt'
# Logger
logger = TensorBoardLogger(hyperparameters["experiment_name"], name="BASELINE")

# Lighting Tester
tester = pl.Trainer(gpus=1,
                     logger=logger,
                     resume_from_checkpoint=resume_from_checkpoint,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
model = BertFinetuner(hparams=hparams,train_dataloader=None,val_dataloader=None,test_dataloader=books_dataloader)


GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


In [15]:
tester.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[1728  672]
 [  12 2388]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       0.99      0.72      0.83      2400
           1       0.78      0.99      0.87      2400

    accuracy                           0.86      4800
   macro avg       0.89      0.86      0.85      4800
weighted avg       0.89      0.86      0.85      4800

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.8747252747252747}
--------------------------------------------------------------------------------

