# Bert for Cross-Language Plagiarism Detection

Author: João Phillipe Cardenuto

In this notebook we implement a BERT-based model for the Detailed Analysis stage of Cross-Language Plagiarism Detection (CLPD).

# Import Libraries

In [2]:
# Install the third-party packages this notebook needs (quiet mode).
# NOTE(review): versions are not pinned; the cells below use Trainer arguments such as
# `use_amp` and `overfit_pct`, so presumably a specific pytorch-lightning release is required — confirm and pin.
! pip install -q pytorch-lightning
! pip install -q transformers

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
# Mount drive
# Only meaningful on Google Colab: exposes Google Drive under /content/drive.
# force_remount=True is passed so an existing mount is presumably refreshed — confirm in the Colab docs.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [5]:
# Common libraries
import os
import random
from typing import Dict
from typing import List
import numpy as np
import pandas as pd
import re
from argparse import Namespace
from tqdm.notebook  import trange, tqdm_notebook

# Dataset
import sys
sys.path.insert(0, "/work/src/DataloaderCLPD/")
from LoadDataset import *

# Torch
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

# Hugging Face
from transformers import BertTokenizer,BertTokenizerFast,BertForSequenceClassification
# Sklearn
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# TensorBoard
import tensorboard
%load_ext tensorboard


# Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# Fix the seed of every random number generator used in this notebook
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

# Use the first CUDA device when one is visible, otherwise fall back to the CPU
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print("Device",dev)

# Report the GPU model when running on CUDA
if device.type == "cuda":
    print("GPU: ", torch.cuda.get_device_name(0))



Device cuda:0
GPU:  Quadro RTX 5000


## Loading Data

We use the `LoadDataset` module to load the CAPES dataset.

In [4]:
# Per the original note: a sample size of x yields x*(2*N_negatives + 2) examples, i.e.
# x*(ENG + PT + ENG_NEGATIVE_1 + ... + ENG_NEGATIVE_N + PT_NEGATIVE_1 + ... + PT_NEGATIVE_N).
# NOTE(review): the CAPES test outputs further down report 47418 examples for 7903 samples
# (6 per sample with n_negatives=1), so this formula may be out of date — confirm against LoadDataset.
capes_dataset = CLPDDataset(name='capes',data_type='train',sample_size=30000,val_size=0.2,n_negatives=1)

# Bert Multilingual Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# Process the data with the tokenizer; the call returns the train and validation datasets
train_capes, val_capes = capes_dataset.get_organized_data(tokenizer=tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Processing CAPES TRAIN', max=24000.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='Processing CAPES VALIDATION', max=6000.0, style=ProgressS…




In [13]:
# Samples
# Show the first processed example next to the first entry of `train_capes.pairs`,
# then the number of entries in `pairs`.
print(train_capes[0],"\n", train_capes.pairs[0])
print("Number of Samples:", len(train_capes.pairs))

(tensor([   101,  30228,  10188,  47264,  11481,  59090,  10108,  25965,  40345,
         10114,  10105,  14166,  12720,  18583,    117,  10135,  50081,  10111,
         37241,  65921,    113,  29698,    114,  60026,  10107,    117,  25209,
         10216,  10111,  15092,  19181,    169, 110158,  10188,    169,  73636,
         11131,  16511,    113,    177,  12352,    114,  38607,  37501,  34844,
           171,  47081,  33597,  12454,  30228,  55223,    113,  42076,  10116,
           114,  10111, 106615,  10157,  10870,  10105,  18514,  10108,  14249,
         16813,  26409,    117,  10262,  34899,  73995,  11355,  19065,  10350,
         10111,  10211,  34326,  67253,  16587,  30743,  11942,  32124,  10106,
         14179,  14168,  83600,  10161,  54396,  16813,  10213,    119,    102,
         10427,  57859,  23633,  41178,  13395,    131,  25209,  10129,    173,
         67843,  10129,  10794,  20084,  10104,    171,    119,  30228,  10104,
         20142, 104092,  34778,  10425,

In [16]:
# Small shuffled loader used only to eyeball one batch (single-process loading).
dataloader_debug = DataLoader(train_capes, batch_size=10, shuffle=True, 
                              num_workers=0)

# Each batch unpacks into five items; the fifth one is ignored in this cell.
token_ids, attention_mask, token_type_ids, labels, _ = next(iter(dataloader_debug))
print('token_ids:\n', token_ids)
print('token_type_ids:\n', token_type_ids)
print('attention_mask:\n', attention_mask)
print('labels:\n', labels)

# Shapes of the same four tensors
print('token_ids.shape:', token_ids.shape)
print('token_type_ids.shape:', token_type_ids.shape)
print('attention_mask.shape:', attention_mask.shape)
print('labels.shape:', labels.shape)

token_ids:
 tensor([[   101, 103502,  77665,  ...,      0,      0,      0],
        [   101,  10271,  26406,  ...,      0,      0,      0],
        [   101,  10346,  10112,  ...,      0,      0,      0],
        ...,
        [   101,  10146,  95359,  ...,      0,      0,      0],
        [   101,  10105,  59598,  ...,      0,      0,      0],
        [   101,  10188,  10105,  ...,      0,      0,      0]])
token_type_ids:
 tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
attention_mask:
 tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
labels:
 tensor([0, 1, 1, 0, 0, 0, 1, 1, 1, 0])
token_ids.shape: torch.Size([10, 200])
token_type_ids.shape: torch.Size(

In [19]:
# Batch size for the debug loader below.
batch_size = 128
# train_dataloader = DataLoader(dataset_train, batch_size=batch_size,
#                               shuffle=True, num_workers=4)

# Only the validation loader is built here; the debug cells below pass it as the
# train, validation and test loader alike.
val_dataloader = DataLoader(val_capes, batch_size=batch_size, shuffle=False, 
                            num_workers=4)

# test_dataloader = DataLoader(dataset_test, batch_size=batch_size,
#                              shuffle=False, num_workers=4)

## BERT-Model with Pytorch Lightning

Doc BertModel:
https://huggingface.co/transformers/model_doc/bert.html#bertmodel

In [6]:
class BertFinetuner(pl.LightningModule):
    """LightningModule that fine-tunes multilingual BERT as a sequence classifier.

    The BERT embeddings and every encoder layer except the last five are
    frozen; all remaining parameters stay trainable.

    Args:
        hparams: Namespace-like object. This class reads `learning_rate`,
            `optimizer` (an optimizer class), `steplr_epochs`,
            `scheduling_factor` and `experiment_name` from it.
        train_dataloader: loader returned by `train_dataloader()`.
        val_dataloader: loader returned by `val_dataloader()`.
        test_dataloader: loader returned by `test_dataloader()`.

    Every batch is expected to unpack into
    `(input_ids, attention_mask, token_type_ids, label, pairs)`.
    """

    def __init__(self, hparams,train_dataloader,val_dataloader,test_dataloader):

        super(BertFinetuner, self).__init__()

        # Hyperparameters
        self.hparams = hparams

        # Dataloaders
        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader
        self._test_dataloader = test_dataloader
        # Learning rate and loss function
        # (the steps below use the loss returned by the model; `lossfunc` is kept as an attribute only)
        self.learning_rate = hparams.learning_rate
        self.lossfunc = torch.nn.CrossEntropyLoss()
        # Optimizer class, instantiated in `configure_optimizers`
        self.optimizer = self.hparams.optimizer

        # Retrieve model from Huggingface (relies on the notebook-level `device`)
        self.model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased').to(device)

        # Freeze bert embeddings
        for param in self.model.bert.embeddings.parameters():
            param.requires_grad = False
        # Freeze bert attention encoders, but release the last five ones
        for layer in self.model.bert.encoder.layer[:-5]:
            for param in layer.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids,labels=None):
        """Run the wrapped Hugging Face model.

        When `labels` is given, the output holds the loss first and the logits
        second; when `labels` is None, the logits come first (this is how the
        step methods below consume the result).
        """
        return self.model(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          labels=labels)

    def training_step(self, batch, batch_nb):
        """Compute the training loss for one batch and log it as `train_loss`."""
        input_ids, attention_mask, token_type_ids, label,_ = batch

        # Index the output instead of tuple-unpacking it, consistently with `test_step`
        outputs = self(input_ids.to(device), attention_mask.to(device), token_type_ids.to(device),label.to(device))
        loss = outputs[0]

        tensorboard_logs = {'train_loss': loss.item()}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_nb):
        """Return the batch loss together with true and predicted labels.

        The labels are returned (on CPU) instead of a per-batch F1 so that
        `validation_epoch_end` can compute the F1 over the whole validation set.
        """
        input_ids, attention_mask, token_type_ids, label,_ = batch

        outputs = self(input_ids.to(device), attention_mask.to(device), token_type_ids.to(device),label.to(device))
        loss, logits = outputs[0], outputs[1]

        # Predicted class = index of the largest logit
        y_pred = torch.argmax(logits, dim=1)

        return {'val_loss': loss, 'y_true': label.cpu(), 'y_pred': y_pred.cpu()}

    def validation_epoch_end(self, outputs):
        """Aggregate validation batches into the mean loss and the epoch-level F1.

        F1 is computed once on all predictions of the epoch: the mean of
        per-batch F1 scores is not the F1 of the dataset. The metric is still
        reported under the key `avg_val_f1`.
        """
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()

        y_true = torch.cat([x['y_true'] for x in outputs]).numpy()
        y_pred = torch.cat([x['y_pred'] for x in outputs]).numpy()
        val_f1 = float(f1_score(y_pred=y_pred, y_true=y_true))

        tensorboard_logs = {'val_loss': avg_loss.item(), 'avg_val_f1': val_f1}

        return {'avg_val_loss': avg_loss.item(), 'avg_val_f1': val_f1,
                'progress_bar': tensorboard_logs, "log": tensorboard_logs}

    def test_step(self, batch, batch_nb):
        """Predict one test batch; returns the text pairs with true/predicted labels."""
        input_ids, attention_mask, token_type_ids, label, pairs = batch

        # No labels are passed, so the first output element holds the logits
        logits = self(input_ids.to(device), attention_mask.to(device), token_type_ids.to(device))[0]
        y_hat = torch.argmax(logits, dim=1)

        return {'pairs': pairs, 'y_true': label.cpu(), 'y_pred':y_hat.cpu() }

    def test_epoch_end(self, outputs):
        """Write misclassified pairs to a file, print reports and return the test F1.

        The failures go to `FAILURE_<experiment_name>.txt` in the working directory.
        """
        pairs = [pair for x in outputs for pair in x['pairs']]
        y_true = torch.cat([x['y_true'] for x in outputs]).numpy()
        y_pred = torch.cat([x['y_pred'] for x in outputs]).numpy()

        # Write failures to file; explicit UTF-8 so accented text is written
        # the same way regardless of the platform's default encoding
        with open(f"FAILURE_{self.hparams.experiment_name}.txt", 'w', encoding='utf-8') as file:
            for pair, truth, pred in zip(pairs, y_true, y_pred):
                if truth != pred:
                    file.write("="*50+f"\n[Y_TRUE={truth} != Y_PRED={pred}]\n"+pair
                               +'\n'+"="*50+'\n')

        print("CONFUSION MATRIX:")
        print(confusion_matrix(y_true=y_true, y_pred=y_pred))

        print("SKLEARN  REPORT")
        print(classification_report(y_true=y_true, y_pred=y_pred))

        test_f1 =  f1_score(y_pred=y_pred, y_true=y_true)

        tensorboard_logs = {'test_f1': test_f1}
        return {'test_f1': test_f1, 'log': tensorboard_logs,
                 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        """Build the optimizer over trainable parameters plus a StepLR scheduler."""
        optimizer =  self.optimizer(
            [p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate)

        scheduler = StepLR(optimizer, step_size=self.hparams.steplr_epochs, gamma=self.hparams.scheduling_factor)

        return [optimizer], [scheduler]

    def train_dataloader(self):
        """Return the training loader given to the constructor."""
        return self._train_dataloader

    def val_dataloader(self):
        """Return the validation loader given to the constructor."""
        return self._val_dataloader

    def test_dataloader(self):
        """Return the test loader given to the constructor."""
        return self._test_dataloader

In [20]:

# Hyperparameters for the debug run.
# BertFinetuner reads: experiment_name, optimizer, learning_rate, steplr_epochs, scheduling_factor.
# max_epochs, patience and max_length are not read by BertFinetuner itself.
hyperparameters = {"experiment_name": "BertCLPD-DEBUG", 
                   "max_epochs": 20,
                   "patience": 4,
                   "optimizer": torch.optim.Adam,
                   "max_length": 400,
                   "scheduling_factor": 0.8, 
                   "learning_rate": 1e-5, 
                   "steplr_epochs":4,
                  }

In [21]:
# Debug instance: the validation loader is deliberately reused for all three splits.
model = BertFinetuner(hparams=Namespace(**hyperparameters),
                      train_dataloader=val_dataloader,
                      val_dataloader=val_dataloader,
                      test_dataloader=val_dataloader)

## Number of Trainable Parameters

In [22]:
# Trainable parameters: element count of every tensor that still requires gradients, summed
trainable_sizes = [torch.tensor(weights.size()).prod()
                   for weights in model.parameters()
                   if weights.requires_grad]
sum(trainable_sizes)

tensor(36031490)

## Fast dev run

In [23]:
# Smoke test: fast_dev_run exercises the train, validation and test loops on a single batch
# (see the "Running in fast_dev_run mode" message in the output), with no logger and no checkpoints.
trainer = pl.Trainer(gpus=1, 
                     logger=False,
                     checkpoint_callback=False,  # Disable checkpoint saving.
                     fast_dev_run=True,
                     amp_level='O2', use_amp=False
)
trainer.fit(model)
trainer.test(model)
# Drop the debug model so the next cell starts from a fresh instance
del model 

Running in fast_dev_run mode: will run a full train, val and test loop using a single batch
GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                   | Type                          | Params
-----------------------------------------------------------------------------------------------------
0   | lossfunc                                               | CrossEntropyLoss              | 0     
1   | model                                                  | BertForSequenceClassification | 177 M 
2   | model.bert                                             | BertModel                     | 177 M 
3   | model.bert.embeddings                                  | BertEmbeddings                | 92 M  
4   | model.bert.embeddings.word_embeddings                  | Embedding                     | 91 M  
5   | model.bert.embeddings.position_embeddings              | Embedding      

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_f1': tensor(0.9310, dtype=torch.float64)}
--------------------------------------------------------------------------------



## Overfit on a Batch

We notice that the model can easily overfit on a batch.

In [24]:
# Sanity check: confirm the model can overfit a reduced amount of data.
hyperparameters = {"experiment_name": "BertCLPD", 
                   "optimizer": torch.optim.Adam,
                   "max_epochs": 5,
                   "patience": 4,
                    "steplr_epochs":5,
                    "scheduling_factor": 0.95,
                    "learning_rate": 6e-5,
                    "max_length": 400
                   }
# No logger and no checkpoints; validation is checked every 5 epochs, i.e. once for max_epochs=5.
# NOTE(review): `overfit_pct=0.5` presumably restricts the run to a fraction of the data — confirm in the Trainer docs.
trainer = pl.Trainer(gpus=1,
                     logger=False,
                     max_epochs=hyperparameters['max_epochs'],
                     check_val_every_n_epoch=5,
                     checkpoint_callback=False,  # Disable checkpoint saving
                     overfit_pct=0.5,
                     amp_level='O2', use_amp=False)

# The validation loader is reused for train, validation and test on purpose.
model = BertFinetuner(hparams=Namespace(**hyperparameters),
                      train_dataloader=val_dataloader,
                      val_dataloader=val_dataloader,
                      test_dataloader=val_dataloader)

trainer.fit(model)
trainer.test(model)


# Remove the debug objects before the real training run
del model  
del trainer

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                   | Type                          | Params
-----------------------------------------------------------------------------------------------------
0   | lossfunc                                               | CrossEntropyLoss              | 0     
1   | model                                                  | BertForSequenceClassification | 177 M 
2   | model.bert                                             | BertModel                     | 177 M 
3   | model.bert.embeddings                                  | BertEmbeddings                | 92 M  
4   | model.bert.embeddings.word_embeddings                  | Embedding                     | 91 M  
5   | model.bert.embeddings.position_embeddings              | Embedding                     | 393 K 
6   | model.bert.embeddings.token_type_embeddings            | Embed

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Detected KeyboardInterrupt, attempting graceful shutdown...





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_f1': tensor(1., dtype=torch.float64)}
--------------------------------------------------------------------------------



## Training

In [7]:
# Cross-dataset experiment: the model is trained on CAPES and tested on SciELO.


# Single source of truth for the sequence length used by both CAPES datasets
max_length =  200
capes_dataset = CLPDDataset(name='capes',
                            data_type='train',
                            sample_size=40000,
                            val_size=0.2,
                            max_length=max_length,
                            n_negatives=1)
capes_test = CLPDDataset(name='capes',
                         data_type='test',
                         n_negatives=1,
                         max_length=max_length)

# Bert Multilingual Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# Training data: train/validation datasets come from the train split, plus the CAPES test dataset
train_capes, val_capes = capes_dataset.get_organized_data(tokenizer=tokenizer)
test_capes = capes_test.get_organized_data(tokenizer=tokenizer)


HBox(children=(FloatProgress(value=0.0, description='Processing CAPES TRAIN', max=32000.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='Processing CAPES VALIDATION', max=8000.0, style=ProgressS…




HBox(children=(FloatProgress(value=0.0, description='Processing CAPES TEST', max=7903.0, style=ProgressStyle(d…




In [5]:
# Number of examples in the CAPES test dataset
len(test_capes)

47418

In [8]:
#-------------------#
#   DataLoaders     #
#-------------------#

batch_size = 128

# Only the training loader is shuffled; validation and test keep dataset order
train_dataloader = DataLoader(train_capes, batch_size=batch_size,
                              shuffle=True, num_workers=4)

val_dataloader = DataLoader(val_capes, batch_size=batch_size, shuffle=False, 
                            num_workers=4)

test_dataloader = DataLoader(test_capes, batch_size=batch_size,
                             shuffle=False, num_workers=4)



# Hyperparameters
hyperparameters = {"experiment_name": "BERT-CAPES", 
                   "max_epochs": 2,
                    "optimizer": torch.optim.Adam,
                    "patience": 1,
                     "steplr_epochs":1,
                    "scheduling_factor": 0.9,
                    "learning_rate": 1e-5,
                   "max_length":max_length,
                   "batch_size":batch_size,
                   }
#------------------------------#
#       Checkpoints            #
#------------------------------#

log_path = 'logs'
ckpt_path = os.path.join(log_path, hyperparameters["experiment_name"], "-{epoch}-{val_loss:.2f}")  
checkpoint_callback = ModelCheckpoint(prefix=hyperparameters["experiment_name"],  # prefix for the checkpoint name
                                      filepath=ckpt_path,  # path where the checkpoint is saved
                                      monitor="val_loss", 
                                      mode="min",
                                      save_top_k=2)   
# Hard coded
# resume_from_checkpoint = '/content/drive/My Drive/P_IA376E_2020S1/Class-8 BERT/TASK/logs/Electra-400/Electra-400-epoch=37-val_loss=0.18.ckpt'
# Not passed to the Trainer below: training starts from the pretrained weights
resume_from_checkpoint= False
# Early stopping configuration
early_stop = EarlyStopping(monitor="val_loss",  
                           patience=hyperparameters["patience"], 
                           verbose=False, 
                           mode='min'  
                           )
logger = TensorBoardLogger(hyperparameters["experiment_name"], name="BASELINE")

# Lightning Trainer
# `early_stop` was previously created but never handed to the Trainer, so the
# `patience` hyperparameter had no effect; it is now wired in.
trainer = pl.Trainer(gpus=1,
                     logger=logger,
                     max_epochs=hyperparameters["max_epochs"],
                     check_val_every_n_epoch=1,
                     accumulate_grad_batches=2,
                     checkpoint_callback=checkpoint_callback,
                     early_stop_callback=early_stop,
#                       resume_from_checkpoint=resume_from_checkpoint,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
model = BertFinetuner(hparams=hparams,train_dataloader=train_dataloader,val_dataloader=val_dataloader,test_dataloader=test_dataloader)

# Train
trainer.fit(model)

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                   | Type                          | Params
-----------------------------------------------------------------------------------------------------
0   | lossfunc                                               | CrossEntropyLoss              | 0     
1   | model                                                  | BertForSequenceClassification | 177 M 
2   | model.bert                                             | BertModel                     | 177 M 
3   | model.bert.embeddings                                  | BertEmbeddings                | 92 M  
4   | model.bert.embeddings.word_embeddings                  | Embedding                     | 91 M  
5   | model.bert.embeddings.position_embeddings              | Embedding                     | 393 K 
6   | model.bert.embeddings.token_type_embeddings            | Embed

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…






1

## Test model on Capes dataset

In [9]:
# Evaluate the model on the CAPES test loader it was constructed with.
# NOTE(review): this uses the in-memory model left by `trainer.fit`, presumably the last-epoch weights
# rather than the best checkpoint — confirm.
trainer.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[31074   538]
 [  109 15697]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     31612
           1       0.97      0.99      0.98     15806

    accuracy                           0.99     47418
   macro avg       0.98      0.99      0.98     47418
weighted avg       0.99      0.99      0.99     47418

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.9798071221247776}
--------------------------------------------------------------------------------



-----

# Test model on Scielo dataset 

In [10]:

# Bert Multilingual Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

max_length =  200
batch_size = 128

# Build the SciELO test split and process it with the tokenizer in one chained expression
scielo_dataset = CLPDDataset(name='scielo',
                             data_type='test',
                             n_negatives=1,
                             max_length=max_length).get_organized_data(tokenizer)

# Sequential (unshuffled) loader over the processed test data
scielo_dataloader = DataLoader(scielo_dataset,
                               shuffle=False,
                               num_workers=4,
                               batch_size=batch_size)


HBox(children=(FloatProgress(value=0.0, description='Processing SCIELO TEST', max=9956.0, style=ProgressStyle(…




In [11]:
# Hyperparameters
# Only the keys read by BertFinetuner matter for testing; experiment_name also names the FAILURE_*.txt file.
hyperparameters = {"experiment_name": "BERT-CAPES-TEST-ON-SCIELO", 
                   "max_epochs": 1,
                    "optimizer": torch.optim.Adam,
                    "patience": 1,
                     "steplr_epochs":1,
                    "scheduling_factor": 0.9, # optimization arguments
                    "learning_rate": 1e-5,
                   "max_length":max_length,
                   "batch_size":batch_size,
                   }
#------------------------------#
#       Checkpoints            #
#------------------------------#

log_path = 'logs'
# NOTE(review): ckpt_path is computed but not used anywhere in this cell.
ckpt_path = os.path.join(log_path, hyperparameters["experiment_name"], "-{epoch}-{val_loss:.2f}")  
# Resume from checkpoint Hard coded 
# NOTE(review): absolute, machine-specific path to the checkpoint produced by the CAPES training run.
resume_from_checkpoint= '/work/src/Bert-Baseline/logs/BERT-CAPES/BERT-CAPES-epoch=1-val_loss=0.01.ckpt'
# Logger
logger = TensorBoardLogger(hyperparameters["experiment_name"], name="BASELINE")

# Lightning Tester
# The checkpoint path is handed to the Trainer; the freshly built model below is presumably
# populated from it when `tester.test` runs — verify against the Lightning version in use.
tester = pl.Trainer(gpus=1,
                     logger=logger,
                     resume_from_checkpoint=resume_from_checkpoint,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
# No train/validation loaders are needed: this instance is only used for testing on SciELO.
model = BertFinetuner(hparams=hparams,train_dataloader=None,val_dataloader=None,test_dataloader=scielo_dataloader)


GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


In [12]:
# Run the test loop on the SciELO loader attached to `model`
tester.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[35937  3887]
 [   10 19902]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       1.00      0.90      0.95     39824
           1       0.84      1.00      0.91     19912

    accuracy                           0.93     59736
   macro avg       0.92      0.95      0.93     59736
weighted avg       0.95      0.93      0.94     59736

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.910825839225647}
--------------------------------------------------------------------------------



-----

## Test on books dataset

In [13]:

# Bert Multilingual Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# NOTE(review): max_length is set here but, unlike the CAPES and SciELO cells, it is not passed
# to CLPDDataset below, and neither is n_negatives — confirm the defaults are the intended values.
max_length =  200
batch_size = 300

books_dataset = CLPDDataset(name='books',data_type='test')

# `books_dataset` is rebound from the CLPDDataset object to its processed data
books_dataset = books_dataset.get_organized_data(tokenizer)
# Sequential (unshuffled) loader over the processed books data
books_dataloader = DataLoader(books_dataset, batch_size=batch_size,
                              shuffle=False, num_workers=4)


HBox(children=(FloatProgress(value=0.0, description='Processing BOOKS TEST', max=600.0, style=ProgressStyle(de…




In [14]:
# Hyperparameters
# Only the keys read by BertFinetuner matter for testing; experiment_name also names the FAILURE_*.txt file.
hyperparameters = {"experiment_name": "BERT-CAPES-TEST-ON-BOOKS", 
                   "max_epochs": 5,
                    "optimizer": torch.optim.Adam,
                    "patience": 1,
                     "steplr_epochs":1,
                    "scheduling_factor": 0.9, # optimization arguments
                    "learning_rate": 1e-5,
                   "max_length":max_length,
                   "batch_size":batch_size,
                   }
#------------------------------#
#       Checkpoints            #
#------------------------------#

log_path = 'logs'
# NOTE(review): ckpt_path is computed but not used anywhere in this cell.
ckpt_path = os.path.join(log_path, hyperparameters["experiment_name"], "-{epoch}-{val_loss:.2f}")  
# Resume from checkpoint Hard coded 
# NOTE(review): absolute, machine-specific path to the checkpoint produced by the CAPES training run.
resume_from_checkpoint= '/work/src/Bert-Baseline/logs/BERT-CAPES/BERT-CAPES-epoch=1-val_loss=0.01.ckpt'
# Logger
logger = TensorBoardLogger(hyperparameters["experiment_name"], name="BASELINE")

# Lightning Tester
# The checkpoint path is handed to the Trainer; the freshly built model below is presumably
# populated from it when `tester.test` runs — verify against the Lightning version in use.
tester = pl.Trainer(gpus=1,
                     logger=logger,
                     resume_from_checkpoint=resume_from_checkpoint,
                     amp_level='O2', use_amp=False)
hparams = Namespace(**hyperparameters)
# No train/validation loaders are needed: this instance is only used for testing on the books data.
model = BertFinetuner(hparams=hparams,train_dataloader=None,val_dataloader=None,test_dataloader=books_dataloader)


GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


In [15]:
# Run the test loop on the books loader attached to `model`
tester.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

CONFUSION MATRIX:
[[1728  672]
 [  12 2388]]
SKLEARN  REPORT
              precision    recall  f1-score   support

           0       0.99      0.72      0.83      2400
           1       0.78      0.99      0.87      2400

    accuracy                           0.86      4800
   macro avg       0.89      0.86      0.85      4800
weighted avg       0.89      0.86      0.85      4800

--------------------------------------------------------------------------------
TEST RESULTS
{'test_f1': 0.8747252747252747}
--------------------------------------------------------------------------------

