# Comparação na Task de Análise de Sentimentos Binária utilizando Modelos Baseados em BERT.

Este notebook tem como objetivo explorar a capacidade de modelos baseados em BERT treinar em poucas épocas dentro do
problema de análise de sentimentos binária.

In [1]:
!pip install transformers datasets pytorch-lightning sklearn

You should consider upgrading via the 'd:\projects\sentiment-analysis\.env\scripts\python.exe -m pip install --upgrade pip' command.


In [2]:
# Carregando Datasets

from datasets import load_dataset
from sklearn.model_selection import train_test_split

if not 'yelp' in locals():
    yelp = load_dataset('yelp_polarity')
    yelp['train'], yelp['validation'] = train_test_split(yelp['train'], test_size=0.1, random_state=42)

if not 'imdb' in locals():
    imdb = load_dataset('imdb')
    imdb['train'], imdb['validation'] = train_test_split(imdb['train'], test_size=0.1, random_state=42)

if not 'rotten_tomatoes' in locals():
    rotten_tomatoes = load_dataset('rotten_tomatoes')
    rotten_tomatoes['train'], rotten_tomatoes['validation'] = train_test_split(rotten_tomatoes['train'], test_size=0.1, random_state=42)


Reusing dataset yelp_polarity (C:\Users\Mikael\.cache\huggingface\datasets\yelp_polarity\plain_text\1.0.0\2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c51a98c)
Reusing dataset imdb (C:\Users\Mikael\.cache\huggingface\datasets\imdb\plain_text\1.0.0\90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)
Using custom data configuration default
Reusing dataset rotten_tomatoes_movie_review (C:\Users\Mikael\.cache\huggingface\datasets\rotten_tomatoes_movie_review\default\1.0.0\9198dbc50858df8bdb0d5f18ccaf33125800af96ad8434bc8b829918c987ee8a)


In [3]:
# Carregando Modelos

from transformers import AutoTokenizer, AutoModelForSequenceClassification

bert_tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
bert_model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny")

#roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
#roberta_model = AutoModelForSequenceClassification.from_pretrained("roberta-base")

#distil_bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
#distil_bert_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

#distil_roberta_tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
#distil_roberta_model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base")

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [4]:
param_list = ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
for name, param in bert_model.named_parameters():
    if not name in param_list:
        param.required_grad = False

In [5]:
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

class SentimentAnalysisClassifier(pl.LightningModule):
    def __init__(self, bert, tokenizer, dataset, batch_size=32, optimizer=torch.optim.AdamW, learning_rate=1e-6):
        super().__init__()
        self.bert = bert
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.train_dataset = SentimentDataset(dataset, tokenizer)
        self.val_dataset = SentimentDataset(dataset, tokenizer, kind='validation')
        self.test_dataset = SentimentDataset(dataset, tokenizer, kind='test')
        self.train_acc = pl.metrics.Accuracy()
        self.valid_acc = pl.metrics.Accuracy()

    def configure_optimizers(self):
        optimizer = self.optimizer(self.parameters(), lr=self.learning_rate)
        return optimizer

    def forward(self, x):
        inputs = {key: torch.squeeze(value) for key, value in x.items()}
        result = self.bert(**inputs)
        return result

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)[0]
        loss = F.cross_entropy(y_hat, y)
        self.train_acc(y_hat, y)
        return {'loss': loss}
    
    def training_epoch_end(self, losses):
        train_epoch_loss = torch.Tensor([x['loss'] for x in losses]).mean().item()
        self.log('train_loss', train_epoch_loss, prog_bar=True)
        self.log('train_acc', self.train_acc.compute(), prog_bar=True)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)[0]
        loss = F.cross_entropy(y_hat, y)
        self.valid_acc(y_hat, y)
        return {'val_loss': loss}

    def validation_epoch_end(self, losses):
        val_epoch_loss = torch.Tensor([x['val_loss'] for x in losses]).mean().item()
        self.log('val_loss', val_epoch_loss, prog_bar=True)
        self.log('val_acc', self.valid_acc.compute().item(), prog_bar=True)

    def train_dataloader(self):
        train_dl = DataLoader(self.train_dataset, self.batch_size)
        return train_dl
    
    def val_dataloader(self):
        validation_dl = DataLoader(self.val_dataset, self.batch_size)
        return validation_dl

    def test_dataloader(self):
        test_dl = DataLoader(self.test_dataset, self.batch_size)
        return test_dl

In [6]:
class SentimentDataset(Dataset):
    def __init__(self, ds, tokenizer, kind='train', max_length=512):
        self.inputs = ds[kind]['text']
        self.outputs = torch.LongTensor(ds[kind]['label'])
        self.tokenizer = tokenizer
        self.kind = kind
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):

        tokenized_inputs = self.tokenizer(self.inputs[idx],
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt')

        return ({
            'input_ids': tokenized_inputs['input_ids'],
            'token_type_ids': tokenized_inputs['token_type_ids'],
            'attention_mask': tokenized_inputs['attention_mask'],
        }, self.outputs[idx])

In [7]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

batch_size = 128
dataset = yelp
epochs = 100

model = SentimentAnalysisClassifier(bert_model, dataset=dataset, tokenizer=bert_tokenizer, optimizer=torch.optim.AdamW, batch_size=batch_size)

trainer = pl.Trainer(max_epochs=epochs, gpus=1, auto_lr_find=True, callbacks=[EarlyStopping(monitor='val_loss')])
trainer.tune(model)
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                          | Params
------------------------------------------------------------
0 | bert      | BertForSequenceClassification | 4 M   
1 | train_acc | Accuracy                      | 0     
2 | valid_acc | Accuracy                      | 0     
Finding best initial lr: 100%|██████████| 100/100 [00:42<00:00,  2.35it/s]
Learning rate set to 0.002754228703338169

  | Name      | Type                          | Params
------------------------------------------------------------
0 | bert      | BertForSequenceClassification | 4 M   
1 | train_acc | Accuracy                      | 0     
2 | valid_acc | Accuracy                      | 0     
Epoch 1:   9%|▉         | 386/4376 [02:44<38:11,  1.74it/s, loss=0.363, v_num=34, train_loss=1.08, train_acc=0.573]

1