# Comparação de Modelos Baseados em BERT na Tarefa de Análise de Sentimentos Binária

Este notebook tem como objetivo explorar a capacidade de modelos baseados em BERT de aprender em poucas épocas de
treinamento no problema de análise de sentimentos binária.

In [1]:
# Loading the datasets

from datasets import load_dataset

# Each dataset is loaded only when its name is not already defined, so
# re-running this cell in a live kernel reuses the existing object.
if 'yelp' not in locals():
    yelp = load_dataset('yelp_polarity')

if 'imdb' not in locals():
    imdb = load_dataset('imdb')

if 'rotten_tomatoes' not in locals():
    rotten_tomatoes = load_dataset('rotten_tomatoes')

# One summary per line, in the same order as the loads above.
print(yelp, imdb, rotten_tomatoes, sep='\n')

Reusing dataset yelp_polarity (C:\Users\Mikael\.cache\huggingface\datasets\yelp_polarity\plain_text\1.0.0\2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c51a98c)
Reusing dataset imdb (C:\Users\Mikael\.cache\huggingface\datasets\imdb\plain_text\1.0.0\90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)
Using custom data configuration default
Reusing dataset rotten_tomatoes_movie_review (C:\Users\Mikael\.cache\huggingface\datasets\rotten_tomatoes_movie_review\default\1.0.0\9198dbc50858df8bdb0d5f18ccaf33125800af96ad8434bc8b829918c987ee8a)
DatasetDict({'train': Dataset(features: {'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['1', '2'], names_file=None, id=None)}, num_rows: 560000), 'test': Dataset(features: {'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['1', '2'], names_file=None, id=None)}, num_rows: 38000)})
DatasetDict({'train': Dataset(features: {'text': Value(dtype='string', id=None), 'label

In [2]:
# Building the tokenizer/model pairs

from transformers import AutoTokenizer, AutoModelForSequenceClassification


def _load_tokenizer_and_classifier(checkpoint):
    """Return ``(tokenizer, model)`` loaded from the same pretrained checkpoint name."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return tokenizer, classifier


bert_tokenizer, bert_model = _load_tokenizer_and_classifier("bert-base-uncased")

roberta_tokenizer, roberta_model = _load_tokenizer_and_classifier("roberta-base")

distil_bert_tokenizer, distil_bert_model = _load_tokenizer_and_classifier("distilbert-base-uncased")

distil_roberta_tokenizer, distil_roberta_model = _load_tokenizer_and_classifier("distilroberta-base")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

class SentimentAnalysisClassifier(pl.LightningModule):
    """LightningModule that trains a wrapped sequence-classification model.

    Args:
        bert: Model invoked as ``bert(**inputs)``. Element ``[0]`` of its
            output is passed to ``F.cross_entropy`` as the predictions.
        tokenizer: Stored on the module; no method of this class uses it.
        optimizer: Callable invoked as ``optimizer(self.parameters())`` to
            build the optimizer.
        batch_size: Stored as ``self.batch_size``; no method of this class
            uses it.
    """

    def __init__(self, bert, tokenizer, optimizer, batch_size=32):
        super().__init__()
        self.bert = bert
        self.optimizer = optimizer
        self.tokenizer = tokenizer
        # Previously accepted but discarded; keep it reachable for callers.
        self.batch_size = batch_size

    def configure_optimizers(self):
        """Build the optimizer over all parameters of this module."""
        return self.optimizer(self.parameters())

    @staticmethod
    def _drop_per_sample_axis(value):
        """Turn a ``(batch, 1, seq)`` tensor into ``(batch, seq)``; pass others through.

        A bare ``torch.squeeze`` would also remove the batch axis whenever the
        batch holds a single sample, so only axis 1 is ever squeezed.
        """
        if value.dim() == 3 and value.size(1) == 1:
            return value.squeeze(1)
        return value

    def forward(self, x):
        """Run the wrapped model on a dict of input tensors and return its raw output."""
        inputs = {key: self._drop_per_sample_axis(value) for key, value in x.items()}
        return self.bert(**inputs)

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one ``(inputs, labels)`` batch and log it."""
        x, y = batch
        y_hat = self(x)[0]
        loss = F.cross_entropy(y_hat, y)
        # NOTE(review): pl.TrainResult appears to have been removed in PyTorch
        # Lightning 1.0 in favour of self.log(...) -- confirm before upgrading.
        result = pl.TrainResult(loss)
        result.log('train_loss', loss, prog_bar=True)
        return result


In [4]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        ds: Mapping indexed as ``ds[kind]['text']`` and ``ds[kind]['label']``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')``
            that returns a mapping of feature name to tensor.
        kind: Key of the split to read from ``ds`` (default ``'train'``).
    """

    # Features forwarded to the model when the tokenizer produces them.
    FEATURE_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, ds, tokenizer, kind='train'):
        self.inputs = ds[kind]['text']
        self.outputs = torch.LongTensor(ds[kind]['label'])
        self.tokenizer = tokenizer
        self.kind = kind

    def __len__(self):
        """Number of texts in the selected split."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the text at position ``idx``."""
        tokenized_inputs = self.tokenizer(self.inputs[idx],
            padding='max_length',
            truncation=True,
            return_tensors='pt')

        # Not every tokenizer emits every feature (some omit 'token_type_ids'),
        # so forward only the keys that are actually present instead of
        # indexing them unconditionally and raising KeyError.
        features = {
            key: tokenized_inputs[key]
            for key in self.FEATURE_KEYS
            if key in tokenized_inputs
        }
        return (features, self.outputs[idx])

# Training split of the Yelp dataset, tokenized with the BERT tokenizer.
ds = SentimentDataset(ds=yelp, tokenizer=bert_tokenizer, kind='train')

In [5]:
from functools import partial

# Fix the RNG so the shuffled batch order below is reproducible across runs.
torch.manual_seed(42)

# Shuffle the training examples so batches are not drawn in dataset order.
dl = DataLoader(ds, batch_size=2, shuffle=True)

# torch.optim.AdamW defaults to lr=1e-3, which is far above the 2e-5..5e-5
# range usually recommended for fine-tuning BERT-style models; bind a
# fine-tuning learning rate explicitly.
model = SentimentAnalysisClassifier(
    bert_model,
    tokenizer=bert_tokenizer,
    optimizer=partial(torch.optim.AdamW, lr=2e-5),
)
trainer = pl.Trainer(max_epochs=10, gpus=[0])
trainer.fit(model, dl)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type                          | Params
-------------------------------------------------------
0 | bert | BertForSequenceClassification | 109 M 
Epoch 0:   0%|          | 85/280000 [00:22<20:59:18,  3.70it/s, loss=0.695, v_num=32, train_loss=0.903]