In [203]:
#@title &nbsp;

from IPython.display import display_html

style = ''''
    @import url('https://fonts.googleapis.com/css2?family=Comfortaa:wght@400;500;700&display=swap');

    body {
        display: flex;
        justify-content: center;
        align-items: center;
        min-height: 100vh;
        margin: 0;
    }

    .container {
        display: flex;
        justify-content: center;
        align-items: center;
        gap: 20px;
        margin-top: 30px;
    }

    img {
        max-width: 100%;
        height: auto;
    }

    h1 {
        margin: 0;
        size: 20px;
    }

'''

display_html(f'''
<style>
    {style}
</style>

<div class='container'>
    <img src='https://gcdnb.pbrd.co/images/hgNPk95VoyK6.png?o=1', width=200>
    <h1>Modeling</h1>
</div>
<br>
<br>
''', raw=True)

# Setting up

In [204]:
# !pip install transformers demoji loguru

In [205]:
from sklearn.model_selection import train_test_split
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import stopwords
import torch.nn.functional as F
from collections import Counter
from google.colab import drive
import plotly.express as px
from sklearn import metrics
from textwrap import wrap
from loguru import logger
from torch import cuda
from torch import nn
import pandas as pd
import transformers
import numpy as np
import string
import demoji
import torch
import nltk

from transformers import (
    get_linear_schedule_with_warmup,
    BertTokenizer,
    BertConfig,
    BertModel
)

from torch.utils.data import (
    SequentialSampler,
    RandomSampler,
    DataLoader,
    Dataset
)

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/reviews_preprocessed.csv')

df.dropna(inplace=True)

if df.all != None:
    print('Dataset imported successfully 🎉')
else:
    print('Dataset import failed ☹')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset imported successfully 🎉


# Quick EDA

In [206]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [207]:
stopwords = set(stopwords.words('portuguese'))
punctuations = set(string.punctuation)

tokens_frequencies = Counter([item for sublist in df['tokens'] for item in sublist.split() if item not in stopwords and item not in punctuations])

In [208]:
aux = pd.DataFrame(tokens_frequencies.most_common(10), columns=['Item', 'Quantidade'])

fig = px.bar(
    aux,
    x='Item',
    y='Quantidade',
    color='Quantidade'
)

fig.layout.template = 'plotly_dark'
fig.layout.title = 'Frequência de itens comentados'

fig.show()

In [209]:
tokens_amount = [len(tokens) for tokens in df.tokens.values]

fig = px.histogram(
    tokens_amount,
    nbins=100,
    color_discrete_sequence=['orange']
)

fig.layout.title = 'Distribution of token amount'
fig.layout.template = 'plotly_dark'

fig.show()

In [210]:
df['sentimento'] = df.sentimento.map(dict({
    'negative': 0,
    'neutral': 1,
    'positive': 2
}))

# Getting the BERT tokens for the whole dataset

In [211]:
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

tt = 'Olá, esse é um teste! <3'
test_tokens = tokenizer.encode_plus(
    tt,
    max_length=32,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=True,
    truncation=True,
    padding='max_length',
    return_tensors='pt'
)

print('INPUT IDS')
print(test_tokens['input_ids'])
print('\nATTENTION MASK')
print(test_tokens['attention_mask'])

INPUT IDS
tensor([[  101,  1651, 22303,   117,  1966,   253,   222,  3515,   106,   133,
           511,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])

ATTENTION MASK
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])


In [212]:
def get_tokens(text):
  encodes = tokenizer.encode_plus(
      text,
      max_length=512,
      add_special_tokens=True,
      return_attention_mask=True,
      return_token_type_ids=True,
      truncation=True,
      padding='max_length',
      return_tensors='pt'
  )

  return dict({
      'input_ids': encodes['input_ids'].squeeze(),
      'attention_mask': encodes['attention_mask'].squeeze()
  })

df['input_ids'], df['attention_mask'] = df.texto_refinado.apply(lambda x: get_tokens(x)['input_ids']), df.texto_refinado.apply(lambda y: get_tokens(y)['attention_mask'])

In [213]:
sizes = list([])
for i in df.input_ids:
  sizes.append(i.size()[0])

print(np.array(sizes).max(), np.array(sizes).min(), np.array(sizes).mean())

512 512 512.0


In [214]:
test_tokens['input_ids'].squeeze().size()

torch.Size([32])

# Creating the Pytorch dataset class

In [215]:
class CustomDF(Dataset):
  def __init__(self, text, targets, tokenizer, max_len):
    self.text = text
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.text)
  
  def __getitem__(self, item):
    text = str(self.text[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    return dict({
        'text': text,
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'targets': torch.tensor(target, dtype=torch.int64)
    })

# Splitting the data

In [216]:
SEED = 20

train_df, test_df = train_test_split(df, test_size=.2, random_state=SEED)
val_df, test_df = train_test_split(test_df, test_size=.5, random_state=SEED)

print(f'Amount of entries for train: {train_df.shape[0]}')
print(f'Amount of entries for test: {test_df.shape[0]}')
print(f'Amount of entries for validation: {val_df.shape[0]}')

Amount of entries for train: 8636
Amount of entries for test: 1080
Amount of entries for validation: 1080


# Creating a function to create dataloaders

In [217]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  dataset = CustomDF(
      text=df.texto_refinado.to_numpy(),
      targets=df.sentimento.to_numpy(dtype=int),
      tokenizer=tokenizer,
      max_len=max_len
  )

  return DataLoader(
      dataset=dataset,
      batch_size=batch_size,
      num_workers=2,
      drop_last=True
  )

BATCH_SIZE = 32
MAX_LEN = 128

train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)

In [218]:
train_data = next(iter(train_data_loader))
train_data

{'text': ['a nova atualização pile_of_poo ficou horrível navegação app péssima não contar empresas põem valor mínimo compra poder assim fazer venda casada obrigando cliente comprar precisa faturar valores entrega muitas vezes alto próprio pedido onde viu paga 30 00 entrega ??? loucura',
  'péssimo todos sentidos gostaria conhecer gênio inventou fechar refrigetante papel poi entrega nao entorne aí pede pedido feito restaurante 5km distância tendo lado ). resumindo sempre chega frio molhado já app permite cancelar pedido atendimento demora 3h poderia escrever ma limite caracteres permite',
  'a navegação site lenta aplicativo miui traz experiência conflitante muito demorado carrega fazer qualquer coisa fazer primeiro pedido beira frustração difícil novo usuário entender acontecendo taxas cadastrar coisas básicas endereço número telefone tanta lentidão dúvida chega questionar pedido realmente feito fora estética excelente layout só comprei desconto',
  'pop',
  'app legal porém muitos res

In [219]:
print(train_data['input_ids'].size())
print(train_data['attention_mask'].size())
print(train_data['targets'].size())

torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32])


# Modeling

In [220]:
class SentimentClassifier(nn.Module):
  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
    self.drop = nn.Dropout(p=.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    bmodel = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
    )

    attentions = bmodel.attentions
    cross_attentions = bmodel.cross_attentions
    hidden_states = bmodel.hidden_states
    last_hidden_state = bmodel.last_hidden_state
    past_key_values = bmodel.past_key_values
    pooler_output = bmodel.pooler_output

    output = self.drop(pooler_output)

    return self.out(output)

In [221]:
print(df.sentimento.unique())
class_names = list(['negative', 'neutral', 'positive'])

model = SentimentClassifier(len(class_names))

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'The device will be using {str.upper(device)}')
model = model.to(device)

[0 1 2]


Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The device will be using CUDA


# Training

Reproducing the procedure from the original BERT paper using the AdamW optimizer by Hugging Face.
It corrects weight decay. I'll be also using a linear scheduler with no warmup steps. 

In [222]:
EPOCHS = 8

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5
)

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

## BERT authors recommendations for fine-tuning

| Hyperparameter | Value |
| -------- | -------- |
| Batch size | `16`, `32` |
| Learning rate (Adam) | `5e-5`, `3e-5`, `2e-5` |
| Number of epochs | `2`, `3`, `4` | 

I'll ignore the number of epochs recommendation but stick with the rest.

> NOTE: 
> Increasing the batch size reduces the training time significantly, but gives lower accuracy.

In [223]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  model = model.train()

  losses = list([])
  correct_predictions = 0

  for data in data_loader:
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    targets = data['targets'].to(device)

    output = model(
        input_ids=input_ids,
        attention_mask=attention_mask
    )

    _, pred = torch.max(output, dim=1)
    loss = loss_fn(output, targets)

    correct_predictions += torch.sum(pred == targets)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()

  losses = list([])
  correct_predictions = 0

  with torch.no_grad():
    for data in data_loader:
      input_ids = data['input_ids'].to(device)
      attention_mask = data['attention_mask'].to(device)
      targets = data['targets'].to(device)

      output = model(
          input_ids=input_ids,
          attention_mask=attention_mask
      )

      _, pred = torch.max(output, dim=1)
      loss = loss_fn(output, targets)

      correct_predictions += torch.sum(pred == targets)
      losses.append(loss.item())
  
  return correct_predictions / n_examples, np.mean(losses)

In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

try:
  for epoch in range(EPOCHS):
    logger.info(f'EPOCH {epoch + 1}/{EPOCHS}')

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_df)
    )

    logger.info(f'TRAIN LOSS: {train_loss}  |  ACCURACY: {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(val_df)
    )

    logger.info(f'VALIDATION LOSS: {val_loss}  |  ACCURACY: {val_acc}\n\n')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
      best_accuracy = val_acc
      torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/models/best_model_state.bin')
    
  logger.success(f'MODEL TRAINED WITH {EPOCHS} EPOCHS AND ACHIEVED {best_accuracy} OF BEST ACCURACY 🎉')

except Exception as e:
  logger.error(f'ERROR: {str.upper(str(e))}\n')

[32m2023-05-10 01:03:09.343[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mEPOCH 1/8[0m
[32m2023-05-10 01:05:54.132[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mTRAIN LOSS: 0.8204962819482314  |  ACCURACY: 0.6082677245140076[0m
[32m2023-05-10 01:06:01.636[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mVALIDATION LOSS: 0.792832124413866  |  ACCURACY: 0.6268518567085266

[0m
[32m2023-05-10 01:06:03.247[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mEPOCH 2/8[0m
[32m2023-05-10 01:08:55.383[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mTRAIN LOSS: 0.604567931265636  |  ACCURACY: 0.7361046671867371[0m
[32m2023-05-10 01:09:03.050[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mVALIDATION LOSS: 0.906921948447372  |  ACCURACY: 0.6222221851348877

[0m
[32m2023-05-10 01:09:03.057[0m | [1mINFO