<a href="https://colab.research.google.com/github/mlpbraga/sexism-detection-notebooks/blob/main/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 22.6MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 41.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 47.0MB/s 
Installing 

In [2]:
import transformers
import torch
import math
import pandas as pd
from tqdm.notebook import trange, tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando: {device}")

Usando: cuda


In [4]:
class SentimentModel(torch.nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.pretrained_model = pretrained_model
        self.dense1           = torch.nn.Linear(in_features=768 * 512, out_features=512)
        self.dense2           = torch.nn.Linear(in_features=512, out_features=1)
        self.dropout          = torch.nn.Dropout(0.2)
        self.relu             = torch.nn.ReLU()
        self.sigmoid          = torch.nn.Sigmoid()

    def forward(self, X):
        y_hat = self.pretrained_model(X['input_ids'], X['attention_mask'])
        y_hat = y_hat[0].flatten(start_dim=1)
        y_hat = self.dense1(y_hat)
        y_hat = self.relu(y_hat)
        y_hat = self.dropout(y_hat)
        y_hat = self.dense2(y_hat)
        y_hat = self.sigmoid(y_hat)
        return y_hat

In [5]:
class DisasterDataset(torch.utils.data.Dataset):
  def __init__(self, X_input_ids, X_attention_mask, y):
    self.X_input_ids = X_input_ids
    self.X_attention_mask = X_attention_mask
    self.y = y

  def __getitem__(self, index):
    _x_ii = self.X_input_ids[index]
    _x_am = self.X_attention_mask[index]
    _y = self.y[index]
    return _x_ii, _x_am, _y

  def __len__(self):
    return len(self.X_input_ids)

In [None]:
from sklearn.model_selection import train_test_split

labelad_data = pd.read_csv('https://raw.githubusercontent.com/mlpbraga/sexism-detection-notebooks/main/data/labeled-comments.csv')
dataset = labelad_data[['content','label']]

pretrained_model = transformers.AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased').to(device=device)
tokenizer = transformers.AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

epochs = 5
X, y = list(dataset['content']), list(dataset['label'])

metrics = []
for i in range(0, 10):
  print(f'-------------------------------- {i+1} -------------------------------------')
  X_train_original, X_test_original, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

  for param in pretrained_model.parameters():
    param.requires_grad = False

  X_train = tokenizer(X_train_original,
                    padding='max_length',
                    truncation=True,
                    max_length=512,
                    return_tensors='pt').to(device)

  y_train = torch.Tensor(y_train).to(device=device).view(-1, 1)

  dd_train = DisasterDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
  dl_train = torch.utils.data.DataLoader(dd_train, batch_size=64, shuffle=True)

  sm = SentimentModel(pretrained_model).to(device)
  sm.train()

  criterion = torch.nn.BCELoss()
  optimizer = torch.optim.Adam(sm.parameters(), lr=0.0001)

  results = []
  losses_train_mean = []
  for epoch in range(epochs):
    losses_train = []
    losses_val = []
    for batch in tqdm(dl_train):
      X_ii_b, X_am_b, y_b = batch
      X_ii_b = X_ii_b.to(device)
      X_am_b = X_am_b.to(device)
      y_b = y_b.to(device)

      optimizer.zero_grad()
      output = sm.forward({
          'input_ids': X_ii_b,
          'attention_mask': X_am_b
      })

      loss_train = criterion(output, y_b)
      losses_train.append(loss_train.item())
      loss_train.backward()
      optimizer.step()

    geral_loss = sum(losses_train)/len(dl_train)
    losses_train_mean.append(geral_loss)
    print(f"Epoch: {epoch}, Loss: {round(geral_loss, 4)}")

  # test
  X_test = tokenizer(X_test_original,
                    padding='max_length',
                    truncation=True,
                    max_length=512,
                    return_tensors='pt').to('cpu')
  y_test = torch.Tensor(y_test).to(device='cpu').view(-1, 1)


  dd_test = DisasterDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
  dl_test = torch.utils.data.DataLoader(dd_test, batch_size=64, shuffle=True)

  all_results = []
  for batch in tqdm(dl_test):
    X_ii_b, X_am_b, y_b = batch
    X_ii_b = X_ii_b.to(device)
    X_am_b = X_am_b.to(device)
    y_b = y_b.to(device)
    sm.eval()
    with torch.no_grad():
      results = sm.forward({
        'input_ids': X_ii_b,
        'attention_mask': X_am_b
      }).cpu()
      for element in results:
        all_results.append(element)

  y_pred = []
  for result in all_results:
    value = 1 if result > 0.5 else 0
    y_pred.append(value)

  y_test_ = []
  for result in y_test:
    value = int(result)
    y_test_.append(value)

  metrics.append((precision_score(y_test_, y_pred),
                  recall_score(y_test_, y_pred),
                  f1_score(y_test_, y_pred)))

  print(f'Metrics:', (precision_score(y_test_, y_pred),
                  recall_score(y_test_, y_pred),
                  f1_score(y_test_, y_pred)))
  print(classification_report(y_test_, y_pred))


Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


-------------------------------- 1 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 1.0954


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.5749


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.4981


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 3, Loss: 0.4184


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 4, Loss: 0.3548


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Metrics: (0.5701357466063348, 0.32727272727272727, 0.4158415841584158)
              precision    recall  f1-score   support

           0       0.48      0.71      0.57       330
           1       0.57      0.33      0.42       385

    accuracy                           0.50       715
   macro avg       0.52      0.52      0.49       715
weighted avg       0.53      0.50      0.49       715

-------------------------------- 2 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 1.2246


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.6138


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.5505


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 3, Loss: 0.4861


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 4, Loss: 0.4335


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Metrics: (0.5480769230769231, 0.44415584415584414, 0.49067431850789095)
              precision    recall  f1-score   support

           0       0.47      0.57      0.52       330
           1       0.55      0.44      0.49       385

    accuracy                           0.50       715
   macro avg       0.51      0.51      0.50       715
weighted avg       0.51      0.50      0.50       715

-------------------------------- 3 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 1.018


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.4974


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.4132


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 3, Loss: 0.3315


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 4, Loss: 0.2573


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Metrics: (0.512396694214876, 0.4831168831168831, 0.4973262032085561)
              precision    recall  f1-score   support

           0       0.43      0.46      0.45       330
           1       0.51      0.48      0.50       385

    accuracy                           0.47       715
   macro avg       0.47      0.47      0.47       715
weighted avg       0.48      0.47      0.47       715

-------------------------------- 4 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 1.099


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.498


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.3898


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 3, Loss: 0.3157


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 4, Loss: 0.2555


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Metrics: (0.5803108808290155, 0.5818181818181818, 0.5810635538261997)
              precision    recall  f1-score   support

           0       0.51      0.51      0.51       330
           1       0.58      0.58      0.58       385

    accuracy                           0.55       715
   macro avg       0.55      0.55      0.55       715
weighted avg       0.55      0.55      0.55       715

-------------------------------- 5 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 0.9322


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.5167


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.4078


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 3, Loss: 0.3246


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 4, Loss: 0.2585


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Metrics: (0.5211640211640212, 0.5116883116883116, 0.5163826998689384)
              precision    recall  f1-score   support

           0       0.44      0.45      0.45       330
           1       0.52      0.51      0.52       385

    accuracy                           0.48       715
   macro avg       0.48      0.48      0.48       715
weighted avg       0.48      0.48      0.48       715

-------------------------------- 6 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 1.0447


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.5466


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.4445


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 3, Loss: 0.3762


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 4, Loss: 0.3052


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Metrics: (0.5674931129476584, 0.535064935064935, 0.5508021390374331)
              precision    recall  f1-score   support

           0       0.49      0.52      0.51       330
           1       0.57      0.54      0.55       385

    accuracy                           0.53       715
   macro avg       0.53      0.53      0.53       715
weighted avg       0.53      0.53      0.53       715

-------------------------------- 7 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 1.09


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.571


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.4909


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 3, Loss: 0.4155


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 4, Loss: 0.3637


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Metrics: (0.5601503759398496, 0.38701298701298703, 0.4577572964669739)
              precision    recall  f1-score   support

           0       0.47      0.65      0.55       330
           1       0.56      0.39      0.46       385

    accuracy                           0.51       715
   macro avg       0.52      0.52      0.50       715
weighted avg       0.52      0.51      0.50       715

-------------------------------- 8 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 1.2002


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.5747


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.5062


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 3, Loss: 0.4281


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 4, Loss: 0.3508


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Metrics: (0.5353159851301115, 0.37402597402597404, 0.44036697247706424)
              precision    recall  f1-score   support

           0       0.46      0.62      0.53       330
           1       0.54      0.37      0.44       385

    accuracy                           0.49       715
   macro avg       0.50      0.50      0.48       715
weighted avg       0.50      0.49      0.48       715

-------------------------------- 9 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 1.1171


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.5744


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.495


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 3, Loss: 0.434


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 4, Loss: 0.3688


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Metrics: (0.5400516795865633, 0.5428571428571428, 0.5414507772020725)
              precision    recall  f1-score   support

           0       0.46      0.46      0.46       330
           1       0.54      0.54      0.54       385

    accuracy                           0.50       715
   macro avg       0.50      0.50      0.50       715
weighted avg       0.50      0.50      0.50       715

-------------------------------- 10 -------------------------------------


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 0, Loss: 1.0455


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 1, Loss: 0.5521


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))


Epoch: 2, Loss: 0.4619


HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))

In [1]:
precision_0 = [0.48, 0.47,0.43,0.51,0.44,0.49,0.47,0.46,0.46,0.46]
precision_1 = [0.57, 0.55,0.51,0.58,0.52,0.57,0.56,0.54,0.54,0.53]
recall_0    = [0.71, 0.57,0.46,0.51,0.45,0.52,0.65,0.62,0.46,0.60]
recall_1    = [0.33,0.44,0.48,0.58,0.51,0.54,0.39,0.37,0.54,0.40]
f1_0        = [0.57,0.52,0.45,0.51,0.45,0.51,0.55,0.53,0.46,0.52]
f1_1        = [0.42,0.49,0.50,0.58,0.52,0.55,0.46,0.44,0.54,0.46]


In [2]:
def media(lista):
    return sum(lista)/10

In [4]:
media(precision_0),media(precision_1)

(0.46699999999999997, 0.547)

In [5]:
media(recall_0), media(recall_1)

(0.5549999999999999, 0.458)

In [6]:
media(f1_0), media(f1_1)

(0.507, 0.496)