In [82]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import sys
import shutil

df_train = pd.read_csv('train_ptbr.csv')
df_test = pd.read_csv('test_ptbr.csv')

In [83]:
df_train.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,ptbr_train_track_a_00001,Moça eu fiz uma análise com o Tinder uma vez e...,0,0,0,0,0,0
1,ptbr_train_track_a_00002,"eles sempre mostram a míseria, em como tudo er...",1,1,0,0,0,0
2,ptbr_train_track_a_00003,eu nunca quis tanto algo como quero você. jama...,0,0,0,1,1,0
3,ptbr_train_track_a_00004,esperando aqui o stf vim me emparedar do eu ex...,1,0,0,0,0,0
4,ptbr_train_track_a_00005,"e no final o PS vai pagar o pato, ja é o 4° tr...",1,0,0,0,0,0


In [84]:
df_train.columns

Index(['id', 'text', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'], dtype='object')

In [85]:
df_test.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,ptbr_test_track_a_00001,me recuso a sair de casa nesse frio do caralho...,1,0,0,0,0,0
1,ptbr_test_track_a_00002,tenho desde camisola nova até vestido de festa...,0,0,0,1,0,0
2,ptbr_test_track_a_00003,"minha playlist de descobertas veio com trap, r...",1,0,0,0,0,0
3,ptbr_test_track_a_00004,"Então, acho que ele gosta de homem e nem sabe ...",0,0,0,0,0,0
4,ptbr_test_track_a_00005,"Parabéns, Prof. Fernando. Parabéns às educador...",0,0,0,1,0,0


In [86]:
df_train.drop(columns=['id'], inplace=True)
df_test.drop(columns=['id'], inplace=True)

target_list = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

In [87]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [88]:
class CustomDataset (torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = self.df['text']
        self.targets = self.df[target_list].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'targets': torch.FloatTensor(self.targets[idx])
        }


In [89]:
# train_size = 0.8
# df_train_sample = df_train.sample(frac=train_size, random_state=42).reset_index(drop=True)
# df_validation = df_train.drop(df_train_sample.index).reset_index(drop=True)

In [90]:
custom_train_df = CustomDataset(df_train, tokenizer)
custom_validation_df = CustomDataset(df_test, tokenizer)

In [91]:
train_data_loader = torch.utils.data.DataLoader(
    custom_train_df,
    shuffle=True
)

validation_data_loader = torch.utils.data.DataLoader(
    custom_validation_df,
    shuffle=False
)


In [92]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [93]:
def load_checkpoint(checkpoint_file, model, optimizer):
    checkpoint = torch.load(checkpoint_file)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, valid_loss_min.item(), checkpoint['epoch']

def save_checkpoint(state, is_best, checkpoint_path, best_model_path):
    file_path = checkpoint_path
    torch.save(state, file_path)
    if is_best:
        best_file_path = best_model_path
        shutil.copyfile(file_path, best_file_path)

In [94]:
class BERTClass(nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, 6)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output
    
model = BERTClass()
model.to(device)

BERTClass(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [95]:
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [96]:
def train_model(n_epochs, train_data_loader, validation_data_loader, model, optimizer, checkpoint_path, best_model_path):
    valid_loss_min = np.inf
    for epoch in range(n_epochs):

        # training phase
        model.train()
        train_loss = 0.0
        for index, batch in enumerate(train_data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            targets = batch['targets'].to(device)
            output = model(input_ids, attention_mask, token_type_ids)
            optimizer.zero_grad()
            loss = loss_fn(output, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += (1/ (index + 1)) * (loss.item() - train_loss)

        # validation phase
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for index, batch in enumerate(validation_data_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                targets = batch['targets'].to(device)
                outputs = model(input_ids, attention_mask, token_type_ids)
                loss = loss_fn(outputs, targets)
                valid_loss += (1/ (index + 1)) * (loss.item() - valid_loss)

        # checkpoint dictionary
        checkpoint = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'valid_loss_min': valid_loss_min
        }

        save_checkpoint(checkpoint, valid_loss < valid_loss_min, checkpoint_path, best_model_path)
    
    return model

In [97]:
trained_model = train_model(
    n_epochs=2,
    train_data_loader=train_data_loader,
    validation_data_loader=validation_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path="./checkpoint.pth",
    best_model_path="./best_model.pth"
)

In [99]:
example_text = "Me sinto feliz!"

encodings = tokenizer.encode_plus(
    example_text,
    add_special_tokens=True,
    return_tensors="pt",
    padding="max_length",
    max_length=128,
    truncation=True
)

model.eval()
with torch.no_grad():
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    token_type_ids = encodings['token_type_ids'].to(device)
    output = model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    print(f"Output for '{example_text}': {final_output}")


Output for 'Me sinto feliz!': [[0.31508126854896545, 0.024211129173636436, 0.02929740957915783, 0.3021167814731598, 0.13009771704673767, 0.06569305062294006]]
