In [30]:
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re

from tqdm import tqdm


np.random.seed(42)

# pre processamento dos dados

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
path = '/content/drive/MyDrive/B2W-Reviews01.csv'

In [4]:
#df = pd.read_csv('../data/B2W-Reviews01.csv')
df = pd.read_csv(path)

  df = pd.read_csv(path)


In [5]:
cols = ['review_text']
df = df[cols]
df.rename(columns={'review_text': 'texto'}, inplace=True)

In [6]:
df = df.dropna()

# tarefa 1 - regressao na quantidade de vogais

In [7]:
def calcular_densidade_vogais(texto):
    texto = re.sub(r'[^A-Za-zÇçÃãÁáÉéÍíÓóÚúÀàÊêÔô]', '', texto)  # Remover caracteres não-alfabéticos
    total_letras = len(texto)
    total_vogais = len(re.findall(r'[AEIOUaeiouÁÉÍÓÚáéíóúÀàÃãÊêÔô]', texto))
    return total_vogais / total_letras if total_letras > 0 else 0

In [8]:
df['densidades'] = df['texto'].apply(calcular_densidade_vogais)

In [9]:
# 10 000  sentencas para aumentar a velocidade de treinamento
# TODO: Taking the first samples may induce bias
df = df[:10000]
print(len(df))

10000


In [10]:
from sklearn.model_selection import train_test_split

# Separar o conjunto de dados em treino, teste e validacao
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df  = train_test_split(train_df, test_size=0.25, random_state=42)

print(len(train_df), 'train examples')
print(len(val_df), 'validation examples')
print(len(test_df), 'test examples')

6000 train examples
2000 validation examples
2000 test examples


In [11]:
#train_df.head(2)
len(train_df)

6000

## BERTimbau fine tunning

In [12]:
!pip install transformers torch




In [13]:
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

# Tokenizer e Modelo do BERTimbau
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
model = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Classe do Dataset
class TextoDataset(Dataset):
    def __init__(self, textos, densidades):
        self.textos = textos
        self.densidades = densidades

    def __len__(self):
        return len(self.textos)

    def __getitem__(self, idx):
        texto = self.textos[idx]
        densidade = self.densidades[idx]
        inputs = tokenizer(texto, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        return inputs, torch.tensor(densidade, dtype=torch.float)

# Criar instâncias do TextoDataset
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_dataset = TextoDataset(train_df['texto'], train_df['densidades'])
validate_dataset = TextoDataset(val_df['texto'], val_df['densidades'])
test_dataset = TextoDataset(test_df['texto'], test_df['densidades'])

# Criar DataLoaders
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validate_loader = DataLoader(validate_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [14]:
class BertForRegression(nn.Module):
    def __init__(self, bert_model):
        super(BertForRegression, self).__init__()
        self.bert = bert_model
        self.regressor = nn.Linear(768, 1)  # 768 é a dimensão do vetor de características do BERT

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.regressor(pooled_output)

model = BertForRegression(model)


In [15]:
device = torch.device("cuda")
model.to(device)

# Função de perda e otimizador
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [16]:
from tqdm import tqdm

def train_model(model, criterion, optimizer, train_loader, validate_loader, epochs=3):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        # Train loop with tqdm
        train_loop = tqdm(train_loader, leave=True)
        for inputs, labels in train_loop:
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = labels.to(device)

            input_ids = input_ids.view(-1, input_ids.size(-1))
            attention_mask = attention_mask.view(-1, attention_mask.size(-1))

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            outputs = outputs.view(-1, 1)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            # Update tqdm description with current loss
            train_loop.set_description(f'Epoch {epoch+1}')
            train_loop.set_postfix(loss=loss.item())

        model.eval()
        val_loss = 0.0
        print('\n Validating...')

        # Validation loop with tqdm
        validate_loop = tqdm(validate_loader, leave=True)
        with torch.no_grad():
            for inputs, labels in validate_loop:
                input_ids = inputs['input_ids'].to(device)
                attention_mask = inputs['attention_mask'].to(device)
                labels = labels.to(device)

                input_ids = input_ids.view(-1, input_ids.size(-1))
                attention_mask = attention_mask.view(-1, attention_mask.size(-1))

                outputs = model(input_ids, attention_mask)
                outputs = outputs.view(-1, 1)

                # Colocar ideia aqui
                loss = criterion(outputs, labels)
                val_loss += loss.item()


                validate_loop.set_postfix(val_loss=loss.item())

        print(f"Epoch {epoch+1}, Training Loss: {running_loss/len(train_loader)}, Validation Loss: {val_loss/len(validate_loader)}")


In [17]:
train_model(model, criterion, optimizer, train_loader, validate_loader, epochs=3)

  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 1: 100%|██████████| 375/375 [02:56<00:00,  2.13it/s, loss=0.00103]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.03it/s, val_loss=0.00131]


Epoch 1, Training Loss: 0.00398437637106205, Validation Loss: 0.001700983684277162


Epoch 2: 100%|██████████| 375/375 [02:55<00:00,  2.14it/s, loss=0.00923]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.00it/s, val_loss=0.00121]


Epoch 2, Training Loss: 0.001985398019508769, Validation Loss: 0.0015744098897557705


Epoch 3: 100%|██████████| 375/375 [02:55<00:00,  2.14it/s, loss=0.000478]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.00it/s, val_loss=0.0017]

Epoch 3, Training Loss: 0.0017883167186907181, Validation Loss: 0.001958635548595339





In [18]:
# Saving the entire model
torch.save(model, 'reg_model.pth')

In [73]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr
import numpy as np

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def mae(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

def mape(y_true, y_pred):
    y_true += 1e-5 # Numeric problems
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def pearson_correlation(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [74]:
def eval_test(model, test_loader):
    y_pred_model = []
    y_true       = []

    # Evaluating the model of density of vowels per sentence on the test set
    model.eval()
    with torch.no_grad():
        for inputs, labels in test_loader:
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = labels.to(device)

            input_ids = input_ids.view(-1, input_ids.size(-1))
            attention_mask = attention_mask.view(-1, attention_mask.size(-1))

            outputs = model(input_ids, attention_mask)
            outputs = outputs.view(-1, 1)

            y_pred_model.extend(outputs.view(-1).cpu().numpy())
            y_true.extend(labels.cpu().numpy())

    # Convert predictions and true values to numpy arrays
    y_pred_model = np.array(y_pred_model)
    y_true = np.array(y_true)
    return y_true, y_pred_model

In [75]:
# Evaluating the model of density of vowels per sentence on the test set
y_true_sentence, y_pred_model_sentence = eval_test(model, test_loader)

In [76]:
# Evaluating the model of density of vowels on the global density of the test
# Use the train dataset to prevent leakage
global_density = list(train_df.texto.values)
separator = ' '
global_density = calcular_densidade_vogais(separator.join(global_density))

test_df2 = test_df.copy()
test_df2 = test_df2.reset_index(drop=True)
test_df2['densidades'] = global_density

test_dataset2 = TextoDataset(test_df2['texto'], test_df2['densidades'])
test_loader2 = DataLoader(test_dataset2, batch_size=BATCH_SIZE)

y_true_global, y_pred_model_global = eval_test(model, test_loader2)

In [77]:
# Evaluating the model of density of vowels on the first word density
def get_first_word_density(text):
    list_word = text.split(' ')
    word = list_word[0]
    return calcular_densidade_vogais(word)

first_density = list(test_df['texto'].apply(lambda x : get_first_word_density(x)).values)

test_df3 = test_df.copy()
test_df3 = test_df3.reset_index(drop=True)
test_df3['densidades'] = first_density

test_dataset3 = TextoDataset(test_df3['texto'], test_df3['densidades'])
test_loader3 = DataLoader(test_dataset3, batch_size=BATCH_SIZE)

y_true_first, y_pred_model_first = eval_test(model, test_loader3)

In [78]:
# Evaluating the model of density of vowels on the last word density
def get_last_word_density(text):
    list_word = text.split(' ')
    word = list_word[len(list_word)-1]
    return calcular_densidade_vogais(word)

last_density = list(test_df['texto'].apply(lambda x : get_first_word_density(x)).values)

test_df4 = test_df.copy()
test_df4 = test_df4.reset_index(drop=True)
test_df4['densidades'] = last_density

test_dataset4 = TextoDataset(test_df4['texto'], test_df4['densidades'])
test_loader4 = DataLoader(test_dataset4, batch_size=BATCH_SIZE)

y_true_last, y_pred_model_last = eval_test(model, test_loader4)


In [79]:
def get_metrics(y_true, y_pred_model):
    return {
        "RMSE": rmse(y_true, y_pred_model),
        "MAE": mae(y_true, y_pred_model),
        "MAPE": mape(y_true, y_pred_model),
        "R2": r2_score(y_true, y_pred_model),
        "Pearson": pearson_correlation(y_true, y_pred_model)
    }

In [80]:
metrics_dict_first = get_metrics(y_true_first, y_pred_model_first)
metrics_dict_last = get_metrics(y_true_last, y_pred_model_last)
metrics_dict_sentence = get_metrics(y_true_sentence, y_pred_model_sentence)
metrics_dict_global = get_metrics(y_true_global, y_pred_model_global)

metrics_df = pd.DataFrame([metrics_dict_first,
                           metrics_dict_last,
                           metrics_dict_global,
                           metrics_dict_sentence], index=
                            ['Using only the first word',
                             'Using only the last word',
                             'Using the global density',
                             'Using the density per sentence'])
metrics_df



Unnamed: 0,RMSE,MAE,MAPE,R2,Pearson
Using only the first word,0.242824,0.163226,46250.650024,-0.2881663,0.024313
Using only the last word,0.242824,0.163226,46250.650024,-0.2881663,0.024313
Using the global density,0.018572,0.018131,3.759262,-24297190000.0,
Using the density per sentence,0.046373,0.029782,6907.958221,-0.1634248,0.13449


# Classification problem

In [31]:
def classify_vowel_density(density):
    if density < 1/3:
        return 0  # Class 1
    elif 1/3 <= density <= 2/3:
        return 1  # Class 2
    else:
        return 2  # Class 3

# Applying the classification function to the 'densidades' column
df['labels'] = df['densidades'].apply(classify_vowel_density)


In [32]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df  = train_test_split(train_df, test_size=0.25, random_state=42)

print(len(train_df), 'train examples')
print(len(val_df), 'validation examples')
print(len(test_df), 'test examples')

6000 train examples
2000 validation examples
2000 test examples


In [33]:
# Classe do Dataset
class TextoDataset(Dataset):
    def __init__(self, textos, labels):
        self.textos = textos
        self.labels = labels

    def __len__(self):
        return len(self.textos)

    def __getitem__(self, idx):
        texto = self.textos[idx]
        label = self.labels[idx]
        inputs = tokenizer(texto, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        input_ids = inputs['input_ids'].squeeze(0)  # Remove batch dimension
        attention_mask = inputs['attention_mask'].squeeze(0)  # Remove batch dimension
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.long)

# Preparando DataFrames (assumindo que você já os tem divididos em treino, validação e teste)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Substitua 'densidades' por uma coluna apropriada de etiquetas de classe
train_dataset = TextoDataset(train_df['texto'], train_df['labels'])
validate_dataset = TextoDataset(val_df['texto'], val_df['labels'])
test_dataset = TextoDataset(test_df['texto'], test_df['labels'])

# Criar DataLoaders
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validate_loader = DataLoader(validate_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


## Defining and training the model

## Unbalanced case

In [34]:
class BertForClassification(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BertForClassification, self).__init__()
        self.bert = bert_model
        # Assuming 768-dimensional feature vectors from BERT, with num_classes outputs
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

# Example usage
bert_model = BertModel.from_pretrained('bert-base-uncased')  # Use a pre-trained BERT model
model = BertForClassification(bert_model, num_classes=3)  # Specify the number of classes

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimizer for classification (using CrossEntropyLoss for multi-class)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [38]:
def train_model(model, criterion, optimizer, train_loader, validate_loader, num_classes, epochs=3):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        # Train loop with tqdm
        train_loop = tqdm(train_loader, leave=True)
        for input_ids, attention_mask, labels in train_loop:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)

            # For classification, outputs are likely to be logits, need to reshape for CrossEntropyLoss
            outputs = outputs.view(-1, num_classes)  # num_classes is the number of classes in your classification task

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Update tqdm description with current loss
        train_loop.set_description(f'Epoch {epoch+1}')
        train_loop.set_postfix(loss=loss.item())

        model.eval()
        val_loss = 0.0
        print('\n Validating...')

        # Validation loop with tqdm
        validate_loop = tqdm(validate_loader, leave=True)
        with torch.no_grad():
            for input_ids, attention_mask, labels in validate_loop:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                outputs = model(input_ids, attention_mask)
                outputs = outputs.view(-1, num_classes)  # Adjusted to match the number of classes

                loss = criterion(outputs, labels)
                val_loss += loss.item()

                validate_loop.set_postfix(val_loss=loss.item())

        print(f"Epoch {epoch+1}, Training Loss: {running_loss/len(train_loader)}, Validation Loss: {val_loss/len(validate_loader)}")

In [39]:
train_model(model, criterion, optimizer, train_loader, validate_loader, epochs=3, num_classes=3)

100%|██████████| 375/375 [02:53<00:00,  2.16it/s]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.01it/s, val_loss=0.000997]


Epoch 1, Training Loss: 0.010754324772395194, Validation Loss: 0.01702773686358705


100%|██████████| 375/375 [02:54<00:00,  2.15it/s]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.03it/s, val_loss=0.0143]


Epoch 2, Training Loss: 0.0077529994996730234, Validation Loss: 0.021387031000107526


100%|██████████| 375/375 [02:54<00:00,  2.15it/s]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.01it/s, val_loss=0.00117]


Epoch 3, Training Loss: 0.005964060601700718, Validation Loss: 0.014728623283910565


100%|██████████| 375/375 [02:53<00:00,  2.16it/s]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.03it/s, val_loss=0.000512]


Epoch 1, Training Loss: 0.003872952530509792, Validation Loss: 0.01952215628011618


100%|██████████| 375/375 [02:53<00:00,  2.16it/s]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.04it/s, val_loss=0.00055]


Epoch 2, Training Loss: 0.0028478361713932827, Validation Loss: 0.018527215126494412


100%|██████████| 375/375 [02:53<00:00,  2.16it/s]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.04it/s, val_loss=0.000111]


Epoch 3, Training Loss: 0.002267196642506557, Validation Loss: 0.020303341994353106


100%|██████████| 375/375 [02:53<00:00,  2.16it/s]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.04it/s, val_loss=7.81e-5]


Epoch 1, Training Loss: 0.002039365710147346, Validation Loss: 0.01999900148474262


100%|██████████| 375/375 [02:53<00:00,  2.16it/s]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.09it/s, val_loss=4.98e-5]


Epoch 2, Training Loss: 0.0016443868269755816, Validation Loss: 0.022644582486653235


100%|██████████| 375/375 [02:53<00:00,  2.17it/s]



 Validating...


100%|██████████| 125/125 [00:20<00:00,  6.09it/s, val_loss=6.19e-5]

Epoch 3, Training Loss: 0.0018175426693827223, Validation Loss: 0.025689835368015338





In [40]:
# Saving the entire model
torch.save(model, 'classification_model.pth')

In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score

def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return np.array(all_preds), np.array(all_labels)

# Get predictions and true labels
preds, labels = evaluate_model(model, test_loader, device)

# Total accuracy
total_accuracy = accuracy_score(labels, preds)

# Confusion matrix
conf_matrix = confusion_matrix(labels, preds)
per_class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)
sensitivity = np.diag(conf_matrix) / np.sum(conf_matrix, axis=1)
specificity = np.diag(conf_matrix) / np.sum(conf_matrix, axis=0)

# Output the metrics
print("Total Accuracy:", total_accuracy)
print("Per Class Accuracy:", per_class_accuracy)
print("Sensitivity (Recall) per Class:", sensitivity)
print("Specificity per Class:", specificity)


Total Accuracy: 0.998
Per Class Accuracy: [0.81818182 0.99899346 1.        ]
Sensitivity (Recall) per Class: [0.81818182 0.99899346 1.        ]
Specificity per Class: [0.81818182 0.99899346 1.        ]


## Balanced case

In [None]:
"W"