# Finetuning BERTimbau for regression

Código baseado no tutorial do Medium sobre finetuning de BERT para regressão, encontrado 
<a href="https://medium.com/ilb-labs-publications/fine-tuning-bert-for-a-regression-task-is-a-description-enough-to-predict-a-propertys-list-price-cf97cd7cb98a">aqui</a> 

## Installing dependencies

In [None]:
!pip install transformers

## Setting up variables

In [None]:
# Location of the tweets CSV read by pd.read_csv below (machine-specific absolute path).
DATA_PATH = r'/home/allan_m_ufms_br/tweets.csv'
# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the model.
PRETRAINED_MODEL = 'neuralmind/bert-base-portuguese-cased'

In [None]:
# Batch size handed to every DataLoader built by create_dataloaders.
BATCH_SIZE = 8
# Number of epochs used to size the learning-rate schedule (total_steps).
NUM_EPOCHS = 10

## Dataset

### Importing data

In [None]:
import pandas as pd

# Column names imposed on the CSV; with header=0 the file's own header row is
# consumed and replaced by these names.
cols = ["Datetime","Text","Likes","Retweets"]
data = pd.read_csv(
    DATA_PATH,
    header=0,
    names=cols,
    engine="python",
    encoding="utf-8",
    index_col = False
)

### Creating "Engagement" metric

The engagement metric was created to simplify the problem by having a single value that represents both likes and retweets.

Dividing the average number of likes by the average number of retweets shows that, on average, a post gets eight times more likes than retweets.

Since reach is far more important than likes for a business, we decided that engagement would be the number of likes plus eight times the number of retweets.

In [None]:
# Discard rows with any missing value so the integer casts below cannot fail on NaN.
data = data.dropna()


# Engagement = likes + 8 * retweets (retweets are weighted up, see the text above).
data["Engagement"] = data.Likes.astype(int) + (8 * data.Retweets.astype(int))


# Only "Text" and "Engagement" are kept from here on.
data.drop(["Datetime","Likes","Retweets"],
          axis=1,
          inplace=True)

### Cleaning up data and tokenizing

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching PRETRAINED_MODEL; do_lower_case=False asks it not to lowercase its input.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL, do_lower_case=False)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

# NLTK resources fetched for clean_tweet: the stop-word lists, the punkt
# tokenizer models and the RSLP (Portuguese) stemmer data.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

# Regular expressions are compiled once here instead of on every call.
_URL_PATTERN = re.compile(r'http(\S)+')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)

# Filled on first use so the NLTK corpora are only touched once, and only after
# the nltk.download() calls have run.
_CLEAN_TWEET_RESOURCES = {}


def _get_clean_tweet_resources():
    """Return the cached (stop_words, stemmer) pair, building it on first use."""
    if not _CLEAN_TWEET_RESOURCES:
        _CLEAN_TWEET_RESOURCES["stop_words"] = frozenset(stopwords.words('portuguese'))
        _CLEAN_TWEET_RESOURCES["stemmer"] = RSLPStemmer()
    return _CLEAN_TWEET_RESOURCES["stop_words"], _CLEAN_TWEET_RESOURCES["stemmer"]


def clean_tweet(tweet):
    """Normalize a raw tweet and return it as a list of tokenizer tokens.

    Steps, in order: strip links, strip every character that is neither a word
    character nor whitespace, lowercase, strip the code-point ranges listed in
    the emoji pattern, drop Portuguese stop words, stem each remaining word
    with RSLP, and finally run the module-level ``tokenizer`` on the result.

    Args:
        tweet: The tweet text as a ``str``.

    Returns:
        The list of tokens produced by ``tokenizer.tokenize`` for the cleaned text.
    """
    stop_words, stemmer = _get_clean_tweet_resources()

    # remove links
    tweet = _URL_PATTERN.sub('', tweet)
    # remove punctuation
    tweet = _PUNCTUATION_PATTERN.sub('', tweet)
    # lowercase
    # NOTE(review): the checkpoint name says "cased" - confirm lowercasing is intended.
    tweet = tweet.lower()
    tweet = _EMOJI_PATTERN.sub(r'', tweet)
    # drop Portuguese stop words, then stem what is left
    words = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(tweet)
        if word not in stop_words
    ]
    # join the words back together and tokenize for the model
    return tokenizer.tokenize(' '.join(words))

In [None]:
# Work on a copy so `data` keeps the raw text; each tweet is cast to str before
# cleaning, and the "Text" column ends up holding token lists.
data_clean = data.copy()
data_clean.Text = [clean_tweet(str(tweet)) for tweet in data.Text]

### Normalizing the engagement metric

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardize the engagement values; the fitted scaler is kept so predictions
# can later be mapped back with engagement_scaler.inverse_transform.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
engagement_scaler = StandardScaler()
# StandardScaler expects a 2-D (n_samples, 1) column.
data_labels = data_clean['Engagement'].to_numpy().reshape(-1, 1)

norm_labels = engagement_scaler.fit_transform(data_labels)

### Splitting data into train, test and validation

First the data is shuffled

In [None]:
TWEETS_USED_FOR_TRAINING = 100_000

# Draw distinct row positions in random order. A permutation samples WITHOUT
# replacement, so no tweet appears twice and the same tweet cannot end up in
# both the training and the evaluation splits. The slice also copes with
# datasets smaller than TWEETS_USED_FOR_TRAINING (every row is then used once).
shuffle = np.random.permutation(len(data_clean['Text']))[:TWEETS_USED_FOR_TRAINING]

y_shuffled = np.array(norm_labels)[shuffle]
x_shuffled = np.array(data_clean['Text'])[shuffle]

Then it's separated into three parts

In [None]:
import sklearn.model_selection as model_selection

# Hold out 30% of the sampled tweets as the test set.
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(x_shuffled, y_shuffled, test_size=0.30, random_state=42,shuffle=True)
# Split 30% of the remaining training data off as the validation set
# (overall roughly 49% train / 21% validation / 30% test).
xtrain, xval, ytrain, yval = model_selection.train_test_split(xtrain, ytrain, test_size=0.30, random_state=42,shuffle=True)

In [None]:
def _encode_token_lists(token_lists):
    """Encode the token lists produced by clean_tweet into padded model inputs.

    clean_tweet already returns WordPiece tokens. Feeding them back with
    is_split_into_words=True would tokenize every piece a second time (a
    continuation piece such as "##xyz" is split at the "#" characters), so the
    text is rebuilt from the pieces first and then encoded in a single pass.
    """
    texts = [tokenizer.convert_tokens_to_string(tokens) for tokens in token_lists]
    return tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors='pt')


train_encodings = _encode_token_lists(xtrain.tolist())
test_encodings = _encode_token_lists(xtest.tolist())
val_encodings = _encode_token_lists(xval.tolist())

And lastly the dataloaders are created

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

def create_dataloaders(inputs, masks, labels, batch_size, shuffle=True):
    """Bundle inputs, attention masks and labels into a DataLoader.

    Args:
        inputs: Tensor of token ids; it is cloned, so the caller's tensor is untouched.
        masks: Tensor of attention masks; cloned as well.
        labels: Array-like of targets, converted with ``torch.tensor``.
        batch_size: Number of samples per batch.
        shuffle: Whether batches are drawn in random order. Defaults to True
            (the previous hard-coded behaviour); pass False for evaluation
            loaders whose outputs must stay aligned with the label order.

    Returns:
        A DataLoader yielding ``(inputs, masks, labels)`` batches.
    """
    input_tensor = inputs.clone().detach()
    mask_tensor = masks.clone().detach()
    labels_tensor = torch.tensor(labels)
    dataset = TensorDataset(input_tensor, mask_tensor, labels_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# One loader per split, all with the same batch size.
# NOTE(review): create_dataloaders shuffles by default, so dl_test and dl_val
# are iterated in random order too - keep that in mind when comparing their
# outputs against ytest / yval.
dl_train = create_dataloaders(train_encodings.input_ids, train_encodings.attention_mask, ytrain, BATCH_SIZE)
dl_test = create_dataloaders(test_encodings.input_ids, test_encodings.attention_mask, ytest, BATCH_SIZE)
dl_val = create_dataloaders(val_encodings.input_ids, val_encodings.attention_mask, yval, BATCH_SIZE)

## Training

### Create model

In [None]:
import torch.nn as nn
from typing import Tuple

class BertEngagementRegressor(nn.Module):
    """Regression head stacked on the encoder of a pretrained model.

    The wrapped ``model`` must expose ``.bert`` (the encoder) and ``.config``
    (providing ``hidden_size``). Element 1 of the encoder output is passed
    through Linear(hidden_size, 200) -> Dropout(0.1) -> Linear(200, 1).
    """

    def __init__(self, model):
        """Reuse ``model.bert`` / ``model.config`` and build the regression head."""
        super().__init__()
        self.bert = model.bert
        self.config = model.config
        self.linear = nn.Linear(self.config.hidden_size, 200)
        self.dropout = nn.Dropout(p=0.1)
        self.linear2 = nn.Linear(200, 1)
        # Cast every parameter (encoder included) to float64.
        # NOTE(review): presumably done so outputs match float64 labels - confirm;
        # it makes the encoder heavier than float32.
        self.double()

    def forward(self, input_ids, attention_masks) -> torch.Tensor:
        """Predict one value per sample.

        Args:
            input_ids: Token ids, passed positionally to the encoder.
            attention_masks: Attention masks, passed as the encoder's second argument.

        Returns:
            The head output with all size-1 dimensions squeezed away, so a
            batch of one sample yields a 0-dimensional tensor.
        """
        # Index 1 of the encoder output - assumed to be the pooled
        # representation (as for Hugging Face BertModel); TODO confirm.
        pooled = self.bert(input_ids, attention_masks)[1]
        hidden = self.dropout(self.linear(pooled))
        return self.linear2(hidden).squeeze()

In [None]:
from transformers import AutoModelForPreTraining

# Load the pretrained checkpoint; BertEngagementRegressor only keeps its
# `.bert` encoder and `.config`.
model_base= AutoModelForPreTraining.from_pretrained(PRETRAINED_MODEL)
model = BertEngagementRegressor(model=model_base)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

### Setting up for training

In [None]:
from torch.optim import AdamW

# AdamW over every model parameter (encoder and regression head alike).
optimizer = AdamW(model.parameters(),lr=5e-5,eps=1e-8)

In [None]:
from transformers import get_linear_schedule_with_warmup

# One scheduler step per training batch, for NUM_EPOCHS epochs, with no warm-up.
# NOTE(review): the linear schedule reaches a learning rate of zero after
# total_steps; confirm the training loop does not run longer than NUM_EPOCHS.
total_steps = len(dl_train) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Mean squared error between predicted and (standardized) engagement values.
loss_function = nn.MSELoss()

### Training the model

In [None]:
def evaluate(model, loss_function, test_dataloader, device):
    """Run the model over a dataloader and collect per-batch metrics.

    The model is switched to eval mode and the forward pass runs under
    ``torch.no_grad()``.

    Args:
        model: Callable taking ``(inputs, masks)`` and returning predictions.
        loss_function: Callable taking ``(outputs, labels)`` and returning a tensor.
        test_dataloader: Iterable of ``(inputs, masks, labels)`` batches.
        device: Device every batch tensor is moved to.

    Returns:
        Two lists of floats, one entry per batch: the losses and the R2 scores.
    """
    model.eval()
    batch_losses = []
    batch_r2_scores = []
    for batch in test_dataloader:
        input_ids, attention_masks, targets = (tensor.to(device) for tensor in batch)
        # Drop size-1 dimensions so targets line up with the squeezed model output.
        targets = targets.squeeze()
        with torch.no_grad():
            predictions = model(input_ids, attention_masks)
        batch_losses.append(loss_function(predictions, targets).item())
        batch_r2_scores.append(r2_score(predictions, targets).item())
    return batch_losses, batch_r2_scores

def r2_score(outputs, labels):
    """Coefficient of determination (R2) between predictions and labels.

    Args:
        outputs: Tensor of predictions.
        labels: Tensor of ground-truth values, broadcastable against ``outputs``.

    Returns:
        A 0-dimensional tensor holding ``1 - ss_res / ss_tot``. When every
        label is identical the denominator is zero; instead of returning
        inf/NaN (which would poison any mean taken over batches) the result is
        1.0 for a perfect fit and 0.0 otherwise.
    """
    labels_mean = torch.mean(labels)
    ss_tot = torch.sum((labels - labels_mean) ** 2)
    ss_res = torch.sum((labels - outputs) ** 2)

    # Detect constant labels by direct comparison rather than `ss_tot == 0`,
    # which floating-point rounding in the mean could miss.
    flat_labels = labels.reshape(-1)
    labels_are_constant = flat_labels.numel() > 0 and bool(
        (flat_labels == flat_labels[0]).all()
    )
    if labels_are_constant:
        if bool(ss_res == 0):
            return torch.ones_like(ss_res)
        return torch.zeros_like(ss_res)

    return 1 - ss_res / ss_tot

In [None]:
from torch.nn.utils.clip_grad import clip_grad_norm_

def train(model, optimizer, scheduler, loss_function, train_dataloader, validation_dataloader, device,  clip_value=2, max_epochs=None):
    """Train with early stopping on the validation loss.

    After every epoch the model is evaluated on ``validation_dataloader``.
    Whenever the mean validation loss improves, the state dict and the whole
    model are saved to ``./regressor_state_dict_<epoch>.pth`` and
    ``./regressor_model_<epoch>.pth``. Training stops once more than five
    consecutive epochs bring no improvement, or after ``max_epochs`` epochs
    when that is given.

    Args:
        model: Module called as ``model(inputs, masks)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        loss_function: Callable taking ``(outputs, labels)``.
        train_dataloader: Iterable of ``(inputs, masks, labels)`` training batches.
        validation_dataloader: Iterable of validation batches, passed to ``evaluate``.
        device: Device every batch tensor is moved to.
        clip_value: Maximum gradient norm passed to ``clip_grad_norm_``.
        max_epochs: Optional hard cap on the number of epochs; ``None`` (the
            default) relies on early stopping alone.

    Returns:
        ``(model, loss_train, loss_val, r2_val)`` where each list holds one
        mean value per epoch.
    """
    max_epoch_no_improve = 5
    cur_epoch = 0
    lowest_loss = float("inf")
    epochs_since_best = 0
    loss_train, loss_val, r2_val = [], [], []
    num_batches = len(train_dataloader)

    while True:
        loss_epoch = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            print("epoch:", cur_epoch," - ", step,"/",num_batches)
            batch_inputs, batch_masks, batch_labels = tuple(b.to(device) for b in batch)
            model.zero_grad()
            outputs = model(batch_inputs, batch_masks)
            loss = loss_function(outputs.squeeze(), batch_labels.squeeze())
            loss.backward()
            loss_epoch.append(loss.cpu().item())
            # Clip after backward() and before the optimizer consumes the gradients.
            clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            scheduler.step()

        # Exactly one entry per epoch in each history list, so they stay aligned.
        mean_train_loss = np.mean(loss_epoch)
        loss_train.append(mean_train_loss)

        val_losses, val_r2_scores = evaluate(model, loss_function, validation_dataloader, device)
        mean_val_loss = np.mean(val_losses)
        loss_val.append(mean_val_loss)
        r2_val.append(np.mean(val_r2_scores))

        if mean_val_loss < lowest_loss:
            lowest_loss = mean_val_loss
            epochs_since_best = 0
            print("Best model found! saving...")
            torch.save(model.state_dict(),f'./regressor_state_dict_{cur_epoch}.pth')
            torch.save(model, f'./regressor_model_{cur_epoch}.pth')
        else:
            epochs_since_best += 1

        print("epoch: %d, loss_train: %4.3f, loss_val: %4.3f, last_best: %d"
              % (cur_epoch, mean_train_loss, mean_val_loss, epochs_since_best))
        print("-----")
        cur_epoch += 1

        if epochs_since_best > max_epoch_no_improve:
            break
        if max_epochs is not None and cur_epoch >= max_epochs:
            break

    return model, loss_train, loss_val, r2_val

In [None]:
# Run training; keeps the returned model plus the per-epoch loss / R2 histories.
model, loss_train, loss_val, r2_val = train(model, optimizer, scheduler, loss_function, dl_train, dl_val, device, clip_value=2)

In [None]:
# Persist the model as returned by train().
torch.save(model, "final_regressor_model.pth")
# Append the training history to a text log. file.write() accepts a single
# string (unlike print), so each line is formatted before being written.
with open("loss_values.txt", "a") as file:
    file.write(f"loss_train: {loss_train}\n")
    file.write(f"loss_val: {loss_val}\n")
    file.write(f"r2_val: {r2_val}\n")

## Post-processing

### Plotting losses

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
# Each curve is plotted against its own epoch index (matplotlib's default x),
# so the two histories do not need to have the same length.
ax.plot(loss_train, label='train_loss')
ax.plot(loss_val, label='val_loss')

ax.set_xlabel('Epochs')
ax.set_ylabel('loss')
ax.legend()

plt.show()

### Testing results with test dataset

In [None]:
def predict(model, dataloader, device):
    """Collect the model's predictions for every batch of a dataloader.

    The model is put in eval mode and run under ``torch.no_grad()``.

    Args:
        model: Module called as ``model(inputs, masks)``.
        dataloader: Iterable of ``(inputs, masks, labels)`` batches; labels are ignored.
        device: Device every batch tensor is moved to.

    Returns:
        A flat list of Python floats, in the order the dataloader yields samples.
    """
    model.eval()
    predictions = []
    for batch in dataloader:
        input_ids, attention_masks, _ = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            batch_output = model(input_ids, attention_masks)
        # Flatten so a 0-dimensional output (batch of one) still yields a list.
        predictions.extend(batch_output.reshape(-1).tolist())
    return predictions

In [None]:
# dl_test shuffles its samples (create_dataloaders builds it with shuffle=True
# by default), so predicting through it would return values in a random order
# that no longer lines up with ytest. Iterate the same dataset sequentially.
dl_test_ordered = DataLoader(dl_test.dataset, batch_size=BATCH_SIZE, shuffle=False)
y_pred = predict(model, dl_test_ordered, device)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

# ytest is a column of shape (n, 1) (labels were reshaped with reshape(-1, 1))
# and y_pred is a flat list; flatten both so they compare element by element.
# All metrics are on the standardized engagement scale.
# NOTE(review): the percentage errors divide by standardized labels, which can
# be close to zero - consider engagement_scaler.inverse_transform before
# reporting them.
# NOTE(review): the sklearn r2_score imported above rebinds the name of the
# tensor-based r2_score that evaluate() calls; re-run that cell before
# evaluating or training again.
y_true = np.asarray(ytest).ravel()
y_hat = np.asarray(y_pred).ravel()

mae = mean_absolute_error(y_true, y_hat)
print(f"mean absolute error:{mae}")
mdae = median_absolute_error(y_true, y_hat)
print(f"median_absolute_error:{mdae}")
mse = mean_squared_error(y_true, y_hat)
print(f"mean_squared_error:{mse}")
mape = mean_absolute_percentage_error(y_true, y_hat)
print(f"mean_absolute_percentage_error:{mape}")
# Median of |(true - predicted) / true|.
mdape = np.median(np.abs((y_true - y_hat) / y_true))
print(f"median_absolute_percentage_error:{mdape}")
r2 = r2_score(y_true, y_hat)
print(f"regression_score:{r2}")