# Modelo BERT

In [None]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch import nn

from transformers import BertForSequenceClassification, BertTokenizer

from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange

import os
import shutil

In [None]:
# Show the current working directory; the relative paths used further down
# ('../input/...' and './models/...') are resolved against it.
os.getcwd()

In [None]:
# Select the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Bare expression so the notebook cell displays the selected device.
device

In [None]:
class Finances_Dataset(Dataset):
    """Map-style dataset turning rows of a DataFrame into BERT inputs.

    Each row must provide the text in the ``'bio'`` column. When the six
    label columns listed in ``LABEL_COLUMNS`` are present they are returned
    as a float32 target vector; otherwise a scalar ``0.0`` placeholder is
    returned so the dataset can also wrap unlabeled data.

    Args:
        data: DataFrame holding the ``'bio'`` column (and optionally labels).
        maxlen: Fixed sequence length every sample is padded/truncated to.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)`` (e.g. a ``BertTokenizer``).
    """

    LABEL_COLUMNS = ('F', 'D/C', 'A/C/F', 'L', 'P', 'Ac')

    def __init__(self, data, maxlen, tokenizer):
        # Re-number rows 0..n-1 so positional indices from a sampler can be
        # used with ``.loc`` even when ``data`` is a shuffled split.
        self.df = data.reset_index()
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, target)`` for row ``index``.

        ``input_ids`` and ``attention_mask`` are 1-D long tensors of length
        ``maxlen``; ``target`` is a float32 tensor (six values, or a scalar
        0.0 when the label columns are missing).
        """
        excerpt = self.df.loc[index, 'bio']
        try:
            # Explicit NumPy conversion: handing a label-indexed Series to
            # torch.tensor relies on positional integer indexing of the Series.
            target = self.df.loc[index, list(self.LABEL_COLUMNS)].to_numpy(dtype=np.float32)
        except KeyError:
            # Label columns are absent (unlabeled data): use a placeholder.
            target = 0.0

        tokens = ['[CLS]'] + self.tokenizer.tokenize(excerpt) + ['[SEP]']
        if len(tokens) > self.maxlen:
            # Truncate but keep the closing [SEP] as the final token.
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']
        real_length = len(tokens)
        tokens = tokens + ['[PAD]'] * (self.maxlen - real_length)

        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # Build the mask from the number of real tokens instead of testing
        # ``input_ids != 0``: that test is only correct when the vocabulary
        # assigns id 0 to [PAD], which is not guaranteed for every tokenizer.
        attention_mask = torch.zeros(self.maxlen, dtype=torch.long)
        attention_mask[:real_length] = 1

        target = torch.tensor(target, dtype=torch.float32)

        return input_ids, attention_mask, target

In [None]:
def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
    """Train ``model`` for ``epochs`` epochs and log the loss of each epoch.

    After every epoch the mean training loss (averaged over batches) and the
    validation loss returned by ``evaluate`` are printed. Once all epochs are
    done, both series are written to ``epoch_metrics_bert.csv`` in the
    current working directory (one row per epoch, no index column).

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``logits`` attribute.
        criterion: Loss callable applied to ``(logits, target)``.
        optimizer: Optimizer stepping the trainable parameters of ``model``.
        train_loader: Iterable of ``(input_ids, attention_mask, target)``
            batches that also supports ``len()``.
        val_loader: Iterable of batches passed to ``evaluate``.
        epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to before the forward pass.

    Raises:
        ZeroDivisionError: If ``train_loader`` has length zero.
    """
    history = {}
    num_batches = len(train_loader)
    for epoch in trange(epochs, desc="Epoch"):
        model.train()
        running_loss = 0
        for input_ids, attention_mask, target in train_loader:
            optimizer.zero_grad()

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            output = model(input_ids=input_ids, attention_mask=attention_mask)

            # Cast the target to the dtype/device of the logits before the loss.
            loss = criterion(output.logits, target.type_as(output.logits))
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        epoch_train_loss = running_loss / num_batches
        print(f"Perdida entrenamiento: {epoch_train_loss}")
        val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
        print(f"Epoca {epoch} completada. Perdida validacion: {val_loss}")
        history[epoch] = [epoch_train_loss, val_loss]

    pd.DataFrame.from_dict(history, orient='index', columns=['train_loss', 'valid_loss']).to_csv('epoch_metrics_bert.csv', index=False)

In [None]:
def evaluate(model, criterion, dataloader, device):
    """Return the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to evaluation mode and gradients are disabled for
    the whole pass. The result is the average of the batch losses (each
    batch counts equally, regardless of its size).

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``logits`` attribute.
        criterion: Loss callable applied to ``(logits, target)``.
        dataloader: Iterable of ``(input_ids, attention_mask, target)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The mean loss as a Python float.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss, num_batches = 0, 0

    with torch.no_grad():
        for input_ids, attention_mask, target in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            # Keyword arguments, matching how the training loop calls the model.
            output = model(input_ids=input_ids, attention_mask=attention_mask)

            total_loss += criterion(output.logits, target.type_as(output.logits)).item()
            num_batches += 1

    return total_loss / num_batches

In [None]:
def predict(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect the predicted logits.

    The model is put in evaluation mode first so that layers such as
    Dropout and BatchNorm behave deterministically during inference; the
    forward passes run with gradients disabled.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``logits`` attribute.
        dataloader: Iterable of ``(input_ids, attention_mask, target)``
            batches; the target element is ignored.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        A list with one logits tensor per sample (the rows of each batch's
        logits, in iteration order), left on the device the model produced
        them on.
    """
    model.eval()
    predicted_label = []
    with torch.no_grad():
        for input_ids, attention_mask, _ in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            output = model(input_ids=input_ids, attention_mask=attention_mask)

            # Iterating a 2-D logits tensor yields one row per sample.
            predicted_label.extend(output.logits)

    return predicted_label

In [None]:
def _as_float_matrix(values):
    """Convert a tensor, a sequence of tensors or an array-like to float64.

    One-dimensional input is reshaped to a single column so that ``(n,)``
    and ``(n, 1)`` inputs compare as the same shape.
    """
    if isinstance(values, torch.Tensor):
        values = values.detach().cpu().numpy()
    elif (isinstance(values, (list, tuple)) and values
          and all(isinstance(v, torch.Tensor) for v in values)):
        values = torch.stack([v.detach().cpu() for v in values]).numpy()
    array = np.asarray(values, dtype=np.float64)
    if array.ndim == 1:
        array = array.reshape(-1, 1)
    return array


def get_rmse(output, target):
    """Return the root mean squared error between ``output`` and ``target``.

    The previous implementation applied ``torch.sqrt`` to the NumPy scalar
    returned by scikit-learn, which raises ``TypeError``; the error is now
    computed with NumPy directly (mean over all samples and outputs).

    Args:
        output: Predictions; a tensor, a list of per-sample tensors (as
            returned by ``predict``) or any array-like of numbers.
        target: Reference values with the same shape as ``output``.

    Returns:
        The RMSE as a Python float.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    predictions = _as_float_matrix(output)
    references = _as_float_matrix(target)
    if predictions.shape != references.shape:
        raise ValueError(
            f"output and target shapes differ: {predictions.shape} vs {references.shape}"
        )
    if predictions.size == 0:
        raise ValueError("cannot compute RMSE of empty inputs")
    err = float(np.sqrt(np.mean((references - predictions) ** 2)))
    return err

In [None]:
# Load the hand-labelled CSV and coerce the six label columns to numeric
# dtypes. pd.to_numeric is applied column by column with its default error
# handling, so a value that cannot be parsed raises here.
df_reg = pd.read_csv('../input/es-wiki/Etiquetado_Mano1.csv')
df_reg[['F', 'D/C', 'A/C/F', 'L', 'P', 'Ac']] = df_reg[['F', 'D/C', 'A/C/F', 'L', 'P', 'Ac']].apply(pd.to_numeric)

In [None]:
# Hold out 30% of the labelled rows as a test split (fixed seed).
# NOTE(review): `train_df`, `test_df` and `df` are not referenced again in the
# visible part of this file; a later cell repeats the same split under
# different names — confirm whether this cell is still needed.
train_df, test_df = train_test_split(df_reg, test_size=0.3, random_state=21)

df = train_df

In [None]:
# Load the pretrained Spanish BERT checkpoint with a 6-output sequence head.
# 'regression' is one of the problem types documented by Transformers
# ('multi_label_regression' is not). The training loop in this notebook never
# passes `labels` to the model and applies its own criterion to the logits.
model = BertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased',
    problem_type='regression',
    num_labels=6,
)

In [None]:
# Replace the default classification head with a small MLP:
# 768 -> 50 -> 6, with batch normalisation and dropout before each linear
# layer. The final Sigmoid bounds every output to (0, 1).
# NOTE(review): presumably the six label columns are scaled to [0, 1] — confirm.
model.classifier = nn.Sequential(nn.BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                                nn.Dropout(p=0.2, inplace=False),
                                nn.Linear(in_features=768, out_features=50, bias=False),
                                nn.ReLU(inplace=True),
                                nn.BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                                nn.Dropout(p=0.1, inplace=False),
                                nn.Linear(in_features=50, out_features=6, bias=False),
                                nn.Sigmoid()
                                )

# Restore previously saved weights onto the selected device. strict=False
# tolerates keys that are missing from or unexpected in the checkpoint.
# NOTE(review): this is the same path the last cell of the notebook writes to,
# so the file must already exist for this line to succeed.
state_dict = torch.load('./models/beto_fin_state_dict.pt', map_location=torch.device(device))
model.load_state_dict(state_dict, strict=False)

# Freeze the BERT encoder so only the replacement head receives gradients.
for param in model.bert.parameters():
    param.requires_grad = False

model = model.to(device)

# Mean squared error between the head outputs and the label vector.
criterion = nn.MSELoss()

# All parameters are handed to the optimizer; the frozen encoder parameters
# have requires_grad=False (set above).
optimizer = optim.AdamW(params=model.parameters(), lr=1e-3, eps=1e-8)

# Tokenizer matching the pretrained checkpoint loaded above (lower-cased input).
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', do_lower_case=True)

In [None]:
# Split the labelled data: 30% is held out for testing, then 20% of the
# remaining rows become the validation split. The same seed is used for both
# calls so the partition is reproducible.
train_data, test_data = train_test_split(df_reg, test_size=0.3, random_state=21)
train_data, validation = train_test_split(train_data, test_size=0.2, random_state=21)
# Display the three split shapes as the cell output.
train_data.shape, validation.shape, test_data.shape # sums to 1027

In [None]:
# Wrap each split in a dataset that tokenizes on the fly and pads/truncates
# every sample to 512 tokens.
train_set = Finances_Dataset(data=train_data, maxlen=512, tokenizer=tokenizer)
valid_set = Finances_Dataset(data=validation, maxlen=512, tokenizer=tokenizer)
test_set = Finances_Dataset(data=test_data, maxlen=512, tokenizer=tokenizer)

# Only the training loader is shuffled. The validation loss is an average of
# per-batch losses, so shuffling the validation data would change which
# samples land in the (possibly smaller) last batch and make the reported
# loss vary from run to run.
train_loader = DataLoader(dataset=train_set, batch_size=8, num_workers=1, shuffle=True)
valid_loader = DataLoader(dataset=valid_set, batch_size=8, num_workers=1, shuffle=False)
test_loader = DataLoader(dataset=test_set, batch_size=8, num_workers=1)

In [None]:
# Run 10 training epochs on the selected device, validating after each one.
train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=valid_loader,
    epochs=10,
    device=device,
)

In [None]:
# Persist the model weights (state dict only) to the path the setup cell
# loads from; any existing file at that path is overwritten.
torch.save(model.state_dict(), './models/beto_fin_state_dict.pt')