# Evaluación de modelos

In [None]:
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from fastai.text.all import *
from fastai.basics import *
from fastai.callback.all import *

from transformers import BertForMaskedLM, BertForSequenceClassification, BertTokenizer

import shap

from sklearn import metrics
from sklearn.model_selection import train_test_split

import os

In [None]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device  # bare expression: the notebook echoes the selected device

## Predicciones sobre modelos de lenguaje

### AWD-LSTM

In [None]:
# Build language-model DataLoaders from the Spanish Wikipedia CSV (column 'text'),
# with batch size 128 and sequence length 72.
# NOTE(review): ColSplitter() is called without arguments, so the CSV presumably
# carries a validation-flag column — confirm against the dataset.
df = pd.read_csv('../wikipedia/es_wiki_dataset.csv')
dls_lm = DataBlock(blocks=TextBlock.from_df('text', is_lm=True),
                    get_x=ColReader('text'),
                    splitter=ColSplitter()).dataloaders(df, bs=128, seq_len=72)

# dls_lm = torch.load('../model/es_wiki_dls.pth')
# Create an untrained AWD-LSTM language-model learner (pretrained=False) whose
# checkpoints are resolved relative to '../model'.
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3, pretrained=False,
    metrics=[accuracy, Perplexity()],
    model_dir='../model').to_fp16()


# Restore the weights saved under the name 'es_wiki'.
learn = learn.load('es_wiki')

In [None]:
# Generate N_SENTENCES continuations of TEXT, each N_WORDS long, sampling with
# temperature 0.75 from the learner currently bound to `learn`.
TEXT = "Mi amigo estudia"
N_WORDS = 10
N_SENTENCES = 1
preds = [learn.predict(TEXT, N_WORDS, temperature=0.75) 
         for _ in range(N_SENTENCES)]
preds  # echoed by the notebook

In [None]:
# Build language-model DataLoaders from the books CSV (column 'text'), with
# batch size 8 and sequence length 512.
df = pd.read_csv('../wikipedia/books_dataset.csv')
dls_finanzas = DataBlock(blocks=TextBlock.from_df('text', is_lm=True),
                    get_x=ColReader('text'),
                    splitter=ColSplitter()).dataloaders(df, bs=8, seq_len=512)

# NOTE(review): `dls_finanzas` is built above but never used; the learner below is
# still constructed from `dls_lm` (the Wikipedia DataLoaders). Confirm which
# vocabulary the 'es_wiki_fin' checkpoint was saved with before changing this.
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3, pretrained=False,
    metrics=[accuracy, Perplexity()],
    model_dir='../model').to_fp16()


# Restore the weights saved under the name 'es_wiki_fin'.
learn = learn.load('es_wiki_fin')

In [None]:
# Same generation experiment as for the 'es_wiki' checkpoint, now run against
# whatever learner is currently bound to `learn`.
TEXT = "Mi amigo estudia"
N_WORDS = 10
N_SENTENCES = 1
preds = [learn.predict(TEXT, N_WORDS, temperature=0.75) 
         for _ in range(N_SENTENCES)]
preds  # echoed by the notebook

### BERT

In [None]:
# Load the tokenizer and the masked-language-model weights for the
# 'dccuchile/bert-base-spanish-wwm-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', do_lower_case=True)
model = BertForMaskedLM.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
# Switch to evaluation mode; the result is bound to `e`, presumably only to keep
# the notebook from printing the module repr.
e = model.eval()

In [None]:
# Print the 10 most likely vocabulary tokens for every [MASK] in `text`,
# using the masked-language model currently bound to `model`.
text = "[CLS] La economía del país es [MASK]. [SEP]"

tokens = tokenizer.tokenize(text)
# Collect every [MASK] position (the loop below is written for several masks,
# but `tokens.index` would only ever find the first one).
masked_indxs = [pos for pos, tok in enumerate(tokens) if tok == '[MASK]']
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])

# Inference only: no autograd graph is needed for a forward pass.
with torch.no_grad():
    predictions = model(tokens_tensor)[0]

for i, midx in enumerate(masked_indxs):
    # Rank the whole vocabulary by score at the masked position, best first.
    idxs = torch.argsort(predictions[0, midx], descending=True)
    predicted_token = tokenizer.convert_ids_to_tokens(idxs[:10])
    print('MASK',i,':',predicted_token)

In [None]:
# Load the saved language-model state dict onto the selected device and
# copy it into `model` (default strict loading: key mismatches raise).
state_dict = torch.load('../input/es-wiki/beto_lm_state_dict_v2.pth', map_location=torch.device(device))
model.load_state_dict(state_dict)

In [None]:
# Repeat the masked-token experiment after the saved state dict has been loaded
# into `model`, so the two sets of predictions can be compared.
text = "[CLS] La economía del país es [MASK]. [SEP]"

tokens = tokenizer.tokenize(text)
# Collect every [MASK] position (the loop below is written for several masks,
# but `tokens.index` would only ever find the first one).
masked_indxs = [pos for pos, tok in enumerate(tokens) if tok == '[MASK]']
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])

# Inference only: no autograd graph is needed for a forward pass.
with torch.no_grad():
    predictions = model(tokens_tensor)[0]

for i, midx in enumerate(masked_indxs):
    # Rank the whole vocabulary by score at the masked position, best first.
    idxs = torch.argsort(predictions[0, midx], descending=True)
    predicted_token = tokenizer.convert_ids_to_tokens(idxs[:10])
    print('MASK',i,':',predicted_token)

## Predicciones sobre modelos de regresión

### AWD-LSTM

In [None]:
# Build the AWD-LSTM text learner with 6 outputs constrained to y_range (0, 1),
# tracked with RMSE, and restore the 'es_wiki_reg' checkpoint from '../model'.
# NOTE(review): `train_dls` is not defined in any cell visible here — it must
# already exist in the notebook session.
learn = text_classifier_learner(train_dls, AWD_LSTM, pretrained=False, drop_mult=0.5,
                                    n_out=6,
                                    y_range=(0, 1), metrics=[rmse],
                                    model_dir='../model').to_fp16()
learn.load('es_wiki_reg')

In [None]:
# Predict on the 'bio' texts of the test split.
# NOTE(review): `test_df` is only created further down (train_test_split cell);
# run that cell first when executing the notebook top to bottom.
test_dl = learn.dls.test_dl(test_df['bio'])
preds, _ = learn.get_preds(dl = test_dl)
# Replace any NaN/inf predictions with finite numbers before computing metrics.
preds = torch.nan_to_num(preds)

In [None]:
# Per-target RMSE, MAE and R² of the AWD-LSTM predictions on the test split.
# The metric functions are reached through the imported `sklearn.metrics`
# module (the bare names were never imported), RMSE is derived as sqrt(MSE) so
# it does not depend on the deprecated `squared=False` flag, and all three
# metrics are gathered in one frame so the notebook displays them together
# instead of only the last expression.
target_cols = ['F', 'D/C', 'A/C/F', 'L', 'P', 'Ac']
lstm_true = test_df[target_cols]
pd.DataFrame(
    {
        'rmse': np.sqrt(metrics.mean_squared_error(lstm_true, preds, multioutput='raw_values')),
        'mae': metrics.mean_absolute_error(lstm_true, preds, multioutput='raw_values'),
        'r2': metrics.r2_score(lstm_true, preds, multioutput='raw_values'),
    },
    index=target_cols,
)

### BERT

In [None]:
class Finances_Dataset(Dataset):
    """Map-style dataset turning rows with a 'bio' text into fixed-length BERT inputs.

    Each item is ``(input_ids, attention_mask, target)``:

    * ``input_ids``: ``maxlen`` token ids, ``[CLS] ... [SEP]`` followed by ``[PAD]``.
    * ``attention_mask``: 1 for real tokens, 0 for padding.
    * ``target``: float32 tensor with the six score columns, or the scalar ``0.0``
      when the frame has no score columns (e.g. raw texts fed in for prediction).

    Args:
        data: DataFrame with a 'bio' column and, optionally, the score columns.
        maxlen: Total sequence length after truncation/padding.
        tokenizer: Object exposing ``tokenize`` and ``convert_tokens_to_ids``.
    """

    # Score columns, in the order they are stacked into the target vector.
    TARGET_COLUMNS = ['F', 'D/C', 'A/C/F', 'L', 'P', 'Ac']

    def __init__(self, data, maxlen, tokenizer):
        # reset_index() gives a 0..n-1 index so DataLoader positions work with .loc.
        self.df = data.reset_index()
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        excerpt = self.df.loc[index, 'bio']
        try:
            # Explicit float32 conversion instead of handing torch a pandas Series.
            target = self.df.loc[index, self.TARGET_COLUMNS].to_numpy(dtype=np.float32)
        except KeyError:
            # Unlabelled data: keep the placeholder the collate step expects.
            target = 0.0

        tokens = ['[CLS]'] + self.tokenizer.tokenize(excerpt) + ['[SEP]']
        if len(tokens) > self.maxlen:
            # Truncate but keep a closing [SEP] in the last slot.
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']
        n_real = len(tokens)
        n_pad = self.maxlen - n_real
        tokens = tokens + ['[PAD]'] * n_pad

        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))
        # Build the mask from token counts. The previous `input_ids != 0` test is
        # only right when the tokenizer's [PAD] id is 0 and would also hide any
        # real token whose id is 0.
        # NOTE(review): if the checkpoint was fine-tuned with the old mask,
        # predictions may shift slightly — confirm against training code.
        attention_mask = torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.long)

        target = torch.tensor(target, dtype=torch.float32)

        return input_ids, attention_mask, target

In [None]:
def predict(model, dataloader, device):
    """Run `model` over `dataloader` and collect one logits tensor per sample.

    Args:
        model: Callable taking ``(input_ids, attention_mask)`` and returning an
            object with a ``logits`` attribute (batch on the first dimension).
        dataloader: Iterable of ``(input_ids, attention_mask, target)`` batches;
            the targets are ignored.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        A list with one tensor per sample, in dataloader order, still on `device`.

    The model's train/eval mode is left untouched; call ``model.eval()`` first.
    """
    predicted_label = []
    with torch.no_grad():
        for input_ids, attention_mask, _target in dataloader:
            output = model(input_ids.to(device), attention_mask.to(device))
            # Iterating a batch tensor yields its rows, so this appends per-sample logits.
            predicted_label.extend(output.logits)

    return predicted_label

In [None]:
# Instantiate the sequence-classification variant of the Spanish BERT checkpoint
# with 6 outputs (this rebinds `model`, replacing the masked-LM model above).
# NOTE(review): 'multi_label_regression' is passed as problem_type — verify it is
# a value the installed transformers version recognises.
model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', problem_type='multi_label_regression', num_labels=6)

In [None]:
# Swap the stock head for a small MLP: BatchNorm -> Dropout -> Linear(768, 50) ->
# ReLU -> BatchNorm -> Dropout -> Linear(50, 6) -> Sigmoid, so each of the six
# outputs is squashed into (0, 1).
model.classifier = nn.Sequential(nn.BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                                nn.Dropout(p=0.2, inplace=False),
                                nn.Linear(in_features=768, out_features=50, bias=False),
                                nn.ReLU(inplace=True),
                                nn.BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                                nn.Dropout(p=0.1, inplace=False),
                                nn.Linear(in_features=50, out_features=6, bias=False),
                                nn.Sigmoid()
                                )

# Load the fine-tuned weights onto the selected device.
# NOTE(review): strict=False does not raise on missing/unexpected keys; inspect
# the value returned by load_state_dict to confirm the head weights were loaded.
state_dict = torch.load('./models/beto_fin_state_dict.pt', map_location=torch.device(device))
model.load_state_dict(state_dict, strict=False)

# for param in model.bert.parameters():
#     param.requires_grad = False

model = model.to(device)

# Loss and optimizer are created here but not used by any cell visible in this notebook.
criterion = nn.MSELoss()

optimizer = optim.AdamW(params=model.parameters(), lr=1e-4, eps=1e-8)

# Rebind the global `tokenizer` to the BERT tokenizer used by Finances_Dataset below.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', do_lower_case=True)

In [None]:
# Put the model in evaluation mode before predicting (the custom head contains
# Dropout and BatchNorm layers, which behave differently in training mode).
model.eval()

In [None]:
# Wrap the test split in the dataset/dataloader pipeline (512 tokens, batches of 8)
# and collect the per-sample outputs. `texto` is rebound at each step.
texto = test_df
texto = Finances_Dataset(data=texto, maxlen=512, tokenizer=tokenizer)
texto = DataLoader(dataset=texto, batch_size=8, num_workers=1)
y_pred = predict(model, texto, device)

# Ground-truth scores, in the same column order the dataset uses for its targets.
y_true = np.array(test_df[['F', 'D/C', 'A/C/F', 'L', 'P', 'Ac']])

# Move each per-sample tensor to the CPU and convert it to a NumPy array.
y_pred = [t.cpu() for t in y_pred]
y_pred = [np.array(t) for t in y_pred]

In [None]:
# Per-target RMSE, MAE and R² of the BERT predictions on the test split.
# The metric functions are reached through the imported `sklearn.metrics`
# module (the bare names were never imported), RMSE is derived as sqrt(MSE) so
# it does not depend on the deprecated `squared=False` flag, and all three
# metrics are gathered in one frame so the notebook displays them together
# instead of only the last expression.
pd.DataFrame(
    {
        'rmse': np.sqrt(metrics.mean_squared_error(y_true, y_pred, multioutput='raw_values')),
        'mae': metrics.mean_absolute_error(y_true, y_pred, multioutput='raw_values'),
        'r2': metrics.r2_score(y_true, y_pred, multioutput='raw_values'),
    },
    index=['F', 'D/C', 'A/C/F', 'L', 'P', 'Ac'],
)

## Interpretabilidad

### AWD-LSTM

In [None]:
def predict_F(lista):
    """Score a batch of texts with the global fastai `learn` for SHAP.

    The texts are wrapped in a one-column ('text') DataFrame, run through a
    test DataLoader, cleaned with `torch.nan_to_num`, and the value at output
    index 5 of every prediction row is returned as a list.

    NOTE(review): despite the `_F` suffix, index 5 appears to line up with the
    last score column ('Ac') used elsewhere in this notebook — confirm.
    """
    frame = pd.DataFrame(lista, columns=['text'])
    batch_dl = learn.dls.test_dl(frame)
    raw_preds, _targets = learn.get_preds(dl=batch_dl)
    cleaned = torch.nan_to_num(raw_preds)
    return [row[5] for row in cleaned]

In [None]:
def custom_tokenizer(s, return_offsets_mapping=True):
    """Split `s` at every single non-word character, for use as a SHAP text masker.

    Returns a dict with:
      * "input_ids": the substrings between consecutive non-word characters
        (empty strings appear where two separators are adjacent or `s` starts
        with one);
      * "offset_mapping" (only when `return_offsets_mapping` is true): the
        matching ``(start, end)`` character spans in `s`.

    A trailing piece is added only when text remains after the last separator.
    """
    spans = []
    cursor = 0
    for separator in re.finditer(r"\W", s):
        # The token runs from the end of the previous separator to this one.
        spans.append((cursor, separator.start()))
        cursor = separator.end()
    if cursor != len(s):
        spans.append((cursor, len(s)))

    encoded = {"input_ids": [s[begin:end] for begin, end in spans]}
    if return_offsets_mapping:
        encoded["offset_mapping"] = spans
    return encoded

In [None]:
# Six biographies picked from the test split by index label; these are the
# texts explained with SHAP below.
lista = [test_df['bio'].loc[131], test_df['bio'].loc[680],
        test_df['bio'].loc[812], test_df['bio'].loc[897],
        test_df['bio'].loc[956], test_df['bio'].loc[872]]
lista  # echoed by the notebook

In [None]:
# Explain `predict_F` with SHAP, masking text with `custom_tokenizer` and
# capping the budget at 1500 model evaluations per explanation.
# The global `tokenizer` is deliberately NOT rebound here: the explainer only
# uses `custom_tokenizer`, while the BERT cells further down read `tokenizer`
# and would break if it pointed at the fastai tokenizer instead.
explainer = shap.Explainer(predict_F, masker=shap.maskers.Text(custom_tokenizer), max_evals=1500)

shap_values = explainer(lista)

In [None]:
# Token-level SHAP visualisation for the first explained text.
shap.plots.text(shap_values[0])

In [None]:
# Bar chart of the largest absolute SHAP value per token across the explained texts.
shap.plots.bar(shap_values.abs.max(0))

### BERT

In [None]:
# Load the hand-labelled dataset and hold out 30% as the test split, with a
# fixed seed so the same rows are selected on every run.
# NOTE(review): earlier cells already use `test_df`; this cell has to run before them.
df = pd.read_csv('../input/es-wiki/Etiquetado_Mano1.csv')
train_df, test_df = train_test_split(df, test_size=0.3, random_state=21)

In [None]:
# Run the BERT regressor on the six hand-picked test rows (selected by index
# label) and echo the raw outputs. `texto` is rebound at each pipeline step.
texto = test_df.loc[[131,680,812,897,956,872]]
texto = Finances_Dataset(data=texto, maxlen=512, tokenizer=tokenizer)
texto = DataLoader(dataset=texto, batch_size=8, num_workers=1)
predict(model, texto, device)

In [None]:
def predict_F(text_list):
    """Score a batch of raw texts with the global BERT `model` for SHAP.

    The texts are wrapped in a one-column ('bio') DataFrame, tokenised through
    `Finances_Dataset` (512 tokens), and run through `predict`. The value at
    output index 5 of each sample is returned, moved to the CPU.

    NOTE(review): despite the `_F` suffix, index 5 corresponds to the last
    target column ('Ac') in Finances_Dataset's column order.

    Args:
        text_list: Sequence of strings to score.

    Returns:
        List of 0-d CPU tensors, one per input text.
    """
    frame = pd.DataFrame(text_list, columns=['bio'])
    dataset = Finances_Dataset(data=frame, maxlen=512, tokenizer=tokenizer)
    # SHAP calls this function many times with small batches; loading in the
    # main process avoids starting a worker process on every call.
    dataloader = DataLoader(dataset=dataset, batch_size=8, num_workers=0)
    # Reuse the shared inference loop instead of duplicating it here.
    return [sample[5].cpu() for sample in predict(model, dataloader, device)]

In [None]:
# Same six test biographies (selected by index label), rebuilt from the current
# `test_df` for the BERT explanation.
lista = [test_df['bio'].loc[131], test_df['bio'].loc[680],
        test_df['bio'].loc[812], test_df['bio'].loc[897],
        test_df['bio'].loc[956], test_df['bio'].loc[872]]
lista  # echoed by the notebook

In [None]:
# Build a SHAP explainer around the BERT `predict_F`, passing the global
# `tokenizer` as the masker.
# NOTE(review): this relies on `tokenizer` still being the BertTokenizer.
explainer = shap.Explainer(predict_F, tokenizer)

In [None]:
# Compute SHAP values for the six selected biographies.
shap_values = explainer(lista)

In [None]:
# Bar chart of the largest absolute SHAP value per token; show=False defers
# display so the current matplotlib figure can be written to disk.
shap.plots.bar(shap_values.abs.max(0), show=False)
plt.savefig('shap_bert_Ac_total.png')

In [None]:
# Token-level SHAP visualisation for the sixth explained text (position 5 in `lista`).
shap.plots.text(shap_values[5])

## Gráficos

In [None]:
import pandas as pd
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import matplotlib.pyplot as plt

In [None]:
# Reload the hand-labelled dataset for the plots below and preview its first rows.
df = pd.read_csv('../input/es-wiki/Etiquetado_Mano1.csv')
df.head()

In [None]:
# Count word occurrences over all biographies (lower-cased, with '.' and ','
# removed) and build a frame of Spanish stop-words indexed by word.
results = Counter()
# regex=False makes '.' a literal full stop regardless of the pandas version's
# default for `regex`.
cleaned_bios = (
    df['bio'].str.lower().str.strip()
    .str.replace('.', '', regex=False)
    .str.replace(',', '', regex=False)
)
# Skip missing bios: a NaN entry is not iterable and would break Counter.update.
for words in cleaned_bios.dropna().str.split():
    results.update(words)
stopwords_sp = set(stopwords.words('spanish'))
s = pd.DataFrame(list(stopwords_sp))
s['count'] = 1
# Index by the stop-word itself so `s.index` can be used as a drop list.
s = s.set_index(0)
s

In [None]:
# `palabras` was never defined in this notebook; build it from the `results`
# Counter as a one-column frame (column 0 = count, index = word), which is the
# layout the drop/sort calls below expect.
palabras = pd.DataFrame.from_dict(results, orient='index')
all_words = palabras
# Remove Spanish stop-words (words absent from the table are ignored).
all_words = all_words.drop(labels=s.index, errors='ignore')
# Most frequent words first.
all_words = all_words.sort_values(by=0, ascending=False)

Gráfico de barras

In [None]:
# Horizontal bar chart of word frequencies.
# Keep the 1000 most frequent words and collapse the one-column frame to a Series.
all_words = all_words[0:1000].squeeze()
# Plot ranks 2..29; the two most frequent entries are skipped.
# NOTE(review): the reason for skipping the first two is not visible here.
top_counts = all_words.values[2:30]
top_words = all_words.index.values[2:30]
data = [go.Bar(
            x = top_counts,
            y = top_words,
            # Colour by the same values that are plotted (the colour array
            # previously used a different slice, [2:100], than the bars).
            marker= dict(colorscale='Jet',
                         color = top_counts
                        ),
            text='Word counts',
            orientation='h',
    )]

layout = go.Layout(
    # Reverse the y axis so the most frequent word sits at the top.
    yaxis=dict(autorange='reversed'),
    height=900,
    font=dict(size=14)
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

Nube de palabras

In [None]:
# Word frequencies for the word cloud, computed over the biographies.
# The labelled dataset is read through its 'bio' column everywhere else in this
# notebook and the cloud is saved as 'wordcloud_bios.png', so 'bio' is used
# here rather than 'text'.
# NOTE(review): confirm the CSV has no separate 'text' column that was intended.
all_words = df['bio'].str.split(expand=True).unstack().str.strip().str.lower().value_counts()
# Remove Spanish stop-words (words absent from the counts are ignored).
all_words = all_words.drop(labels=s.index, errors='ignore')

In [None]:
# Render a 1600x800 word cloud on a white background from the word frequencies,
# display it without axes, and save it as a PNG.
cloud = WordCloud(width=1600, height=800, max_font_size=160, background_color="white").generate_from_frequencies(all_words)
plt.figure( figsize=(20,10) )
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
cloud.to_file("wordcloud_bios.png")