In [None]:
import json
import matplotlib.pyplot as plt
import pandas as pd
import os
import torch

from torch.nn.functional import cross_entropy
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Lift every pandas display limit (columns, column width, total width, rows)
# so frames are rendered without truncation.
for display_option in (
    "display.max_columns",
    "display.max_colwidth",
    "display.width",
    "display.max_rows",
):
    pd.set_option(display_option, None)

# Show dev Predictions Stats from the Best Model

The `dev_predictions.tsv` file is found in the `best_model` directory and has the following format:

        text    labels  predicted_labels        loss    probabilities   label   predicted_label
0        eso ! en no escuchar el despertador ! y te dormís nervioso ...con miedo...y peor es cuando te acostumbraste a dormir tarde     2       2       0.05242778      [0.00769282 0.04338438 0.9489229 ]      N       N

In [None]:
# Root directory of the experiment run. The path is machine-specific, so it can
# be overridden through the SPANISHCLF_BASE_DIR environment variable; when the
# variable is unset the original location is used.
base_dir = os.environ.get(
    "SPANISHCLF_BASE_DIR",
    "/Users/fperez/dev/data/spanishclassfier_exp/dccuchile-distilbert-base-spanish-uncased-finetuned-with-spanish-tweets-clf-cleaned-ds/ep_2-lr_5e-5-msl_72-bs_8-ds_config_80-10-10-nl_5-do_0.2/",
)
# Despite its name, dir_src is the path of the TSV file, not of a directory.
dir_src = os.path.join(base_dir, "best_model", "dev_predictions.tsv")

# The first TSV column holds the row number and becomes the frame's index.
preds_df = pd.read_csv(dir_src, sep="\t", index_col=0)
preds_df.head(32)

## Basic stats on the examples read

In [None]:
# Rows are counted with len(): the previous `.count()[0]` looked up the first
# column's non-null count by position through `[]`, which pandas deprecates on
# a label-indexed Series, and it counted non-null `text` values rather than rows.
print(f"""
Total examples: {len(preds_df)}
Correctly classified: {len(preds_df.query('label == predicted_label'))}
Misclassified: {len(preds_df.query('label != predicted_label'))}
""")

In [None]:
# Tokenization settings used for this run. They are only recorded here: the
# tokenizer below is loaded without them (the keyword expansion is commented out).
tokenizer_config = dict(padding="max_length", truncation=True, max_length=72)
tokenizer = AutoTokenizer.from_pretrained(
    os.path.join(base_dir, "best_model")
)  # , **tokenizer_config)

In [None]:
# Load the fine-tuned classifier from the run's best_model directory and wrap it,
# together with the tokenizer, in a text-classification pipeline that pads and
# truncates every input to 72 tokens.
model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(base_dir, "best_model")
)
tc_pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    padding="max_length",
    truncation=True,
    max_length=72,
)

In [None]:
# Smoke test: classify the text of the row labelled 1 with the pipeline.
tc_pipe(preds_df.at[1, 'text'])

In [None]:
def to_df_label_str(pred_label: str):
    """Translate a Spanish sentiment word into the label code used in the frames.

    Args:
        pred_label: one of "positivo", "negativo" or "neutral".

    Returns:
        "P", "N" or "NEU" respectively.

    Raises:
        ValueError: if `pred_label` is none of the three known words.
    """
    word_to_code = (
        ("positivo", "P"),
        ("negativo", "N"),
        ("neutral", "NEU"),
    )
    for word, code in word_to_code:
        if pred_label == word:
            return code
    raise ValueError(f"label str not recognized: {pred_label}")


In [None]:
import numpy as np
def eval_on_tc_pipeline(df):
    """Classify every row's `text` with `tc_pipe` and store the results on `df`.

    Two columns are written into `df` itself (the frame is modified in place):
      * `probabilities_tc_pipe`: string form of the scores ordered as P, NEU, N.
      * `predicted_label_tc_pipe`: the label with the highest score.

    Args:
        df: DataFrame with a `text` column; any index is accepted.

    Returns:
        The same `df` object, with the two columns added or overwritten.
    """
    # Fixed order used both for the stored score array and for decoding the
    # argmax, so the predicted label never depends on the model's id2label order.
    label_order = ("P", "NEU", "N")

    probabilities = []
    predicted_labels = []
    for text in df['text']:
        # For a single string, return_all_scores=True yields a one-element list
        # that holds one {'label', 'score'} dict per class.
        output = tc_pipe(text, return_all_scores=True)
        scores = {pred['label']: pred['score'] for pred in output[0]}
        probas_arr = np.array([scores[label] for label in label_order])
        probabilities.append(str(probas_arr))
        predicted_labels.append(label_order[int(np.argmax(probas_arr))])

    # Assigned positionally in one go, so the frame's index labels are irrelevant.
    df['probabilities_tc_pipe'] = probabilities
    df['predicted_label_tc_pipe'] = predicted_labels
    return df

# eval_on_tc_pipeline writes its columns into the frame it receives and returns
# that same frame, so preds_df_2 and preds_df refer to one object.
preds_df_2 = eval_on_tc_pipeline(preds_df)

In [None]:
# First 32 rows, now including the *_tc_pipe columns.
preds_df_2.head(32)

In [None]:
# Rows are counted with len(): the previous `.count()[0]` looked up the first
# column's non-null count by position through `[]`, which pandas deprecates on
# a label-indexed Series.
print(f"""
Total examples: {len(preds_df_2)}
Correctly classified: {len(preds_df_2.query('label == predicted_label_tc_pipe'))}
Misclassified: {len(preds_df_2.query('label != predicted_label_tc_pipe'))}
Mismatchs: {len(preds_df_2.query('predicted_label != predicted_label_tc_pipe'))}
""")

In [None]:
# Tokenizer output names; only these are forwarded to the model in
# forward_pass_with_label.
print(tokenizer.model_input_names)

def forward_pass_with_label(batch):
    """Run one example through the raw model and report its loss and prediction.

    Args:
        batch: a single row exposing `text` (attribute access) and `labels`
            (item access), e.g. a row passed by `DataFrame.apply(..., axis=1)`.
            `labels` must be the integer class id of the gold label.

    Returns:
        dict with
          * `loss_raw_model`: cross-entropy of the example as a NumPy value,
          * `predicted_label_raw_model`: label string from `model.config.id2label`,
          * `probabilities_raw_model`: softmax scores as a NumPy array.
    """
    encoded = tokenizer(batch.text, padding="max_length", truncation=True, max_length=72)
    # Keep only what the model accepts and add a batch dimension of size 1.
    inputs = {
        name: torch.tensor(values).view(1, -1).to(model.device)
        for name, values in encoded.items()
        if name in tokenizer.model_input_names
    }
    # The target has to live on the same device as the logits; building it on the
    # CPU makes cross_entropy fail as soon as the model sits on an accelerator.
    target = torch.tensor(batch['labels'], device=model.device)

    with torch.no_grad():
        output = model(**inputs)
        probabilities = torch.softmax(output.logits, dim=-1)
        pred_label = torch.argmax(output.logits, dim=-1)
        pred_label_str = model.config.id2label[int(pred_label[0])]
        # Unbatched call: logits of the single example against its scalar target.
        loss = cross_entropy(output.logits[0], target, reduction="none")

    return {
        "loss_raw_model": loss.cpu().numpy(),
        "predicted_label_raw_model": pred_label_str,
        "probabilities_raw_model": probabilities[0].cpu().numpy(),
    }



# Work on a copy (note the name: preds_df2, without the underscore of preds_df_2)
# and append the raw-model outputs as extra columns.
preds_df2 = preds_df_2.copy()

# One forward pass per row; the returned dicts are expanded into columns.
raw_model_outputs = preds_df2[['text', 'labels']].apply(
    forward_pass_with_label,
    axis=1,
    result_type="expand",
)
preds_df2 = pd.concat([preds_df2, raw_model_outputs], axis=1)

preds_df2

In [None]:
# The raw-model column is `predicted_label_raw_model` (the key returned by
# forward_pass_with_label); querying `predicted_labels_raw_model` fails because
# no such column exists. Rows are counted with len() instead of the deprecated
# positional `.count()[0]` lookup.
print(f"""
Total examples: {len(preds_df2)}
Matched classifications tc_pipe vs raw_model: {len(preds_df2.query('predicted_label_tc_pipe == predicted_label_raw_model'))}
Misclassified tc_pipe vs raw_model: {len(preds_df2.query('predicted_label_tc_pipe != predicted_label_raw_model'))}
Mismatchs raw model: {len(preds_df2.query('predicted_label != predicted_label_raw_model'))}
""")

# The tc_pipe and raw_model predictions don't match the original predictions, but they should!

## Check Misclassified Examples by Loss

In [None]:
# Misclassified dev examples, highest loss first. reset_index() renumbers the
# rows and keeps the previous index as a regular column.
mislabeled_df = (
    preds_df
    .query('label != predicted_label')
    .sort_values(by="loss", ascending=False)
    .reset_index()
)

In [None]:
# Remove the old-index column left behind by reset_index(). Done without
# inplace and with errors='ignore' so that re-running this cell does not raise
# a KeyError once the column is already gone.
mislabeled_df = mislabeled_df.drop(columns='index', errors='ignore')
mislabeled_df.head()

In [None]:
# Cloze-style suffix that eval_on_mm appends to each text so the fill-mask
# model has to complete the sentiment word.
prompt = "El sentimiento es [MASK]"
prompt

In [None]:
# Text of the first row of mislabeled_df, which is sorted by descending loss.
example = mislabeled_df.at[0, 'text']
example

In [None]:
# Fill-mask pipeline used for the prompt-based sentiment check.
# NOTE(review): the checkpoint name says it was fine-tuned on MLDoc; confirm it
# still ships masked-LM head weights, otherwise that head would be freshly
# initialised and the predictions below would not be meaningful.
mask_pipeline=pipeline("fill-mask", model="dccuchile/bert-base-spanish-wwm-uncased-finetuned-mldoc")

In [None]:
import torch.nn as nn
def apply_dropout(m):
    """Put `m` into eval mode when it is exactly an `nn.Dropout` layer.

    Meant to be passed to `Module.apply`; every other module type is left alone.
    Note that a later cell redefines this name with the opposite behaviour.
    """
    is_plain_dropout = type(m) is nn.Dropout
    if not is_plain_dropout:
        return
    m.eval()

# Run apply_dropout over the fill-mask model and its submodules, putting its
# nn.Dropout layers in eval mode.
mask_pipeline.model.apply(apply_dropout)

In [None]:
def eval_on_mm(df):
    """Predict a sentiment label for every row with the fill-mask pipeline.

    Each `text` gets the global `prompt` appended and is sent to `mask_pipeline`
    restricted to the targets "negativo", "neutral" and "positivo". The first
    candidate returned (presumably the highest-scoring one; verify against the
    pipeline documentation) is converted with `to_df_label_str` and stored in
    the `mm_predicted_label` column.

    Args:
        df: DataFrame with a `text` column; any index is accepted.

    Returns:
        The same `df` object, modified in place with `mm_predicted_label` added
        or overwritten.
    """
    targets = ["negativo", "neutral", "positivo"]

    predicted_labels = []
    for text in df['text']:
        output = mask_pipeline(text + " " + prompt, targets=targets)
        predicted_labels.append(to_df_label_str(output[0]['token_str']))

    # Assigned positionally in one go, so the frame's index labels are irrelevant.
    df['mm_predicted_label'] = predicted_labels
    return df

# eval_on_mm adds its column to the frame it receives and returns that frame,
# so mislabeled_df_ext is the same object as mislabeled_df.
mislabeled_df_ext = eval_on_mm(mislabeled_df)
mislabeled_df_ext.head(3)


In [None]:
# Rows are counted with len(): the previous `.count()[0]` looked up the first
# column's non-null count by position through `[]`, which pandas deprecates on
# a label-indexed Series.
print(f"""
Total examples: {len(mislabeled_df_ext)}
\n
Correctly classified: {len(mislabeled_df_ext.query('label == predicted_label'))}
Misclassified: {len(mislabeled_df_ext.query('label != predicted_label'))}
\n
Correctly classified with masked model: {len(mislabeled_df_ext.query('label == mm_predicted_label'))}
Misclassified with masked model: {len(mislabeled_df_ext.query('label != mm_predicted_label'))}
\n
Equally classified with distilbeto and masked model: {len(mislabeled_df_ext.query('predicted_label == mm_predicted_label'))}
""")

In [None]:
# First 10 examples whose masked-model prediction differs from the gold label.
mislabeled_df_ext.query('label != mm_predicted_label').head(10)

## Both models agree on the prediction, yet the example is misclassified!!!

In [None]:
# Misclassified examples on which the classifier and the masked model predict
# the same label.
mislabeled_df_ext.query('predicted_label == mm_predicted_label')

# All predictions

In [None]:
# Same masked-model evaluation, now over every dev example. eval_on_mm writes
# into preds_df itself, so preds_df_ext is the same object as preds_df.
preds_df_ext = eval_on_mm(preds_df)
preds_df_ext.head(3)

In [None]:
# Rows are counted with len(): the previous `.count()[0]` looked up the first
# column's non-null count by position through `[]`, which pandas deprecates on
# a label-indexed Series.
print(f"""
Total examples: {len(preds_df_ext)}
\n
Correctly classified: {len(preds_df_ext.query('label == predicted_label'))}
Misclassified: {len(preds_df_ext.query('label != predicted_label'))}
\n
Correctly classified with masked model: {len(preds_df_ext.query('label == mm_predicted_label'))}
Misclassified with masked model: {len(preds_df_ext.query('label != mm_predicted_label'))}
\n
Equally classified with distilbeto and masked model: {len(preds_df_ext.query('predicted_label == mm_predicted_label'))}
""")

# XNLI

In [None]:
# Zero-shot classification pipeline; the checkpoint name indicates a Spanish
# BERT fine-tuned on XNLI.
xnli_pipeline = pipeline("zero-shot-classification", model="dccuchile/bert-base-spanish-wwm-uncased-finetuned-xnli")

In [None]:
def apply_dropout(m):
    """Put `m` into training mode when it is exactly an `nn.Dropout` layer.

    Meant to be passed to `Module.apply`; every other module type is left alone.
    This redefinition replaces the earlier `apply_dropout`, which called
    `eval()` instead.
    NOTE(review): training mode keeps dropout active at inference time, which
    makes predictions stochastic. Confirm that this is intended.
    """
    is_plain_dropout = type(m) is nn.Dropout
    if not is_plain_dropout:
        return
    m.train()

# Run apply_dropout (the train-mode variant defined just above) over the
# zero-shot model and its submodules.
xnli_pipeline.model.apply(apply_dropout)

In [None]:
# Candidate labels given to the zero-shot pipeline; eval_on_zs maps the winning
# word back to P/NEU/N with to_df_label_str.
all_labels=["positivo", "neutral", "negativo"]
# all_labels=["P", "NEU", "N"]

In [None]:
# Take the first row of mislabeled_df as a test case: print its gold label and
# show its text.
example=mislabeled_df.at[0, 'text']
print(mislabeled_df.at[0, 'label'])
example

In [None]:

# Zero-shot prediction for the example with multi_label=False.
output = xnli_pipeline(example, all_labels, multi_label=False)
output

In [None]:

# Same example with multi_label=True, for comparison with the previous cell.
output = xnli_pipeline(example, all_labels, multi_label=True)
output

In [None]:
def eval_on_zs(df, labels):
    """Predict a sentiment label for every row with the zero-shot pipeline.

    Each `text` is classified by `xnli_pipeline` against `labels` with
    `multi_label=False`; the first label of the result is converted with
    `to_df_label_str` and stored in the `zs_predicted_label` column.

    Args:
        df: DataFrame with a `text` column; any index is accepted.
        labels: candidate label words; each must be accepted by
            `to_df_label_str` ("positivo", "neutral", "negativo").

    Returns:
        The same `df` object, modified in place with `zs_predicted_label` added
        or overwritten.
    """
    predicted_labels = []
    for text in df['text']:
        output = xnli_pipeline(text, labels, multi_label=False)
        predicted_labels.append(to_df_label_str(output['labels'][0]))

    # Assigned positionally in one go, so the frame's index labels are irrelevant.
    df['zs_predicted_label'] = predicted_labels
    return df

In [None]:
# eval_on_zs adds its column to the frame it receives and returns that frame,
# so mislabeled_zs_df_ext is the same object as mislabeled_df_ext.
mislabeled_zs_df_ext = eval_on_zs(mislabeled_df_ext, all_labels)

In [None]:
# First rows, now including the zs_predicted_label column.
mislabeled_zs_df_ext.head()

In [None]:
# Rows are counted with len(): the previous `.count()[0]` looked up the first
# column's non-null count by position through `[]`, which pandas deprecates on
# a label-indexed Series.
print(f"""
Total examples: {len(mislabeled_zs_df_ext)}
\n
Correctly classified: {len(mislabeled_zs_df_ext.query('label == predicted_label'))}
Misclassified: {len(mislabeled_zs_df_ext.query('label != predicted_label'))}
\n
Correctly classified with masked model: {len(mislabeled_zs_df_ext.query('label == mm_predicted_label'))}
Misclassified with masked model: {len(mislabeled_zs_df_ext.query('label != mm_predicted_label'))}
\n
Equally classified with distilbeto and masked model: {len(mislabeled_zs_df_ext.query('predicted_label == mm_predicted_label'))}
\n
Correctly classified with zero shot model: {len(mislabeled_zs_df_ext.query('label == zs_predicted_label'))}
Misclassified with zero shot model: {len(mislabeled_zs_df_ext.query('label != zs_predicted_label'))}
\n
Equally classified with distilbeto and zero shot model: {len(mislabeled_zs_df_ext.query('predicted_label == zs_predicted_label'))}
""")

In [None]:
# Zero-shot evaluation over every dev example. eval_on_zs writes into the frame
# it receives, so preds_zs_df_ext is the same object as preds_df_ext.
preds_zs_df_ext = eval_on_zs(preds_df_ext, all_labels)
preds_zs_df_ext.head()

In [None]:
# Rows are counted with len(): the previous `.count()[0]` looked up the first
# column's non-null count by position through `[]`, which pandas deprecates on
# a label-indexed Series.
print(f"""
Total examples: {len(preds_zs_df_ext)}

Correctly classified: {len(preds_zs_df_ext.query('label == predicted_label'))}
Misclassified: {len(preds_zs_df_ext.query('label != predicted_label'))}

Correctly classified with masked model: {len(preds_zs_df_ext.query('label == mm_predicted_label'))}
Misclassified with masked model: {len(preds_zs_df_ext.query('label != mm_predicted_label'))}

Equally classified with distilbeto and masked model: {len(preds_zs_df_ext.query('predicted_label == mm_predicted_label'))}

Correctly classified with zero shot model: {len(preds_zs_df_ext.query('label == zs_predicted_label'))}
Misclassified with zero shot model: {len(preds_zs_df_ext.query('label != zs_predicted_label'))}

Equally classified with distilbeto and zero shot model: {len(preds_zs_df_ext.query('predicted_label == zs_predicted_label'))}
Equally classified with masked model and zero shot model: {len(preds_zs_df_ext.query('mm_predicted_label == zs_predicted_label'))}

Correctly classified with both, masked model and zero shot model: {len(preds_zs_df_ext.query('label == mm_predicted_label and mm_predicted_label == zs_predicted_label'))}
""")