<a href="https://colab.research.google.com/github/maria-saif20/-RoBERTa-emotional-model/blob/main/roberta_emotional_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install datasets transformers pandas matplotlib tqdm --upgrade --quiet

In [2]:
import datasets
from transformers import pipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Load the dataset

In [3]:
# NOTE(review): `dataset` is not referenced by any later cell visible here, and
# the evaluation below loads go_emotions separately — confirm whether this
# download is still needed.
from datasets import load_dataset

dataset = load_dataset("AnonymousSub/recipe_RL_data_roberta-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/55 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/56 [00:00<?, ?it/s]

# **We just want the dataset test split here for evaluation**

In [4]:
# Split that every later cell evaluates on.
split_name = "test"

# Dataset identifier and configuration passed to `datasets.load_dataset`.
dataset_name = "go_emotions"
dataset_config_name = "simplified"
dataset_dict = datasets.load_dataset(dataset_name, dataset_config_name)

# Show the first example of the chosen split as the cell output.
dataset_dict[split_name][0]

{'text': 'I’m really sorry about your situation :( Although I love the names Sapphira, Cirilla, and Scarlett!',
 'labels': [25],
 'id': 'eecwqtt'}

In [5]:
# Label names, read from the `labels` feature of the chosen split; a label's
# position in this list is the integer index stored in each example.
label_feature = dataset_dict[split_name].features["labels"].feature
labels = label_feature.names
print(dict(enumerate(labels)))

{0: 'admiration', 1: 'amusement', 2: 'anger', 3: 'annoyance', 4: 'approval', 5: 'caring', 6: 'confusion', 7: 'curiosity', 8: 'desire', 9: 'disappointment', 10: 'disapproval', 11: 'disgust', 12: 'embarrassment', 13: 'excitement', 14: 'fear', 15: 'gratitude', 16: 'grief', 17: 'joy', 18: 'love', 19: 'nervousness', 20: 'optimism', 21: 'pride', 22: 'realization', 23: 'relief', 24: 'remorse', 25: 'sadness', 26: 'surprise', 27: 'neutral'}


# Organize the targets into a binary indicator array with one column per label, since each label is evaluated separately in this multi-label dataset.

In [6]:
num_items = len(dataset_dict[split_name])
num_labels = len(labels)

# Binary indicator matrix: one row per example, one column per label index,
# with 1 wherever that label is listed for the example.
y_targets_all = np.zeros((num_items, num_labels), dtype=int)
for i, labels_indices in enumerate(dataset_dict[split_name]["labels"]):
    # Set all of this example's label columns in one indexed assignment.
    y_targets_all[i, labels_indices] = 1

print(y_targets_all[:3])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


# Load the model and run it
Load a multi-label, multi-class classifier model based on RoBERTa-base.

In [7]:
# Text-classification pipeline for the emotion model; `top_k=None` is passed so
# the output carries a score for each label (see the printed sample output)
# instead of only the best one.
classifier = pipeline(
    task="text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
)



In [8]:
# Fetch the text column once and reuse it for inference and for the sanity print.
test_texts = dataset_dict[split_name]["text"]
model_outputs = classifier(test_texts)

# Sanity check: first input text next to the scores the model produced for it.
print(test_texts[0])
print(model_outputs[0])

I’m really sorry about your situation :( Although I love the names Sapphira, Cirilla, and Scarlett!
[{'label': 'remorse', 'score': 0.678300678730011}, {'label': 'sadness', 'score': 0.5406180024147034}, {'label': 'love', 'score': 0.3608132600784302}, {'label': 'caring', 'score': 0.05576622858643532}, {'label': 'disappointment', 'score': 0.036921314895153046}, {'label': 'approval', 'score': 0.03364568203687668}, {'label': 'gratitude', 'score': 0.031087186187505722}, {'label': 'admiration', 'score': 0.018729250878095627}, {'label': 'grief', 'score': 0.018079761415719986}, {'label': 'optimism', 'score': 0.015966985374689102}, {'label': 'disapproval', 'score': 0.015757175162434578}, {'label': 'realization', 'score': 0.013122462667524815}, {'label': 'desire', 'score': 0.011346070095896721}, {'label': 'curiosity', 'score': 0.010657965205609798}, {'label': 'confusion', 'score': 0.010225102305412292}, {'label': 'anger', 'score': 0.008085090667009354}, {'label': 'embarrassment', 'score': 0.00747

In [9]:
# Build the label-name -> column-index lookup once. The previous
# `labels.index(label)` call did a linear search through `labels` for every
# score of every example. An unknown label now raises KeyError rather than
# ValueError.
label_to_index = {name: index for index, name in enumerate(labels)}

# Score matrix aligned with y_targets_all: row = example, column = label index.
y_probas_all = np.zeros((num_items, num_labels), dtype=float)
for i, item_probas in enumerate(model_outputs):
    for item_proba in item_probas:
        label, score = item_proba["label"], item_proba["score"]
        label_index = label_to_index[label]
        y_probas_all[i, label_index] = score

In [10]:
# Spot-check a single example: its record, its rounded per-label scores, and
# the label with the highest score.
i = 3856
example = dataset_dict[split_name][i]
print(example)
print(y_probas_all[i].round(3).tolist())
top = y_probas_all[i].argmax()
print(f"Top label proba is label number {top} ({labels[top]}): {y_probas_all[i][top]}")

{'text': 'Damn, still hoping...', 'labels': [20], 'id': 'edm34qc'}
[0.002, 0.003, 0.008, 0.022, 0.011, 0.015, 0.002, 0.002, 0.052, 0.01, 0.01, 0.002, 0.001, 0.002, 0.002, 0.005, 0.001, 0.002, 0.002, 0.001, 0.824, 0.001, 0.007, 0.001, 0.003, 0.003, 0.002, 0.157]
Top label proba is label number 20 (optimism): 0.8238561749458313


# Evaluation

In [11]:
from sklearn import metrics

# A label counts as predicted when its score is strictly above this cut-off.
threshold = 0.5
# Cast to int so the predictions have the same 0/1 integer form that the
# per-label cells further down use.
y_preds_all = (y_probas_all > threshold).astype(int)

print("Overall (macro)")
print("===============")
# On 2-D multilabel indicator arrays, accuracy_score reports exact-match
# (subset) accuracy: an example only counts when every one of its labels is
# right. It is not a macro average, so it is labelled explicitly to avoid
# being read alongside the macro-averaged scores below.
print(f"Subset accuracy (exact match): {metrics.accuracy_score(y_targets_all, y_preds_all):.3f}")
# Precision / recall / F1 are macro-averaged: the unweighted mean over labels.
# zero_division=0 scores a label as 0 when its denominator is empty.
print(f"Precision: {metrics.precision_score(y_targets_all, y_preds_all, average='macro', zero_division=0):.3f}")
print(f"Recall: {metrics.recall_score(y_targets_all, y_preds_all, average='macro', zero_division=0):.3f}")
print(f"F1: {metrics.f1_score(y_targets_all, y_preds_all, average='macro', zero_division=0):.3f}")

Overall (macro)
Accuracy: 0.474
Precision: 0.575
Recall: 0.396
F1: 0.450


In [12]:
def calc_label_metrics(label, y_targets, y_preds, threshold):
    """Compute the binary classification metrics for a single label.

    Args:
        label: Name of the label; stored in the result under ``"label"``.
        y_targets: Target values for this label; must support ``.sum()``.
        y_preds: Predicted values for this label.
        threshold: Cut-off that produced ``y_preds``; it is only recorded in
            the result, not applied here.

    Returns:
        dict with keys ``label``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``mcc``, ``support`` (sum of ``y_targets``) and ``threshold``.
    """
    row = {"label": label}
    row["accuracy"] = metrics.accuracy_score(y_targets, y_preds)

    # These three scorers share a signature; zero_division=0 makes an
    # undefined score come back as 0.
    zero_division_scorers = (
        ("precision", metrics.precision_score),
        ("recall", metrics.recall_score),
        ("f1", metrics.f1_score),
    )
    for metric_key, score_fn in zero_division_scorers:
        row[metric_key] = score_fn(y_targets, y_preds, zero_division=0)

    row["mcc"] = metrics.matthews_corrcoef(y_targets, y_preds)
    row["support"] = y_targets.sum()
    row["threshold"] = threshold
    return row

In [13]:
# Per-label metrics at one fixed cut-off applied to every label.
threshold = 0.5
y_preds_all = (y_probas_all > threshold).astype(int)

# Transposing lets us walk the target and prediction matrices one label
# column at a time, in the same order as `labels`.
results = []
for label, y_targets, y_preds in zip(labels, y_targets_all.T, y_preds_all.T):
    results.append(calc_label_metrics(label, y_targets, y_preds, threshold))

per_label_results = pd.DataFrame(results, index=labels)
# The label is already the index, so hide the duplicate column when displaying.
per_label_table = per_label_results.drop(columns=["label"]).round(3)
display(per_label_table)

Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.946,0.725,0.675,0.699,0.67,504,0.5
amusement,0.982,0.79,0.871,0.829,0.821,264,0.5
anger,0.97,0.652,0.379,0.479,0.483,198,0.5
annoyance,0.94,0.472,0.159,0.238,0.25,320,0.5
approval,0.942,0.609,0.302,0.404,0.403,351,0.5
caring,0.973,0.448,0.319,0.372,0.364,135,0.5
confusion,0.972,0.5,0.431,0.463,0.45,153,0.5
curiosity,0.95,0.537,0.356,0.428,0.412,284,0.5
desire,0.987,0.63,0.41,0.496,0.502,83,0.5
disappointment,0.974,0.625,0.199,0.302,0.343,151,0.5


In [14]:
def dataset_wide_metrics(df):
    """Print and return dataset-level summaries of per-label metrics.

    Args:
        df: Frame with one row per label and the columns ``precision``,
            ``recall``, ``f1``, ``mcc`` and ``support``.

    Returns:
        ``(simple_mean, weighted)``: two dicts keyed by metric name. The first
        holds the plain mean over rows, the second the mean weighted by
        ``support``. Both are rounded to 3 decimals.
    """
    metric_names = ["precision", "recall", "f1", "mcc"]

    simple_mean = {}
    for name in metric_names:
        simple_mean[name] = round(df[name].mean(), 3)
    print("Simple mean of labels:", simple_mean)

    # The total support is the same for every metric, so compute it once.
    total_support = df["support"].sum()
    weighted = {}
    for name in metric_names:
        support_weighted_sum = sum(df[name] * df["support"])
        weighted[name] = round(support_weighted_sum / total_support, 3)
    print("Weighted average (using support):", weighted)

    return simple_mean, weighted

# Bind the returned (simple_mean, weighted) tuple to `_` so it is not echoed as
# the cell output; the function already prints both summaries.
_ = dataset_wide_metrics(per_label_results)

Simple mean of labels: {'precision': 0.575, 'recall': 0.396, 'f1': 0.45, 'mcc': 0.451}
Weighted average (using support): {'precision': 0.662, 'recall': 0.511, 'f1': 0.564, 'mcc': 0.519}


In [15]:
# Sweep cut-offs 0.05, 0.10, ..., 0.95 and store the per-label metrics for
# each one, keyed by the threshold value.
threshold_results = {}
for percent in tqdm(range(5, 100, 5)):
    threshold = percent / 100
    y_preds_all = (y_probas_all > threshold).astype(int)

    rows_at_threshold = []
    for label_index, label in enumerate(labels):
        y_targets = y_targets_all[:, label_index]
        y_preds = y_preds_all[:, label_index]
        rows_at_threshold.append(calc_label_metrics(label, y_targets, y_preds, threshold))

    threshold_results[threshold] = rows_at_threshold

100%|██████████| 19/19 [00:08<00:00,  2.17it/s]


In [16]:
# Metric used to choose each label's best threshold from the sweep.
# NOTE(review): the thresholds are chosen on the same split whose scores are
# reported here, so these numbers are likely optimistic — consider tuning on a
# validation split instead.
metric_name = "f1"

# -1 is a sentinel start value; "result" stays None until a sweep entry wins.
best = {}
for label in labels:
    best[label] = {metric_name: -1, "result": None}

for threshold, results in threshold_results.items():
    for result in results:
        label = result["label"]
        # Strict comparison: on a tie, the threshold seen first is kept.
        if not result[metric_name] > best[label][metric_name]:
            continue
        best[label] = {metric_name: result[metric_name], "result": result}

results = [entry["result"] for entry in best.values()]
per_label_threshold_results = pd.DataFrame(results, index=[row["label"] for row in results])
best_threshold_table = per_label_threshold_results.drop(columns=["label"]).round(3)
display(best_threshold_table)

Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.94,0.651,0.776,0.708,0.678,504,0.25
amusement,0.982,0.781,0.89,0.832,0.825,264,0.45
anger,0.959,0.454,0.601,0.517,0.502,198,0.15
annoyance,0.864,0.243,0.619,0.349,0.328,320,0.1
approval,0.926,0.432,0.442,0.437,0.397,351,0.3
caring,0.972,0.426,0.385,0.405,0.391,135,0.4
confusion,0.974,0.548,0.412,0.47,0.462,153,0.55
curiosity,0.943,0.473,0.711,0.568,0.552,284,0.25
desire,0.985,0.518,0.53,0.524,0.516,83,0.25
disappointment,0.974,0.562,0.298,0.39,0.398,151,0.4
