# Notebook

In this notebook, we experiment with repurposing several text-generation models for our multi-class text classification task. We use the Hugging Face `transformers` library to load the models. Note that Meta's Llama model is quite large and requires an access token. You can get one by signing up on the Hugging Face website; then log in with `huggingface-cli login` (the successor to `transformers-cli login`) so that the model can be downloaded.

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import pipeline, set_seed, AutoModel, AutoTokenizer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Share of the full dataset to keep, expressed in percent (0.01 -> 0.01 % of the rows)
down_sample_percentage = 0.01

# integer code for each of the six emotion names
label_map = {"sadness": 0, "joy": 1, "love": 2, "anger": 3, "fear": 4, "surprise": 5}

# load -> down sample -> rename emotions to label -> map label names to integers
df = (
    pd.read_pickle("data/data_original.pkl")
    .sample(frac=down_sample_percentage / 100, random_state=1)
    .rename(columns={'emotions': 'label'})
    .assign(label=lambda frame: frame['label'].map(label_map))
)

print(df.shape)
df.head()

In [None]:
# Load GPT-2 as a text-generation pipeline, then seed the RNGs so generation is repeatable
generator = pipeline(task='text-generation', model='gpt2')
set_seed(42)

# using prompt engineering to generate a template for the model to fill in with the predicted emotion
def guessEmotion(phrase):
    """Classify ``phrase`` into one of six emotions by few-shot prompting GPT-2.

    The prompt consists of an instruction line, six labelled example sentences and
    finally ``phrase`` followed by a colon. The model generates one new token and the
    first known emotion name contained in that continuation is returned.

    The module-level ``generator`` pipeline is reused for every call instead of
    building a new pipeline (and reloading the model) each time.

    Args:
        phrase: The sentence to classify.

    Returns:
        One of "sadness", "joy", "love", "anger", "fear", "surprise", or "Unknown"
        when the generated continuation contains none of those names.
    """
    emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]

    examples = [
        ("This is such a frustrating day", "anger"),
        ("I am so scared of the dark", "fear"),
        ("I am so happy today", "joy"),
        ("He is the love of my life", "love"),
        ("Yesterday was a pretty bad day because my cat died", "sadness"),
        ("I can't believe my eyes", "surprise")
    ]

    # Create the template with examples
    template = "Of the 6 emotions (sadness, joy, love, anger, fear, surprise), identify the most prevalent one in each of the following sentences:\n"
    template += "".join(f"{example}: {emotion}\n" for (example, emotion) in examples)
    template += f"{phrase}:"

    # Re-seed before every generation so the result for a given phrase does not
    # depend on how many phrases were classified before it.
    set_seed(42)

    # Generate the response with adjusted parameters.
    # pad_token_id is set to the eos_token_id (50256) to suppress the warning.
    outputs = generator(
        template,
        max_new_tokens=1,
        num_return_sequences=1,
        temperature=0.8,
        top_p=0.9,
        pad_token_id=50256,
    )
    generated_text = outputs[0]['generated_text']

    # Analyze the generated text (prompt stripped off) to determine the predicted emotion
    response = generated_text[len(template):].strip().lower()
    for emotion in emotions:
        if emotion in response:
            return emotion

    return "Unknown"

def show_cm(cm, classes, figsize=(10, 10)):
    """Plot a confusion matrix as a heat map with a count and percentage in every cell.

    Args:
        cm: 2-D array-like of counts. Rows are drawn on the "True label" axis and
            columns on the "Predicted label" axis.
        classes: Tick labels, in the same order as the rows/columns of ``cm``.
        figsize: Figure size passed to ``plt.figure``.

    Every cell of ``cm`` is annotated, even when ``len(classes)`` differs from the
    matrix size, and an all-zero matrix is annotated with 0.0% instead of nan%.
    """
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape

    plt.figure(figsize=figsize)
    plt.imshow(cm)
    plt.suptitle('Confusion matrix')
    total = cm.sum()
    plt.title('Total cases: {}'.format(total))
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest count get black text, the rest white text
    half_max = cm.max() / 2 if cm.size else 0

    # Iterate over the matrix itself rather than over `classes`, so a length
    # mismatch can neither raise IndexError nor leave cells without a label.
    for i in range(n_rows):
        for j in range(n_cols):
            perc = round(cm[i, j] / total * 100, 1) if total else 0.0
            plt.text(j, i, f"{format(cm[i, j], '.0f')} : {perc}%", horizontalalignment="center",
                     color="black" if cm[i, j] > half_max else "white")

    plt.show()


In [None]:
# Spot-check the classifier on a handful of hand-picked sentences, one prediction per line
sample_phrases = [
    "i feel so enraged but helpless at the same time",
    "i feel sickened by and disgusted with the sins of man despite my divinity i feel sickened by and disgusted with the sins of man",
    "i feel especially pleased about this as this has been a long time coming",
    "i feel to glad that this blog must be helpful knowledgeable and explorabe",
    "i feel a lil dazed actually",
    "i feel i am i am utterly amazed at my complete lack of savvy when it comes to certain situations",
    "i absolutely love her and feel accepted by her at any weight",
    "i can feel your tender lips making me feel alright",
    "i feel regretful that i have never said i love you to him",
    "i feel a sense of melancholy at this time of year",
    "i feel shaky if i dont eat i continually think about food and what im eating and when i get to eat next",
    "i admit that i feel a little neurotic about that part i post",
    "i loved how well stephanie balanced annas homesickness with her excitement to explore a new city she is the type of character that i find myself becoming connected too when she is embarrassed i feel embarrassed right along with her and that is a credit to stephanie to create that type of connection",
]

for sample_phrase in sample_phrases:
    print(guessEmotion(sample_phrase))

In [None]:
# Run guessEmotion on every entry of the text column and keep the returned emotion name
texts = df['text']
df['predicted'] = texts.map(guessEmotion)

In [None]:
# Rows for which guessEmotion returned "Unknown" instead of an emotion name
unknown_mask = df['predicted'] == 'Unknown'
print(f"Number of 'Unknown' predictions: {int(unknown_mask.sum())}")

In [None]:
# Show the first 20 rows, including the new predicted column
df.iloc[:20]

In [None]:
# add unknown to label_map
label_map["Unknown"] = 6

# Convert predicted emotion names to their integer codes.
# Values that are not keys of label_map (i.e. codes produced by an earlier run of
# this cell) pass through unchanged, so re-running the cell no longer turns every
# prediction into NaN.
df['predicted'] = df['predicted'].map(lambda value: label_map.get(value, value)).astype(int)

In [None]:
# compare predicted and true labels, calculate accuracy, precision, recall, f1 and confusion matrix

# Class names listed by integer code: 0-5 are the six emotions, 6 is "Unknown".
class_names = ["sadness", "joy", "love", "anger", "fear", "surprise", "Unknown"]
class_ids = list(range(len(class_names)))

accuracy = accuracy_score(df['label'], df['predicted'])
precision = precision_score(df['label'], df['predicted'], average='macro')
recall = recall_score(df['label'], df['predicted'], average='macro')
f1 = f1_score(df['label'], df['predicted'], average='macro')

# Pin the label set so the matrix always has one row/column per entry of class_names.
# Without `labels`, its shape depends on which classes happen to occur in the
# down-sampled data and would not line up with the tick labels passed to show_cm.
confusion = confusion_matrix(df['label'], df['predicted'], labels=class_ids)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")
show_cm(confusion, class_names)

## Meta's Llama

In [None]:
# Same repository id (including capitalisation) as the text-generation pipeline cell
model_name = "meta-llama/Meta-Llama-3-8B"

# Access tokens must not be committed to the notebook: read the token from the
# HF_TOKEN environment variable instead. When it is unset, token is None and
# authentication is left to the library (e.g. a prior `huggingface-cli login`).
token = os.environ.get("HF_TOKEN")

# `token=` replaces the deprecated `use_auth_token=` argument
model = AutoModel.from_pretrained(model_name, token=token)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

In [None]:
model_id = "meta-llama/Meta-Llama-3-8B"

# Bound to its own name: assigning the result to `pipeline` would shadow the
# `pipeline` factory imported from transformers at the top of the notebook and
# break any cell that calls that factory when it is re-run after this one.
llama_pipeline = transformers.pipeline("text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto", max_new_tokens=15)

# Zero-shot prompt: instruction line followed by the sentence to classify
llama_pipeline('''Only consider these 6 emotions (sadness, joy, love, anger, fear, surprise), identify the most prevalent one in the following sentence:
i feel so enraged but helpless at the same time
''')