# Import Libraries

In [9]:
import csv
import urllib.request

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Register tqdm with pandas so that `progress_apply` (used when scoring the
# test cases) is available and shows a progress bar.
tqdm.pandas()

# Generate Sentiment

In [15]:
def preprocess(text):
    """ Mask user mentions and links with fixed placeholders
    
    Parameters
    ===========
    text       : str
                 raw text; it is split on single spaces
    
    Returns
    ==========
    preprocess :  str
                  text where every mention token (starts with '@' and has
                  more than one character) is replaced by '@user' and every
                  token starting with 'http' is replaced by 'http'
    """
    def _mask(token):
        # A lone '@' is not a mention, so it is left untouched.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

def load_models():
    """ fetch the sentiment classifier, its tokenizer and the class names
    
    Returns
    =========
    model   :   RobertaForSequenceClassification
                pre-trained model for sentiment classifier
    tokenizer : RobertaTokenizerFast
                tokenizer used for the model
    labels    : list
                list of classes for sentiment classifier, in the order
                they appear in the downloaded mapping file
    """
    MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
    mapping_link = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"

    # Both the classifier and its tokenizer are resolved from the same checkpoint name
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # The label mapping is a tab-separated text file fetched over HTTP
    with urllib.request.urlopen(mapping_link) as response:
        mapping_lines = response.read().decode('utf-8').split("\n")

    # Keep the second column of every row that has one (skips blank lines)
    labels = []
    for row in csv.reader(mapping_lines, delimiter='\t'):
        if len(row) > 1:
            labels.append(row[1])

    return model, tokenizer, labels

def generate_sentiment(text, model, tokenizer, labels):
    """ generate sentiment with confidence score given the text
    
    Parameters
    ============
    text      : str
                input text to get sentiment
    model     : callable
                sentiment classifier; called as model(**encoded_input) and
                must return an object exposing a `logits` tensor
    tokenizer : callable
                called as tokenizer(text, return_tensors='pt'); its result
                is unpacked as keyword arguments for the model
    labels    : list
                class names, indexed by position in the model output
    
    Returns
    ===========
    generate_sentiment : tuple
                         label string and confidence score (softmax
                         probability of the highest-scoring class)
    """
    # preprocess text
    text = preprocess(text)

    # tokenize
    # NOTE(review): no truncation is requested here, so the full text is
    # encoded whatever its length — confirm inputs fit the model.
    encoded_input = tokenizer(text, return_tensors='pt')

    # get sentiments
    # Inference only: disabling autograd avoids building a graph per call.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = softmax(output.logits[0].detach().numpy())

    # Only the best class is needed, so argmax replaces a full sort.
    top = int(np.argmax(scores))

    return labels[top], scores[top]


In [16]:
# Load the test cases and the classifier, then score every row of the
# "text" column; each result is a (label, confidence) tuple.
df = pd.read_csv("sentiment_test_cases.csv")
model, tokenizer, labels = load_models()


def _score_text(text):
    """Classify one text with the already-loaded model, tokenizer and labels."""
    return generate_sentiment(text, model, tokenizer, labels)


sentiments = df["text"].progress_apply(_score_text)


100%|████████████████████████████████████████████████████████████████████████████████| 498/498 [01:11<00:00,  6.92it/s]


In [29]:
# Unpack each (label, score) result into its own column.
df["model_output"] = sentiments.apply(lambda x: x[0])
# Cast the score to a built-in float before scaling and rounding: the score is
# a NumPy scalar (float32 for a default-precision model — TODO confirm), and
# rounding in float32 can leave representation noise in the exported CSV.
df["confidence_score"] = sentiments.apply(lambda x: round(float(x[1]) * 100, 2))
df.to_csv("output_sentiment_test.csv", index=False)

# Evaluate Accuracy with test data

In [30]:
# Reload the predictions from the exported CSV, so the evaluation runs on the
# file contents; this rebinds `df`.
df = pd.read_csv("output_sentiment_test.csv")

In [31]:
# Fraction of rows whose predicted label equals the expected label,
# rounded to two decimals.
accuracy = round((df["expected_sentiment"] == df["model_output"]).mean(), 2)
# Let the `%` format type do the percent conversion: multiplying the rounded
# fraction by 100 can expose binary floating-point noise
# (e.g. 0.57 * 100 -> 56.99999999999999).
print(f"Accuracy: {accuracy:.1%}")

Accuracy: 84.0%
