<a href="https://colab.research.google.com/github/ntxuan1799/Aboriginal-Project/blob/main/emotionovertime_referendum2023_socialmedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import urllib.request

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from tqdm.notebook import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Read the input table; the emotion step further down reads its
# `combined_text` column.
df = pd.read_csv(filepath_or_buffer="lda_with_topic_labels.csv")

In [None]:
# Load model and tokenizer
task = 'emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Download emotion label mappings: a tab-separated text file whose second
# field on each row is the label name. Row order is used as the class index.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
# The timeout stops a stalled connection from hanging the notebook forever.
with urllib.request.urlopen(mapping_link, timeout=30) as f:
    # splitlines() copes with both LF and CRLF endings, so no label keeps a
    # stray '\r'.
    mapping_rows = f.read().decode('utf-8').splitlines()
labels = [row.split('\t')[1].strip() for row in mapping_rows if '\t' in row]

# Fail fast on a truncated or malformed download rather than hitting an
# IndexError part-way through scoring the dataset.
if len(labels) != model.config.num_labels:
    raise ValueError(
        f"Expected {model.config.num_labels} labels from {mapping_link}, "
        f"got {len(labels)}"
    )

# Emotion prediction function
def get_emotion(text):
    """Return the most probable emotion label for *text* and its probability.

    Args:
        text: The text to classify. ``None``, NaN, empty and whitespace-only
            values are treated as missing.

    Returns:
        A ``(label, score)`` tuple. ``label`` is taken from the module-level
        ``labels`` list at the index of the highest softmax probability and
        ``score`` is that probability rounded to four decimals. Missing input
        yields ``("uncategorized", 0.0)``.
    """
    if not text or pd.isna(text):
        return ("uncategorized", 0.0)
    # Whitespace-only text carries nothing to classify; without this guard the
    # model would still be run and return an arbitrary label for it.
    if isinstance(text, str) and not text.strip():
        return ("uncategorized", 0.0)

    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model(**encoded_input)
    # First output element holds the logits; [0] selects the single input row.
    scores = output[0][0].numpy()
    probs = softmax(scores)
    top_idx = int(np.argmax(probs))
    top_label = labels[top_idx]
    # Convert to a Python float before rounding so the stored value is not
    # polluted by float32 representation noise (e.g. 0.9538000226...).
    top_score = round(float(probs[top_idx]), 4)
    return (top_label, top_score)

# Apply to dataset
tqdm.pandas()
# One (label, score) tuple per row, shown with a progress bar.
emotions = df['combined_text'].progress_apply(get_emotion)
# Unpack the tuples directly instead of `.apply(pd.Series)`, which builds a
# separate Series object for every row just to split two values.
df['sentiment_result'] = [label for label, _ in emotions]
df['sentiment_score'] = [score for _, score in emotions]

# Optional: filter out uncategorized if needed
# .copy() makes the filtered frame independent of the original, so later
# column assignments do not trigger SettingWithCopy warnings.
df = df[df['sentiment_result'].str.lower() != 'uncategorized'].copy()

  0%|          | 0/2498 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


In [None]:
# Write the labelled, filtered rows to disk without the DataFrame index column.
df.to_csv(path_or_buf='referendum_sentiment_topics.csv', index=False)