# PART II: model comparison and validation

In [1]:
# !conda install -c conda-forge nltk -y

In [2]:
import pandas as pd
import nltk

In [3]:
train_df = pd.read_csv("train.csv")
train_df = train_df.dropna(subset=['CODE_FINAL'])
train_df = train_df.reset_index(drop=True)
test_df = pd.read_csv("test.csv") 

In [4]:
train_df

Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL
0,672233899615850496,Piece for @TheWorldPost: 2 wars connected by c...,-1.0,-1.0,0.0,-1.0
1,672231727952072704,Soils and #Ocean Omitted From Paris #COP21 Age...,0.0,-1.0,0.0,0.0
2,672231250329919488,"To reverse the Ephedrine Disinformation,@UN mu...",0.0,1.0,1.0,1.0
3,672229503696019456,See photos &amp; highlights from #COP21 side e...,0.0,1.0,0.0,0.0
4,672229362708561921,Our own Ben Parr writes for SBS News on India'...,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...
246,671118301347618817,Hope hurricane #Gorm is the only one hitting a...,,1.0,1.0,1.0
247,671117601196511232,#cop21 stop heating our atmosphere! #climatema...,,-1.0,-1.0,-1.0
248,671117409953038336,Ten Pacific leaders to address the world to ga...,,0.0,0.0,0.0
249,671116917596377088,Intermedia is at the #COP21 https://t.co/bctTG...,,0.0,0.0,0.0


In [5]:
test_df.head()

Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL
0,675826762543140864,@AmedeusKizito I massive milestone in helping ...,1.0,1.0,1.0,1.0
1,675826724605730816,https://t.co/hZwEOmVXpW Time now for Ireland ...,1.0,1.0,1.0,1.0
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0,1.0
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0,-1.0
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0,1.0


##  TASK 2.1: Approach 1: own dictionary

In [6]:
dict_df = pd.read_csv("COPSentimentDict.csv", sep=";") 
dict_df

Unnamed: 0,TERM,SENTIMENT,NOTES
0,ability,1.0,
1,acting,0.0,
2,action,1.0,
3,advice,0.0,
4,affect,1.0,
...,...,...,...
191,watch,,
192,water,,
193,well,1.0,
194,woman,,


In [7]:
dict_df = dict_df.dropna(subset=['SENTIMENT'])
dict_df = dict_df.drop_duplicates(subset=['TERM'], keep='first')
dict_df = dict_df.reset_index(drop=True)
dict_df

Unnamed: 0,TERM,SENTIMENT,NOTES
0,ability,1.0,
1,acting,0.0,
2,action,1.0,
3,advice,0.0,
4,affect,1.0,
...,...,...,...
176,updates,0.0,
177,violation,0.0,
178,violent,0.0,
179,war,-1.0,


In [8]:
nltk.download('punkt')
nltk.download('punkt_tab')

sent_dict = dict(zip(dict_df['TERM'], dict_df['SENTIMENT']))

def predict_with_dict(text):
    tokens = nltk.word_tokenize(text.lower())
    score = 0
    for t in tokens:
        if t in sent_dict:
            score += sent_dict[t]
    return 1 if score > 0 else (-1 if score < 0 else 0)

test_df['pred_dict'] = test_df['TEXT'].apply(predict_with_dict)

[nltk_data] Downloading package punkt to /Users/hwy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/hwy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
test_df.head()

Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL,pred_dict
0,675826762543140864,@AmedeusKizito I massive milestone in helping ...,1.0,1.0,1.0,1.0,1
1,675826724605730816,https://t.co/hZwEOmVXpW Time now for Ireland ...,1.0,1.0,1.0,1.0,-1
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0,1.0,1
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0,-1.0,0
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0,1.0,1


## TASK 2.2: Approach 2: general dictionary

In [10]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

def predict_vader(text):
    score = sid.polarity_scores(text)['compound']
    if score > 0.05:
        return 1
    elif score < -0.05:
        return -1
    return 0

test_df['pred_vader'] = test_df['TEXT'].apply(predict_vader)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hwy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
test_df.head()

Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL,pred_dict,pred_vader
0,675826762543140864,@AmedeusKizito I massive milestone in helping ...,1.0,1.0,1.0,1.0,1,1
1,675826724605730816,https://t.co/hZwEOmVXpW Time now for Ireland ...,1.0,1.0,1.0,1.0,-1,1
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0,1.0,1,1
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0,-1.0,0,-1
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0,1.0,1,1


## TASK 2.3: Approach 3: traditional machine learning

In [12]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
train_df['TEXT'] = train_df['TEXT'].apply(preprocess)
test_df['TEXT'] = test_df['TEXT'].apply(preprocess)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

vectorizer = TfidfVectorizer(max_features=5000) # TF-IDF
X_train = vectorizer.fit_transform(train_df['TEXT'])
y_train = train_df['CODE_FINAL']

clf = MultinomialNB()
clf.fit(X_train, y_train)

X_test = vectorizer.transform(test_df['TEXT'])
test_df['pred_ml'] = clf.predict(X_test)
test_df.head()

Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL,pred_dict,pred_vader,pred_ml
0,675826762543140864,@user I massive milestone in helping earth #CO...,1.0,1.0,1.0,1.0,1,1,0.0
1,675826724605730816,http Time now for Ireland show leadership lik...,1.0,1.0,1.0,1.0,-1,1,0.0
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0,1.0,1,1,0.0
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0,-1.0,0,-1,0.0
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0,1.0,1,1,0.0


In [14]:
test_df['pred_ml'].value_counts()

pred_ml
0.0    998
1.0      2
Name: count, dtype: int64

## TASK 2.4: Approach 4: pre-trained fine-tuned transformer

In [15]:
# !conda install transformers -c conda-forge -y
# !conda install pytorch -c pytorch -y
# !conda install -c conda-forge safetensors -y
# !conda install -c conda-forge tensorflow -y
# !pip install tf-keras
# !pip install transformers==4.49

https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

This is a RoBERTa-base model trained on ~124M tweets from January 2018 to December 2021, and finetuned for sentiment analysis with the **TweetEval** benchmark. The original Twitter-based RoBERTa model can be found here and the original reference paper is TweetEval. This model is suitable for English.

In [16]:
from transformers import pipeline

model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
model = pipeline("sentiment-analysis", 
                 model=model_path,
                tokenizer=model_path,
                framework="pt")

def map_to_three_labels(result):
    label = result[0]['label']
    if label == "NEGATIVE":
        return -1
    elif label == "POSITIVE":
        return 1
    return 0

test_df['pred_transformer'] = test_df['TEXT'].apply(lambda x: map_to_three_labels(model(x)))

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use mps:0


In [19]:
test_df

Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL,pred_dict,pred_vader,pred_ml,pred_transformer
0,675826762543140864,@user I massive milestone in helping earth #CO...,1.0,1.0,1.0,1.0,1,1,0.0,0
1,675826724605730816,http Time now for Ireland show leadership lik...,1.0,1.0,1.0,1.0,-1,1,0.0,0
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0,1.0,1,1,0.0,0
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0,-1.0,0,-1,0.0,0
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0,1.0,1,1,0.0,0
...,...,...,...,...,...,...,...,...,...,...
995,675672269394456576,USATODAY: RT EricJLyman: New #COP21 text is ou...,,,,,0,0,0.0,0
996,675672025566986240,Optimistic about #COP21 agreement but can't wo...,,,,,0,1,0.0,0
997,675671901214126080,@user Petition United Nations: Climate Denial ...,,,,,0,-1,0.0,0
998,675671892196499460,nycjim: #COP21 activists use geolocation to sp...,,,,,0,0,0.0,0


In [31]:
test_df.drop(columns=['SENTIMENT'], inplace=True)

## TASK 2.5: Comparison and model selection

In [20]:
gpt_code = pd.read_csv("gpt.tsv", sep="\t")

In [21]:
gpt_code

Unnamed: 0,ID,SENTIMENT
0,675826762543140864,1
1,675826724605730816,1
2,675826603977515008,1
3,675826598772248576,-1
4,675826488755617793,1
...,...,...
970,675672269394456576,0
971,675672025566986240,0
972,675671901214126080,1
973,675671892196499460,1


In [33]:
test_df = test_df.merge(gpt_code[["ID", "SENTIMENT"]], on="ID", how="left")
test_df.rename(columns={'SENTIMENT': 'pred_gpt'}, inplace=True)
test_df

Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL,pred_dict,pred_vader,pred_ml,pred_transformer,pred_gpt
0,675826762543140864,@user I massive milestone in helping earth #CO...,1.0,1.0,1.0,1.0,1,1,0.0,0,1.0
1,675826724605730816,http Time now for Ireland show leadership lik...,1.0,1.0,1.0,1.0,-1,1,0.0,0,1.0
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0,1.0,1,1,0.0,0,1.0
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0,-1.0,0,-1,0.0,0,-1.0
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0,1.0,1,1,0.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
995,675672269394456576,USATODAY: RT EricJLyman: New #COP21 text is ou...,,,,,0,0,0.0,0,0.0
996,675672025566986240,Optimistic about #COP21 agreement but can't wo...,,,,,0,1,0.0,0,0.0
997,675671901214126080,@user Petition United Nations: Climate Denial ...,,,,,0,-1,0.0,0,1.0
998,675671892196499460,nycjim: #COP21 activists use geolocation to sp...,,,,,0,0,0.0,0,1.0


In [47]:
test_df_compare = test_df.dropna(subset=['CODE_FINAL', 'pred_gpt'])
test_df_compare

Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL,pred_dict,pred_vader,pred_ml,pred_transformer,pred_gpt
0,675826762543140864,@user I massive milestone in helping earth #CO...,1.0,1.0,1.0,1.0,1,1,0.0,0,1.0
1,675826724605730816,http Time now for Ireland show leadership lik...,1.0,1.0,1.0,1.0,-1,1,0.0,0,1.0
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0,1.0,1,1,0.0,0,1.0
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0,-1.0,0,-1,0.0,0,-1.0
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0,1.0,1,1,0.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
104,675797294084587521,#Iran: The #ParisAgreement is a landmark agree...,,,0.0,0.0,1,1,0.0,0,1.0
105,675797085413900288,Analysis: The final Paris climate deal http #C...,,,0.0,0.0,0,0,0.0,0,0.0
106,675796419660406784,Can the sun cool down Earth? Do we have enough...,,,0.0,0.0,0,1,0.0,0,1.0
107,675795932315828224,Aim to peak global #GHG emissions ASAP. #Paris...,,,1.0,1.0,0,0,0.0,0,1.0


In [46]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

y_true = test_df_compare["CODE_FINAL"]

models = ["pred_dict", "pred_vader", "pred_ml", "pred_transformer", "pred_gpt"]

for m in models:
    print("=== ", m, " ===")
    print("Accuracy:", accuracy_score(y_true, test_df_compare[m]))
    print("Macro-F1:", f1_score(y_true, test_df_compare[m], average='macro'))
    print("Confusion:\n", confusion_matrix(y_true, test_df_compare[m]))
    print()


===  pred_dict  ===
Accuracy: 0.3669724770642202
Macro-F1: 0.3208857208857208
Confusion:
 [[ 4  8  2]
 [ 5 27 11]
 [16 27  9]]

===  pred_vader  ===
Accuracy: 0.6146788990825688
Macro-F1: 0.6130268199233716
Confusion:
 [[10  0  4]
 [ 4 18 21]
 [ 2 11 39]]

===  pred_ml  ===
Accuracy: 0.3944954128440367
Macro-F1: 0.18859649122807018
Confusion:
 [[ 0 14  0]
 [ 0 43  0]
 [ 0 52  0]]

===  pred_transformer  ===
Accuracy: 0.3944954128440367
Macro-F1: 0.18859649122807018
Confusion:
 [[ 0 14  0]
 [ 0 43  0]
 [ 0 52  0]]

===  pred_gpt  ===
Accuracy: 0.8165137614678899
Macro-F1: 0.8244916003536694
Confusion:
 [[13  1  0]
 [ 2 26 15]
 [ 0  2 50]]



Analysis:

Across the five models, GPT-based predictions achieve the best overall performance, with the highest accuracy (0.82) and Macro-F1 (0.82). The confusion matrix shows that GPT makes relatively few mistakes on negative and positive classes and performs strongly on the neutral class, indicating robust generalization.

VADER is the next strongest model, with an accuracy of 0.61 and Macro-F1 of 0.61. It performs reasonably well on positive and negative sentiments but struggles more with neutral cases, as shown by the higher confusion among middle-class labels.

Both ML-based and Transformer-based models collapse into predicting only the middle class, resulting in identical confusion matrices and very low Macro-F1 scores (0.19). This suggests severe class imbalance issues or insufficient training signal, leading to degenerate predictions.

The dictionary-based baseline (pred_dict) performs poorly overall (accuracy 0.37), showing the limitations of simple lexicon methods in nuanced sentiment tasks.

In summary, GPT clearly outperforms all other approaches, while VADER offers moderate performance. The ML and transformer models require further tuning to avoid prediction collapse.