In [1]:
import pandas as pd

In [2]:
# Location of the processed dataset, relative to the notebook's working directory.
input_csv = "data/processed/dataset_final_selected_processed copy.csv"

# Read the whole CSV into memory; later cells use the 'Sentiment', 'Emotion',
# 'absa_overall_sentiment' and 'absa_overall_emotion' columns of this frame.
df = pd.read_csv(filepath_or_buffer=input_csv)

In [3]:
# 1. Sentiment confusion matrix: rows are the old labels ('Sentiment'),
#    columns are the new labels ('absa_overall_sentiment').
sentiment_confusion = pd.crosstab(
    index=df['Sentiment'],
    columns=df['absa_overall_sentiment'],
)
# Heading and table are emitted on separate lines by a single print call.
print("Old vs. new sentiment confusion matrix:", sentiment_confusion, sep="\n")

Old vs. new sentiment confusion matrix:
absa_overall_sentiment  Negative  Neutral  Positive
Sentiment                                          
Negative                    3410      318       272
Positive                     524      348      7128


In [4]:
# 2. Emotion confusion matrix: rows are the old labels ('Emotion'),
#    columns are the new labels ('absa_overall_emotion').
emotion_confusion = pd.crosstab(
    index=df['Emotion'],
    columns=df['absa_overall_emotion'],
)
# The heading starts with a newline so the table is visually separated
# from whatever was printed before it.
print("\nOld vs. new emotion confusion matrix:", emotion_confusion, sep="\n")


Old vs. new emotion confusion matrix:
absa_overall_emotion  Anger  Fear  Happy  Love  Sadness
Emotion                                                
Anger                   919     2     26     1      115
Fear                    254    27     90     0       53
Happy                   403    11   5341   206      209
Love                     51     2   1636   124       17
Sadness                1634    15    272     4      588


In [5]:
# 3. Add match/mismatch columns
# Sentiment match: compare the old label ('Sentiment') with the new label
# ('absa_overall_sentiment'), ignoring case and surrounding whitespace.
# A row is a 'match' only when BOTH cells hold strings that are equal after
# normalization. Missing or non-string values count as 'mismatch', and so does
# Neutral against Positive/Negative.
#
# The comparison is done column-wise instead of with a row-wise
# df.apply(..., axis=1): each column is normalized once and the two results are
# compared as whole Series. This avoids building a Series object per row and
# also works unchanged on an empty frame.
sentiment_old_norm = df['Sentiment'].map(
    lambda label: label.strip().lower() if isinstance(label, str) else None
)
sentiment_new_norm = df['absa_overall_sentiment'].map(
    lambda label: label.strip().lower() if isinstance(label, str) else None
)

# The explicit notna() guards make "non-string on either side => mismatch"
# independent of how pandas compares missing values with each other.
sentiment_is_match = (
    sentiment_old_norm.notna()
    & sentiment_new_norm.notna()
    & sentiment_old_norm.eq(sentiment_new_norm)
)
df['sentiment_comparison'] = sentiment_is_match.map({True: 'match', False: 'mismatch'})

In [6]:
# Emotion match: compare the old label ('Emotion') with the new label
# ('absa_overall_emotion'), ignoring case and surrounding whitespace.
# A row is a 'match' only when BOTH cells hold strings that are equal after
# normalization; missing or non-string values count as 'mismatch'.
#
# The comparison is done column-wise instead of with a row-wise
# df.apply(..., axis=1): each column is normalized once and the two results are
# compared as whole Series. This avoids building a Series object per row and
# also works unchanged on an empty frame.
emotion_old_norm = df['Emotion'].map(
    lambda label: label.strip().lower() if isinstance(label, str) else None
)
emotion_new_norm = df['absa_overall_emotion'].map(
    lambda label: label.strip().lower() if isinstance(label, str) else None
)

# The explicit notna() guards make "non-string on either side => mismatch"
# independent of how pandas compares missing values with each other.
emotion_is_match = (
    emotion_old_norm.notna()
    & emotion_new_norm.notna()
    & emotion_old_norm.eq(emotion_new_norm)
)
df['emotion_comparison'] = emotion_is_match.map({True: 'match', False: 'mismatch'})

In [7]:
# 4. Report how many rows agree / disagree for each annotation type.
#    Each entry pairs the heading to print with the comparison column to tally.
for heading, comparison_col in (
    ("\nSentiment match/mismatch:", 'sentiment_comparison'),
    ("\nEmotion match/mismatch:", 'emotion_comparison'),
):
    print(heading)
    print(df[comparison_col].value_counts())

# 5. Write the frame, including the comparison columns, to a CSV in the
#    current working directory; the row index is not written.
df.to_csv(path_or_buf="dataset_with_annotation_comparison.csv", index=False)



Sentiment match/mismatch:
sentiment_comparison
match       10538
mismatch     1462
Name: count, dtype: int64

Emotion match/mismatch:
emotion_comparison
match       6999
mismatch    5001
Name: count, dtype: int64
