In [1]:
import pandas as pd

def emoji_set(cell):
    """Return the set of whitespace-separated tokens found in one CSV cell.

    A missing cell (``NaN``/``None``) yields an empty set.  Passing it through
    ``str()`` instead would produce the literal token ``"nan"``, which would
    then be scored as if it were an emoji.
    """
    if pd.isna(cell):
        return set()
    return set(str(cell).split())


def score_emojis(true_cell, pred_cell):
    """Compare the predicted emojis of one row against the true ones.

    Both arguments are raw cell values (a whitespace-separated string, or a
    missing value).  Duplicated tokens inside a cell are ignored because the
    comparison is set based.

    Returns a dict with the keys ``precision``, ``recall``, ``f1_score``
    (each rounded to two decimals) and ``exact_match`` (1 or 0).
    """
    true_set = emoji_set(true_cell)
    pred_set = emoji_set(pred_cell)

    if not true_set and not pred_set:
        # Nothing expected and nothing predicted: the prediction is exactly
        # right, so the ratios are reported as perfect to agree with
        # exact_match instead of falling into the 0/0 branches below.
        precision = recall = f1 = 1.0
    else:
        overlap = len(true_set & pred_set)
        # An empty side makes the corresponding ratio undefined; report 0.
        precision = overlap / len(pred_set) if pred_set else 0.0
        recall = overlap / len(true_set) if true_set else 0.0
        denominator = precision + recall
        f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0

    return {
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1_score': round(f1, 2),
        'exact_match': int(true_set == pred_set),
    }


# The statements below stay at module scope (rather than inside a function)
# so that df, results_df, etc. are still defined as globals after the cell
# or script has run; the guard only keeps an import from triggering the I/O.
if __name__ == "__main__":
    # Load both files
    df_true = pd.read_csv('emoji_results/tweets_with_emojis_initial.csv')
    df_pred = pd.read_csv('emoji_results/tweets_with_emojis_predicted.csv')

    # Merge on ID (inner join: only ids present in both files are scored)
    df = pd.merge(
        df_true[['id', 'emojis']],
        df_pred[['id', 'emojis']],
        on='id',
        suffixes=('_true', '_pred'),
    )

    # One result record per merged row; the raw cell values are kept as-is
    # in the output so the CSV shows exactly what was compared.
    results = [
        {
            'id': id_,
            'true_emojis': true_cell,
            'predicted_emojis': pred_cell,
            **score_emojis(true_cell, pred_cell),
        }
        for id_, true_cell, pred_cell in zip(
            df['id'], df['emojis_true'], df['emojis_pred']
        )
    ]

    # Create and save results DataFrame.  The columns are given explicitly so
    # an empty merge still produces a frame (and a CSV header) with them.
    results_df = pd.DataFrame(
        results,
        columns=[
            'id',
            'true_emojis',
            'predicted_emojis',
            'precision',
            'recall',
            'f1_score',
            'exact_match',
        ],
    )
    results_df.to_csv('emoji_comparison_results.csv', index=False)

    # Summary
    total = len(results_df)
    exact_matches = results_df['exact_match'].sum()
    print("Saved results to 'emoji_comparison_results.csv'")
    print(f"Total rows: {total}")
    if total:
        print(f"Exact matches: {exact_matches} ({exact_matches / total:.2%})")
        print(f"Avg Precision: {results_df['precision'].mean():.2f}")
        print(f"Avg Recall: {results_df['recall'].mean():.2f}")
        print(f"Avg F1 Score: {results_df['f1_score'].mean():.2f}")
    else:
        # No shared ids: a ratio or an average over zero rows is undefined.
        print("No ids are shared by the two files; nothing to score.")


Saved results to 'emoji_comparison_results.csv'
Total rows: 1000
Exact matches: 74 (7.40%)
Avg Precision: 0.67
Avg Recall: 0.54
Avg F1 Score: 0.58
