<a href="https://colab.research.google.com/github/nes-a/emo-tag-project/blob/main/notebooks/%20dataset_comparison_visualizations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the data and output paths used later in
# this notebook all live under /content/drive/MyDrive. Only works inside Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datasets import Dataset
from transformers import AutoTokenizer
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# GoEmotions emotion names. The position of a name in this list is the numeric
# label ID found in the TSV files: load_and_process_go_emotions_split looks names
# up with GOEMOTIONS_LABELS[idx], so the order of this list must not be changed.
GOEMOTIONS_LABELS = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

Define load_and_process_go_emotions_split function

In [None]:
def load_and_process_go_emotions_split(filepath):
    try:
        # Load without header, assign names explicitly based on the structure observed
        df_split = pd.read_csv(filepath, sep='\t', encoding='utf-8', header=None, names=['text', 'emotion_ids_str', 'comment_id'])
        print(f"Loaded {filepath.split('/')[-1]}. Initial shape: {df_split.shape}")
        print(f"Columns in {filepath.split('/')[-1]} after initial load: {df_split.columns.tolist()}")

        # Convert comma-separated string of numerical IDs into a list of integers
        df_split['emotion_ids_list'] = df_split['emotion_ids_str'].apply(
            lambda x: [int(label_id) for label_id in str(x).split(',') if label_id.strip().isdigit()]
        )

        # Map these numerical IDs to the actual emotion names using GOEMOTIONS_LABELS
        df_split['emotion_names'] = df_split['emotion_ids_list'].apply(
            lambda ids: [GOEMOTIONS_LABELS[idx] for idx in ids if idx < len(GOEMOTIONS_LABELS)]
        )

        # Select only the 'text' and the new 'emotion_names' list column
        df_processed = df_split[['text', 'emotion_names']].rename(columns={'emotion_names': 'emotion'}).copy()
        print(f"Processed {filepath.split('/')[-1]}. Final shape: {df_processed.shape}")
        return df_processed
    except FileNotFoundError:
        print(f"Error: {filepath.split('/')[-1]} not found. Please ensure all 3 GoEmotions TSV files are in the specified Drive path.")
        return pd.DataFrame(columns=['text', 'emotion'])
    except Exception as e:
        print(f"An error occurred loading or processing {filepath.split('/')[-1]}: {e}")
        return pd.DataFrame(columns=['text', 'emotion'])

Define paths

In [None]:
# Directory on the mounted Drive that holds the three official GoEmotions TSV splits.
go_emotions_data_path = "/content/drive/MyDrive/ML_Emotion_Classifier/data/go_emotions/"
go_emotions_train_filepath = os.path.join(go_emotions_data_path, "train.tsv")
go_emotions_dev_filepath = os.path.join(go_emotions_data_path, "dev.tsv")
go_emotions_test_filepath = os.path.join(go_emotions_data_path, "test.tsv")

# CSV with the synthetic examples that are appended to the training split.
synthetic_data_filepath = "/content/drive/MyDrive/ML_Emotion_Classifier/data/synthetic/df_synthetic_goemotions_processed.csv"

Load and process GoEmotions splits

In [None]:
print("--- Loading and processing official GoEmotions splits ---")
# Load train, dev and test in that order; each call prints its own progress
# and returns an empty frame if its file cannot be read.
df_go_emotions_train, df_go_emotions_val, df_go_emotions_test = [
    load_and_process_go_emotions_split(split_path)
    for split_path in (
        go_emotions_train_filepath,
        go_emotions_dev_filepath,
        go_emotions_test_filepath,
    )
]
print("--------------------------------------------------------------------------------")

Load synthetic data

In [None]:
print("--- Loading synthetic data ---")
# Read the synthetic examples as-is; the 'emotion' column is still raw CSV text
# at this point and is converted to lists at the end of this cell.
df_synthetic = pd.read_csv(synthetic_data_filepath)

def parse_emotion_string(s):
    """Turn one raw 'emotion' cell into a clean list of emotion names.

    Accepts the textual forms a list takes after a CSV round trip, e.g.
    ``"['joy', 'love']"``, ``'["joy"]'``, ``"joy, love"`` or ``"joy"``.

    Args:
        s: The raw cell value. May be a string, a missing value (NaN/None),
            or an already-parsed list/tuple of labels.

    Returns:
        A list of label strings with surrounding brackets, quotes and
        whitespace removed. Missing values, empty lists and labels that are
        empty after cleaning produce no entries.
    """
    # Already a sequence of labels (e.g. the column was parsed before):
    # checked first because pd.isna() on a multi-element list is ambiguous.
    if isinstance(s, (list, tuple)):
        cleaned = (str(item).strip() for item in s)
        return [label for label in cleaned if label]

    if pd.isna(s):
        return []

    s = str(s).strip()
    if s.startswith('[') and s.endswith(']'):
        s = s[1:-1]

    emotions = []
    for token in s.split(','):
        # Strip whitespace, then quotes, then whitespace that was inside the quotes.
        label = token.strip().strip("'\"").strip()
        if label:  # skip tokens such as '' that are empty once unquoted
            emotions.append(label)
    return emotions

# Replace every raw 'emotion' cell with the list returned by parse_emotion_string,
# so the column holds lists just like the GoEmotions frames.
df_synthetic['emotion'] = df_synthetic['emotion'].apply(parse_emotion_string)
print(f"Loaded synthetic data. Shape: {df_synthetic.shape}")
print("--------------------------------------------------------------------------------")

Combine training data: df_go_emotions_train + df_synthetic

In [None]:
print("--- Combining datasets ---")
# Only the training split is augmented: official train rows followed by the synthetic rows.
train_df = pd.concat([df_go_emotions_train, df_synthetic], ignore_index=True)
# Validation and test stay the official dev/test data. They are copied so that
# columns added to val_df / test_df later in the notebook do not also appear on
# df_go_emotions_val / df_go_emotions_test.
val_df = df_go_emotions_val.copy()
test_df = df_go_emotions_test.copy()
print(f"Combined train_df shape: {train_df.shape}")
print(f"Val_df shape: {val_df.shape}")
print(f"Test_df shape: {test_df.shape}")
print("--------------------------------------------------------------------------------")

Multi-hot encoding

In [None]:
print("--- Applying Multi-hot Encoding ---")
mlb = MultiLabelBinarizer()

# Fit on the fixed GoEmotions label vocabulary (not on the data) so that every
# split is encoded with the same set of columns.
# NOTE: MultiLabelBinarizer keeps classes_ in sorted order, so column i of the
# encoding corresponds to mlb.classes_[i], not to GOEMOTIONS_LABELS[i].
mlb.fit([GOEMOTIONS_LABELS])

# Encode each split once and derive both columns from the same matrix:
#   multi_hot_labels - one integer 0/1 vector per row
#   labels           - the same vector as floats, the column name/type used for training
for split_df in (train_df, val_df, test_df):
    encoded = mlb.transform(split_df['emotion'])
    split_df['multi_hot_labels'] = list(encoded)
    split_df['labels'] = list(encoded.astype(float))

print("Multi-hot encoding applied.")
print(f"Example train_df 'labels' entry: {train_df['labels'][0]}")
print(f"Number of emotion classes: {len(mlb.classes_)}")
print(f"Classes: {mlb.classes_.tolist()}")
print("--------------------------------------------------------------------------------")

Convert to Hugging Face Dataset

In [None]:
print("--- Converting to Hugging Face Dataset ---")

# One Hugging Face Dataset per split, built straight from the pandas frames.
train_hf_dataset, val_hf_dataset, test_hf_dataset = [
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, val_df, test_df)
]

print("Hugging Face Datasets created.")
# Show which columns each converted dataset carries.
print(f"train_hf_dataset columns: {train_hf_dataset.column_names}")
print(f"val_hf_dataset columns: {val_hf_dataset.column_names}")
print(f"test_hf_dataset columns: {test_hf_dataset.column_names}")
print("--------------------------------------------------------------------------------")

Tokenization

In [None]:
print("--- Applying Tokenization ---")

# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: A batch mapping that provides the texts under the key 'text'.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    batch_texts = examples['text']
    encoded_batch = tokenizer(
        batch_texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded_batch

# Tokenize train, validation and test (in that order), one batch at a time.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = [
    hf_split.map(tokenize_function, batched=True)
    for hf_split in (train_hf_dataset, val_hf_dataset, test_hf_dataset)
]

print("--- Removing columns and setting format ---")


def _strip_and_format(tokenized_split):
    """Drop the raw text/label-helper columns and expose the model inputs as torch tensors."""
    trimmed = tokenized_split.remove_columns(["text", "emotion", "multi_hot_labels"])
    trimmed.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return trimmed


tokenized_train_dataset = _strip_and_format(tokenized_train_dataset)
tokenized_val_dataset = _strip_and_format(tokenized_val_dataset)
tokenized_test_dataset = _strip_and_format(tokenized_test_dataset)

print("Datasets prepared with multi-hot labels for train, validation, and test.")
print("--------------------------------------------------------------------------------")

Visualize Dataset Sizes

In [None]:
# Folder on the mounted Drive where every figure of this notebook is saved.
visualization_output_dir = "/content/drive/MyDrive/ML_Emotion_Classifier/visualizations/" # change to your own path

# Create the folder (and any missing parents); a no-op if it already exists.
os.makedirs(visualization_output_dir, exist_ok=True)

print(f"Visualization output directory set to: {visualization_output_dir}")

In [None]:
print("--- Generating Dataset Size Visualization ---")
# Row counts of every frame involved; "Combined Train Data" is train + synthetic.
dataset_sizes = {
    "GoEmotions Train (Original)": len(df_go_emotions_train),
    "GoEmotions Validation": len(df_go_emotions_val),
    "GoEmotions Test": len(df_go_emotions_test),
    "Synthetic Data": len(df_synthetic),
    "Combined Train Data": len(train_df)
}

df_sizes = pd.DataFrame(list(dataset_sizes.items()), columns=['Dataset', 'Number of Samples'])

fig, ax = plt.subplots(figsize=(12, 7))
# `palette` without `hue` is deprecated in seaborn >= 0.13. Colouring by the
# x variable gives the same one-colour-per-bar result; dodge=False keeps the
# bars full width on older seaborn versions.
sns.barplot(
    x='Dataset',
    y='Number of Samples',
    hue='Dataset',
    data=df_sizes,
    palette='viridis',
    dodge=False,
    ax=ax,
)
# The hue legend would only repeat the x tick labels; drop it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Number of Samples in Each Dataset', fontsize=16)
ax.set_xlabel('Dataset', fontsize=12)
ax.set_ylabel('Number of Samples', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(os.path.join(visualization_output_dir, 'dataset_sizes.png'), dpi=300, bbox_inches='tight')
plt.show()
print("--------------------------------------------------------------------------------")

Visualize Emotion Distribution

In [None]:
print("--- Generating Emotion Distribution Visualizations ---")

def get_emotion_counts(df, top_n=15):
    """Count how often each emotion label occurs and keep the most frequent ones.

    Args:
        df: DataFrame whose 'emotion' column holds one iterable of labels per row.
        top_n: Maximum number of labels to return (default 15).

    Returns:
        A Series indexed by label with its occurrence count, most frequent
        first, limited to `top_n` entries.
    """
    flattened_labels = []
    for row_labels in df['emotion']:
        flattened_labels.extend(row_labels)
    label_frequencies = pd.Series(flattened_labels).value_counts()
    return label_frequencies.head(top_n)

# Get counts for each dataset
go_emotions_train_counts = get_emotion_counts(df_go_emotions_train)
synthetic_counts = get_emotion_counts(df_synthetic)
combined_train_counts = get_emotion_counts(train_df)

fig, axes = plt.subplots(3, 1, figsize=(14, 18))
fig.suptitle('Top Emotion Distribution Across Datasets', fontsize=20)

# One panel per dataset, top to bottom: (label counts, panel title).
emotion_panels = [
    (go_emotions_train_counts, 'GoEmotions Train (Original) - Top Emotions'),
    (synthetic_counts, 'Synthetic Data - Top Emotions'),
    (combined_train_counts, 'Combined Train Data - Top Emotions'),
]

for ax, (panel_counts, panel_title) in zip(axes, emotion_panels):
    # `palette` without `hue` is deprecated in seaborn >= 0.13. Colouring by
    # the x values gives the same one-colour-per-bar result; dodge=False keeps
    # the bars full width on older seaborn versions.
    sns.barplot(
        x=panel_counts.index,
        y=panel_counts.values,
        hue=panel_counts.index,
        palette='coolwarm',
        dodge=False,
        ax=ax,
    )
    # The hue legend would only repeat the x tick labels; drop it if one was drawn.
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    ax.set_title(panel_title, fontsize=14)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.savefig(os.path.join(visualization_output_dir, 'top_emotion_distribution.png'), dpi=300, bbox_inches='tight')
plt.show()
print("--------------------------------------------------------------------------------")

Visualize Text Length Distribution

In [None]:
print("--- Generating Text Length Distribution Visualizations ---")

# Character count per text, stored as a new 'text_length' column on each frame.
# .str.len() is used instead of .apply(len) so that a missing text becomes NaN
# rather than raising TypeError.
for frame in (df_go_emotions_train, df_synthetic, train_df):
    frame['text_length'] = frame['text'].str.len()

fig, axes = plt.subplots(3, 1, figsize=(14, 18))
fig.suptitle('Text Length Distribution Across Datasets', fontsize=20)

# One panel per dataset, top to bottom: (frame, panel title, histogram colour).
length_panels = [
    (df_go_emotions_train, 'GoEmotions Train (Original) - Text Length', 'skyblue'),
    (df_synthetic, 'Synthetic Data - Text Length', 'salmon'),
    (train_df, 'Combined Train Data - Text Length', 'lightgreen'),
]

for ax, (frame, panel_title, panel_color) in zip(axes, length_panels):
    sns.histplot(frame['text_length'], bins=50, kde=True, ax=ax, color=panel_color)
    ax.set_title(panel_title, fontsize=14)
    ax.set_xlabel('Text Length (Characters)', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.savefig(os.path.join(visualization_output_dir, 'text_length_distribution.png'), dpi=300, bbox_inches='tight')
plt.show()
print("--------------------------------------------------------------------------------")

Analyzing Increase in Underrepresented Emotions

In [None]:
print("--- Analyzing Increase in Underrepresented Emotions ---")

def get_all_emotion_occurrences(df):
    """Flatten the per-row label lists of df['emotion'] into one Series.

    Args:
        df: DataFrame whose 'emotion' column holds one iterable of labels per row.

    Returns:
        A Series containing every label occurrence, in row order (a label that
        appears in several rows appears several times).
    """
    occurrences = [
        label
        for row_labels in df['emotion']
        for label in row_labels
    ]
    return pd.Series(occurrences)

# Occurrences per emotion in the original (non-augmented) GoEmotions training split.
# Only emotions that occur at least once show up here, so an emotion with zero
# occurrences can never be selected below.
original_emotion_counts = get_all_emotion_occurrences(df_go_emotions_train).value_counts()

# How many of the rarest emotions to analyse.
N_LEAST_REPRESENTED = 10
counts_rarest_first = original_emotion_counts.sort_values(ascending=True)
least_represented_emotions = list(counts_rarest_first.index[:N_LEAST_REPRESENTED])

print(f"\nTop {N_LEAST_REPRESENTED} least represented emotions in original GoEmotions training data:")
print(original_emotion_counts.loc[least_represented_emotions].sort_values(ascending=True))

Emotion counts for the combined training data

In [None]:
# Occurrences per emotion after the synthetic rows were added to the training split.
combined_emotion_counts = get_all_emotion_occurrences(train_df).value_counts()

# One record per under-represented emotion: counts before/after and the growth.
comparison_data = []
for emotion in least_represented_emotions:
    original_count = original_emotion_counts.get(emotion, 0)
    combined_count = combined_emotion_counts.get(emotion, 0)
    increase = combined_count - original_count

    if original_count > 0:
        percentage_increase = increase / original_count * 100
    elif combined_count > 0:
        # No original occurrences to divide by: reported as a flat 100%.
        percentage_increase = 100
    else:
        percentage_increase = 0

    record = {
        'Emotion': emotion,
        'Original Count': original_count,
        'Combined Count': combined_count,
        'Increase': increase,
        'Percentage Increase (%)': f"{percentage_increase:.2f}%",
    }
    comparison_data.append(record)

# Rarest emotion (by original count) first.
df_comparison = pd.DataFrame(comparison_data).sort_values(by='Original Count', ascending=True)

print("\nComparison of Underrepresented Emotions (Original vs. Combined Data):")
print(df_comparison.to_string(index=False))

Visualize the increase

In [None]:
# Long format for seaborn: one row per (emotion, dataset type) with its count.
count_columns = ['Emotion', 'Original Count', 'Combined Count']
df_plot = df_comparison[count_columns].melt(
    id_vars='Emotion',
    var_name='Dataset Type',
    value_name='Count',
)

fig, ax = plt.subplots(figsize=(14, 8))
# Grouped bars (original vs. combined) per emotion, in the table's order.
sns.barplot(
    data=df_plot,
    x='Emotion',
    y='Count',
    hue='Dataset Type',
    order=df_comparison['Emotion'],
    palette='pastel',
    ax=ax,
)
ax.set_title(f'Increase in {N_LEAST_REPRESENTED} Least Represented Emotions (Original vs. Combined)', fontsize=16)
ax.set_xlabel('Emotion', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend(title='Dataset')
fig.tight_layout()
fig.savefig(os.path.join(visualization_output_dir, 'least_represented_emotions_increase.png'), dpi=300, bbox_inches='tight')
plt.show()

print("--------------------------------------------------------------------------------")

Analyzing All Emotion Frequencies (Original vs. Combined Data)

In [None]:
print("--- Analyzing All Emotion Frequencies (Original vs. Combined Data) ---")

# NOTE: this name is already defined earlier in the notebook with the same
# behaviour; running this cell replaces that definition with an equivalent one.
def get_all_emotion_occurrences(df):
    """Collect every label from the per-row lists in df['emotion'] into one Series.

    Args:
        df: DataFrame whose 'emotion' column holds one iterable of labels per row.

    Returns:
        A Series with one entry per label occurrence, in row order.
    """
    return pd.Series([
        label
        for row_labels in df['emotion']
        for label in row_labels
    ])

# Occurrences per emotion before (original train split) and after (train + synthetic).
original_emotion_counts = get_all_emotion_occurrences(df_go_emotions_train).value_counts()
combined_emotion_counts = get_all_emotion_occurrences(train_df).value_counts()

# One record for EVERY GoEmotions label, including labels that never occur
# (those get a count of 0 via .get's default).
comparison_data_all_emotions = []
for emotion in GOEMOTIONS_LABELS:
    original_count = original_emotion_counts.get(emotion, 0)
    combined_count = combined_emotion_counts.get(emotion, 0)
    increase = combined_count - original_count

    if original_count > 0:
        percentage_increase = increase / original_count * 100
    elif combined_count > 0:
        # No original occurrences to divide by: reported as a flat 100%.
        percentage_increase = 100
    else:
        percentage_increase = 0

    record = {
        'Emotion': emotion,
        'Original Count': original_count,
        'Combined Count': combined_count,
        'Increase': increase,
        'Percentage Increase (%)': f"{percentage_increase:.2f}%",
    }
    comparison_data_all_emotions.append(record)

# Rarest emotion (by original count) first.
df_comparison_all_emotions = pd.DataFrame(comparison_data_all_emotions).sort_values(
    by='Original Count', ascending=True
)

print("\nComparison of All Emotion Frequencies (Original vs. Combined Data):")
print(df_comparison_all_emotions.to_string(index=False))

Visualize the comparison for all emotions

In [None]:
# Long format for seaborn: one row per (emotion, dataset type) with its count.
all_count_columns = ['Emotion', 'Original Count', 'Combined Count']
df_plot_all_emotions = df_comparison_all_emotions[all_count_columns].melt(
    id_vars='Emotion',
    var_name='Dataset Type',
    value_name='Count',
)

fig, ax = plt.subplots(figsize=(18, 10))
# Grouped bars (original vs. combined) for every emotion, in the table's order.
sns.barplot(
    data=df_plot_all_emotions,
    x='Emotion',
    y='Count',
    hue='Dataset Type',
    order=df_comparison_all_emotions['Emotion'],
    palette='viridis',
    ax=ax,
)
ax.set_title('All Emotion Frequencies: Original GoEmotions Train vs. Combined Train Data', fontsize=18)
ax.set_xlabel('Emotion', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=60, ha='right', fontsize=10)
plt.setp(ax.get_yticklabels(), fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend(title='Dataset', fontsize=10, title_fontsize=12)
fig.tight_layout()
fig.savefig(os.path.join(visualization_output_dir, 'all_emotion_frequencies.png'), dpi=300, bbox_inches='tight')
plt.show()

print("--------------------------------------------------------------------------------")