# 📁 04_clean_and_balance_labeled_data.ipynb

# 🎯 Step 4: Clean and Balance Labeled Data

"""
This notebook processes the manually labeled CSV file.
It removes rows whose label was left empty (i.e., unusable audio segments)
and balances the dataset by downsampling every class to the size of the
smallest one.
Labels are expected to be numerical:
0: Neutral, 1: Happy, 2: Sad, 3: Angry
"""

In [None]:
# 📂 Attach Google Drive to the Colab filesystem
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# 📦 Libraries
import pandas as pd

In [None]:
# 📄 Location of the manually labeled CSV on the mounted Drive
labeled_csv = (
    "/content/drive/MyDrive/initial_labels.csv"
)

In [None]:
# 📖 Read the manually labeled rows into a DataFrame
labeled_df = pd.read_csv(filepath_or_buffer=labeled_csv)

# Report the starting size so the effect of the cleaning step is visible.
print(f"📦 Total rows before cleaning: {labeled_df.shape[0]}")

In [None]:
# 🧹 Remove rows where label is empty
# A cell that was skipped during labeling is read as NaN, but a cell holding
# only whitespace is read as text and would pass a plain notnull() check,
# ending up as a bogus extra class. Treat both as "empty" and drop them.
raw_labels = labeled_df["label"]
is_empty = raw_labels.isna() | raw_labels.astype(str).str.strip().eq("")

# Re-number the surviving rows from 0 so later steps see a contiguous index.
cleaned_df = labeled_df[~is_empty].reset_index(drop=True)

print(f"✅ Total rows after cleaning: {len(cleaned_df)}")

In [None]:
# ⚖️ Balance the dataset
# Every class is downsampled to the size of the rarest class so that all
# labels end up equally represented.
class_counts = cleaned_df['label'].value_counts()
if class_counts.empty:
    # Without this guard min() yields NaN and sample() fails with an obscure
    # error; stop early with a clear message and do not write an output file.
    raise ValueError(
        "No labeled rows left after cleaning; cannot balance the dataset."
    )

min_count = int(class_counts.min())
print(f"✅ Minimum samples per class: {min_count}")

# A fixed random_state keeps the selected subset reproducible across runs.
balanced_df = cleaned_df.groupby('label').sample(n=min_count, random_state=42)
balanced_df = balanced_df.reset_index(drop=True)

# 💾 Save balanced labeled data
balanced_df.to_csv("/content/drive/MyDrive/balanced_data.csv", index=False)

print(f"✅ Balanced dataset created with {len(balanced_df)} samples.")