## Emotion_verses_labeled_N.csv

In [10]:
import pandas as pd

CSV_PATH = "verses_parsed/emotion_verses_labeled_6.csv"  # Adjust the path if necessary

df = pd.read_csv(CSV_PATH)

# Clean and normalize the labels to lowercase and remove spaces
df['label'] = df['label'].astype(str).str.strip().str.lower()

# Show the count of unique labels
label_counts = df['label'].value_counts()
print("Count of emotions labeled by GPT:")
print(label_counts)
print("\nTotal emotions:", label_counts.sum())
print("Unique emotions found:", list(label_counts.index))


Count of emotions labeled by GPT:
label
neutral        2266
joy            1307
anger           830
sadness         618
fear            434
surprise        267
disgust         207
trust            37
hope             17
peace             4
confidence        2
confusion         1
comfort           1
concern           1
shame             1
gratitude         1
anguish           1
plea              1
humility          1
desperation       1
Name: count, dtype: int64

Total emotions: 5999


## Adjusting wrong emotions labeled by GPT

In [11]:
import pandas as pd

CSV_PATH = "verses_parsed/emotion_verses_labeled_6.csv"  # Adjust if necessary

# Official list of emotions (in lowercase, as per your model)
EMOTIONS_MAIN = ["joy", "sadness", "anger", "fear", "surprise", "neutral", "disgust"]

# Mapping strange emotions to main ones
label_corrections = {
    "hope": "joy",
    "trust": "neutral",
    "shame": "sadness",
    "confidence": "joy",
    "compassion": "joy",
    "courage": "joy",
    "peace": "neutral",
    "respect": "neutral",
    "guilt": "sadness",
    "gratitude": "joy",
    "confusion": "neutral",
    "anticipation": "joy"
}

# Load and clean
df = pd.read_csv(CSV_PATH)
df['label'] = df['label'].astype(str).str.strip().str.lower()
df['label'] = df['label'].replace(label_corrections)

# Filter only valid emotions (discard any others that slipped in)
df = df[df['label'].isin(EMOTIONS_MAIN)].copy()

# Optional: show the new count
print("Emotions after remapping:")
print(df['label'].value_counts())

# Save the cleaned file (overwrite or save a new one)
df.to_csv(CSV_PATH, index=False)
print(f"✅ Cleaned file saved: {CSV_PATH}")


Emotions after remapping:
label
neutral     2308
joy         1327
anger        830
sadness      619
fear         434
surprise     267
disgust      207
Name: count, dtype: int64
✅ Cleaned file saved: verses_parsed/emotion_verses_labeled_6.csv


## Merge all emotion_verses_labeled into a combined one

In [12]:
import pandas as pd
from pathlib import Path

# Input and output file paths
CSV1 = Path("verses_parsed/emotion_verses_labeled.csv")  # Change names if necessary
CSV2 = Path("verses_parsed/emotion_verses_labeled_2.csv")
CSV3 = Path("verses_parsed/emotion_verses_labeled_3.csv")
CSV4 = Path("verses_parsed/emotion_verses_labeled_4.csv")
CSV5 = Path("verses_parsed/emotion_verses_labeled_5.csv")
CSV6 = Path("verses_parsed/emotion_verses_labeled_6.csv")

OUTPUT = Path("verses_parsed/emotion_verses_labeled_combined.csv")

# Load both files
df1 = pd.read_csv(CSV1)
df2 = pd.read_csv(CSV2)
df3 = pd.read_csv(CSV3)
df4 = pd.read_csv(CSV4)
df5 = pd.read_csv(CSV5)
df6 = pd.read_csv(CSV6)

print(f"CSV1: {len(df1)} rows")
print(f"CSV2: {len(df2)} rows")
print(f"CSV3: {len(df3)} rows")
print(f"CSV4: {len(df4)} rows")
print(f"CSV5: {len(df5)} rows")
print(f"CSV5: {len(df6)} rows")


# Concatenate
df_combined = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)

# Remove duplicates by verse_id (or by 'id' if you prefer)
df_combined = df_combined.drop_duplicates(subset=['verse_id']).reset_index(drop=True)
print(f"Combined: {len(df_combined)} rows after removing duplicates by verse_id")

# (Optional) Renumber 'id' if the column exists
if 'id' in df_combined.columns:
    df_combined['id'] = range(len(df_combined))

# Save the result
OUTPUT.parent.mkdir(parents=True, exist_ok=True)
df_combined.to_csv(OUTPUT, index=False)
print(f"✅ Combined file saved to: {OUTPUT.absolute()}")


CSV1: 981 rows
CSV2: 1000 rows
CSV3: 1000 rows
CSV4: 1000 rows
CSV5: 1000 rows
CSV5: 5992 rows
Combined: 9751 rows after removing duplicates by verse_id
✅ Combined file saved to: c:\Users\manue\my_projects\LinguaAnimae\data\evaluation\verses_parsed\emotion_verses_labeled_combined.csv


In [14]:
import pandas as pd

# Path to the combined file
csv_path = "verses_parsed/emotion_verses_labeled_combined.csv"

# Load the file
df = pd.read_csv(csv_path)

# Clean and normalize the labels
df['label'] = df['label'].astype(str).str.strip().str.lower()

# Show the count of unique labels
label_counts = df['label'].value_counts()
print("Emotion counts in the combined dataset:")
print(label_counts)
print("\nTotal verses:", label_counts.sum())
print("Unique emotions found:", list(label_counts.index))


Emotion counts in the combined dataset:
label
neutral     3464
joy         2132
anger       1274
sadness     1119
fear         822
surprise     548
disgust      392
Name: count, dtype: int64

Total verses: 9751
Unique emotions found: ['neutral', 'joy', 'anger', 'sadness', 'fear', 'surprise', 'disgust']
