In [1]:
import pandas as pd
import numpy as np

In [2]:
# ---------------------------
# Step 1: Load and clean the data
# ---------------------------
# Read the three raw CSV exports used by the rest of the notebook.
ratings_df = pd.read_csv("raw_data/ratings_v2.csv")
audio_snippets_df = pd.read_csv("raw_data/audio_snippets_v2.csv")
attention_check_fails = pd.read_csv("raw_data/attention_checks_v2.csv")

# Debug preview: first rows of both frames, followed by their row counts.
print(ratings_df.head(), audio_snippets_df.head(), sep="\n")
print("Initial lengths:", len(ratings_df), len(audio_snippets_df))

# Keep only ratings that actually reference a snippet.
has_snippet_id = ratings_df["snippet_id"].notna()
ratings_df = ratings_df[has_snippet_id]
print("After dropping null snippet_id:", len(ratings_df))

# For each (snippet_id, prolific_id) pair keep the first rating encountered
# and discard any later repeats.
is_repeat_rating = ratings_df.duplicated(subset=["snippet_id", "prolific_id"])
ratings_df = ratings_df[~is_repeat_rating]
print("After dropping duplicates:", len(ratings_df))

                                     id               prolific_id  \
0  0944b194-9e71-4ab5-9438-c7ab3f326c9d  65cd505d9ba7450b9be3d6f7   
1  db01fe1c-30ac-422d-b4f9-81e5f2532d11  65cd505d9ba7450b9be3d6f7   
2  8da13785-e98e-4a20-a5a7-fe604a5d9a1b  65cd505d9ba7450b9be3d6f7   
3  0a7c3bf1-be52-4d2a-ad66-fb6137da25c7  65cd505d9ba7450b9be3d6f7   
4  98bf782a-a675-4b2e-afe8-5977b311be3e  65cd505d9ba7450b9be3d6f7   

                             snippet_id  aggression  frustration  annoyance  \
0  0e60f04a-575f-41a6-8449-6d8b3d1fab6f           3            7          7   
1  06a012e5-95c2-4ec8-9435-693b851a995b           2            8          7   
2  43081c99-a9ba-4b0a-8f8e-73a443409de4           1            1          1   
3  7d9a30f9-b288-4a6d-a3d4-302c4910c403           1            1          1   
4  e5633971-506b-4fc7-9fd2-99b848bdc0be           1            3          4   

                   created_at  
0  2025-03-18 14:54:42.046667  
1  2025-03-18 14:54:54.357149  
2  2025-03-18 

In [3]:
# Remove every rating submitted by a user who failed the attention check.
# NOTE(review): this assumes `user_id` in the attention-check file holds the
# same identifiers as `prolific_id` in the ratings — confirm.
failed_mask = attention_check_fails["failed"].eq(True)
failed_users = attention_check_fails.loc[failed_mask, "user_id"]
print("Length of ratings_df before dropping failed users:", len(ratings_df))
from_failed_user = ratings_df["prolific_id"].isin(failed_users)
ratings_df = ratings_df[~from_failed_user]
print("Length of ratings_df after dropping failed users:", len(ratings_df))


Length of ratings_df before dropping failed users: 3831
Length of ratings_df after dropping failed users: 3786


In [4]:
# ---------------------------
# Step 2: Compute per-snippet statistics and normalized agreement
# ---------------------------
emotions = ["aggression", "frustration", "annoyance"]

# Per snippet and per emotion: mean rating, standard deviation, and number of ratings.
agg_functions = {name: ["mean", "std", "count"] for name in emotions}
grouped_stats = ratings_df.groupby("snippet_id").agg(agg_functions)

# Collapse the two-level (emotion, statistic) column labels into single names
# such as "aggression_mean". This must happen before snippet_id is moved out
# of the index so that the id column keeps its plain name.
grouped_stats.columns = [
    "_".join(label).strip() for label in grouped_stats.columns.values
]
snippet_stats = grouped_stats.reset_index()

# Ratings are assumed to lie on a 1-10 scale.
min_rating, max_rating = 1, 10
# Reference spread used for normalization: the sample standard deviation
# (ddof=1) of two ratings sitting at opposite ends of the scale.
max_std = np.std([min_rating, max_rating], ddof=1)

# Normalized agreement per emotion: 1 means identical ratings, 0 means the
# spread equals the reference spread. A NaN std (e.g. a snippet with a single
# rating) propagates, leaving the agreement NaN as well.
for emotion in emotions:
    spread = snippet_stats[f"{emotion}_std"]
    snippet_stats[f"{emotion}_agreement"] = 1 - spread / max_std

# (Optional) View the snippet-level stats
print("Snippet-level statistics:", snippet_stats.head(), sep="\n")

Snippet-level statistics:
                             snippet_id  aggression_mean  aggression_std  \
0  0016c3f7-2a20-4664-bc0e-0a4625a0c782         2.000000        0.816497   
1  00453e9c-ad07-4fc9-8598-3595da3595bc         1.333333        0.577350   
2  007bd96e-e95b-461d-bb6b-897cad86f8e7         2.000000        2.000000   
3  008c9118-f8a0-4e0e-86fd-285568bdb3d8         6.000000        2.828427   
4  0093ef4f-f12c-458a-8ffb-284a465efa4e         5.666667        2.081666   

   aggression_count  frustration_mean  frustration_std  frustration_count  \
0                 4          4.000000         2.160247                  4   
1                 3          2.333333         2.309401                  3   
2                 4          1.250000         0.500000                  4   
3                 4          6.750000         1.500000                  4   
4                 3          4.666667         1.527525                  3   

   annoyance_mean  annoyance_std  annoyance_count  agg

In [5]:
# ---------------------------
# Step 3: Define a function to drop the lowest 10% based on agreement
# ---------------------------
def drop_low_agreement(df, emotion, percentile=10):
    """
    Drop the snippets with the lowest normalized agreement for the given emotion.

    The cutoff is the <percentile>-th percentile (NumPy's default linear
    interpolation) of the non-NaN values in the '{emotion}_agreement' column.
    Only rows whose agreement is strictly greater than the cutoff are kept, so:
      - rows whose agreement equals the cutoff are dropped as well, and
      - rows with a NaN agreement are always dropped, because NaN never
        compares greater than the cutoff.

    Parameters:
      df: DataFrame that contains snippet-level stats including the '{emotion}_agreement' column.
          It is not modified.
      emotion: The emotion for which to filter (e.g., 'aggression').
      percentile: The cutoff percentile in [0, 100] (default is 10, meaning drop bottom 10%).

    Returns:
      A copy of the rows of `df` above the cutoff, with their original index
      labels. If the agreement column has no non-NaN values (including an
      empty `df`), an empty DataFrame with the same columns is returned.

    Raises:
      KeyError: if `df` has no '{emotion}_agreement' column.
    """
    agreement_col = f"{emotion}_agreement"
    agreement = df[agreement_col]

    # NaN agreements carry no information about where the cutoff should be.
    valid_agreement = agreement.dropna()
    if valid_agreement.empty:
        # A percentile of zero values is undefined; np.percentile either
        # raises or returns NaN here depending on the NumPy version. Nothing
        # can be "above the cutoff", so return an empty frame explicitly.
        return df.iloc[0:0].copy()

    # Calculate the cutoff value (lowest <percentile>% agreement)
    cutoff = np.percentile(valid_agreement.to_numpy(), percentile)

    # Keep only snippets with agreement strictly above the cutoff
    return df[agreement > cutoff].copy()

In [6]:
# ---------------------------
# Step 4: Create individual emotion DataFrames, merge with audio snippets, and store results
# ---------------------------
emotion_dfs = {}
for emotion in emotions:
    mean_col = f"{emotion}_mean"
    agreement_col = f"{emotion}_agreement"

    # Discard the snippets raters agreed on least for this emotion.
    filtered_stats = drop_low_agreement(snippet_stats.copy(), emotion, percentile=10)

    # Narrow to the snippet id plus this emotion's mean and agreement under
    # generic column names, then attach the snippet metadata. The statistics
    # key snippets by 'snippet_id' while audio_snippets_df keys them by 'id';
    # the inner join keeps only snippets present in both frames.
    merged_df = (
        filtered_stats[["snippet_id", mean_col, agreement_col]]
        .rename(columns={mean_col: "rating", agreement_col: "agreement"})
        .merge(audio_snippets_df, left_on="snippet_id", right_on="id", how="inner")
    )

    # Only the audio file path and the mean rating are exported.
    final_df = merged_df[["file_path", "rating"]].copy()
    emotion_dfs[emotion] = final_df

    # Write this emotion's table to its own CSV file.
    output_filename = f"ratings/{emotion}_ratings.csv"
    final_df.to_csv(output_filename, index=False)
    print(f"Saved {emotion} ratings to {output_filename}")

# (Optional) Preview each exported table: name, row count, first rows.
for emotion, df in emotion_dfs.items():
    print(f"\nEmotion: {emotion}", len(df), df.head(), sep="\n")

Saved aggression ratings to ratings/aggression_ratings.csv
Saved frustration ratings to ratings/frustration_ratings.csv
Saved annoyance ratings to ratings/annoyance_ratings.csv

Emotion: aggression
979
                        file_path    rating
0   actor1_call_10_sentence_9.wav  2.000000
1   actor2_call_32_sentence_3.wav  1.333333
2   actor4_call_22_sentence_2.wav  2.000000
3   actor4_call_25_sentence_7.wav  6.000000
4  actor3_call_40_sentence_13.wav  5.666667

Emotion: frustration
979
                        file_path    rating
0   actor1_call_10_sentence_9.wav  4.000000
1   actor2_call_32_sentence_3.wav  2.333333
2   actor4_call_22_sentence_2.wav  1.250000
3   actor4_call_25_sentence_7.wav  6.750000
4  actor3_call_40_sentence_13.wav  4.666667

Emotion: annoyance
979
                        file_path    rating
0   actor1_call_10_sentence_9.wav  4.500000
1   actor2_call_32_sentence_3.wav  2.000000
2   actor4_call_22_sentence_2.wav  1.000000
3   actor4_call_25_sentence_7.wav  9.250000


In [7]:
# ---------------------------
# Step 5: Create binary classification DataFrames per emotion
# ---------------------------
binary_emotion_dfs = {}
for emotion, df in emotion_dfs.items():
    # Label is 1 when the mean rating is strictly greater than 5, otherwise 0.
    is_high = df["rating"] > 5
    # Build a fresh frame holding just the file path and the binary label;
    # the ordinal rating is not carried over.
    df_binary = df[["file_path"]].assign(binary_rating=is_high.astype(int))
    binary_emotion_dfs[emotion] = df_binary

    # Write the binary table for this emotion to its own CSV file.
    binary_output_filename = f"ratings/{emotion}_binary_ratings.csv"
    df_binary.to_csv(binary_output_filename, index=False)
    print(f"Saved {emotion} binary ratings to {binary_output_filename}")

# Optional: Preview the binary DataFrame for one emotion (e.g., aggression)
print(
    "\nPreview of binary ratings for 'aggression':",
    binary_emotion_dfs["aggression"].head(),
    sep="\n",
)

Saved aggression binary ratings to ratings/aggression_binary_ratings.csv
Saved frustration binary ratings to ratings/frustration_binary_ratings.csv
Saved annoyance binary ratings to ratings/annoyance_binary_ratings.csv

Preview of binary ratings for 'aggression':
                        file_path  binary_rating
0   actor1_call_10_sentence_9.wav              0
1   actor2_call_32_sentence_3.wav              0
2   actor4_call_22_sentence_2.wav              0
3   actor4_call_25_sentence_7.wav              1
4  actor3_call_40_sentence_13.wav              1
