In [1]:
import os
import pandas as pd
import numpy as np
import yaml


# Quantile levels used as cut points: 4 levels split a column into 5 bins.
quantiles_thresholds = [0.2, 0.4, 0.6, 0.8]

# The two ratings tables, both read the same way from the working directory.
mm_ratings_df, mf_ratings_df = (
    pd.read_csv(csv_name) for csv_name in ("mm_ratings.csv", "mf_ratings.csv")
)
# Groundtruth table, indexed by its "stimulus_id" column.
groundtruth_df = pd.read_csv(
    "../files/groundtruth.csv",
    index_col="stimulus_id",
)

# Reduce each ratings table to one row of mean ratings per stimulus and attach
# those means to the groundtruth table.
for ratings_df in (mm_ratings_df, mf_ratings_df):
    # "prolific_id" is removed first so it is not averaged with the ratings
    ratings_df = ratings_df.drop(columns="prolific_id")
    ratings_means_df = ratings_df.groupby("stimulus_id").mean()
    # index-on-index merge with the default how="inner": only stimulus ids
    # present on both sides are kept
    groundtruth_df = groundtruth_df.merge(
        ratings_means_df,
        left_index=True,
        right_index=True,
    )

# groundtruth columns that are removed from the exported table
drop_these = [
    'asian', 'black', 'white', 'other',
    'title', 'description', 'upload_date', 'duration', 'view_count',
    'categories', 'tags', 'like_count', 'requested_subtitles',
    'download', 'error_logs',
]

groundtruth_df = groundtruth_df.drop(columns=drop_these)

# bin every float64 column into len(quantiles_thresholds) + 1 ordinal categories
def _assign_quantile_bins(values, edges):
    """Label each value with the index of the right-closed bin it falls into.

    Bins are ``(-inf, edges[0]], (edges[0], edges[1]], ..., (edges[-1], inf)``,
    i.e. a value equal to a cut point belongs to the lower bin. This is the
    labelling ``pd.cut(values, [-inf, *edges, inf], labels=range(...))``
    produces, but repeated cut points are tolerated (the bin between two equal
    edges simply stays empty) instead of raising "Bin edges must be unique".

    Parameters
    ----------
    values : pd.Series
        Float column to discretise.
    edges : sequence of float
        Cut points in ascending order; ``len(edges) + 1`` bins result.

    Returns
    -------
    pd.Series
        ``int64`` labels in ``0 .. len(edges)``, on the index of ``values``.

    Raises
    ------
    ValueError
        If ``values`` contains missing entries, which have no bin.
    """
    if values.isna().any():
        # fail with the column name rather than an opaque NaN-to-int cast error
        raise ValueError(
            f"column {values.name!r} contains missing values and cannot be binned"
        )
    # side="left" counts the cut points strictly below each value, which is
    # exactly the index of the right-closed bin containing it
    labels = np.searchsorted(np.asarray(edges, dtype="float64"), values.to_numpy(), side="left")
    return pd.Series(labels, index=values.index, name=values.name).astype('int64')


for column in groundtruth_df.columns:
    if groundtruth_df[column].dtype == 'float64':
        # cut points are this column's own quantiles at the configured levels
        quantiles = groundtruth_df[column].quantile(quantiles_thresholds).tolist()
        groundtruth_df[column] = _assign_quantile_bins(groundtruth_df[column], quantiles)

# the "all_genders" column is exported under the name "target_of_toy_ad"
groundtruth_df = groundtruth_df.rename(
    {"all_genders": "target_of_toy_ad"},
    axis="columns",
)

# show the final column set and the first rows
print(groundtruth_df.columns)
display(groundtruth_df.head(n=5))

# write the binned table to disk (the index is written too, by to_csv's default)
groundtruth_df.to_csv(path_or_buf='music_groundtruth_5bins.csv')

Index(['product_category', 'filming_location', 'target_of_toy_ad',
       'interaction', 'voice_type', 'voice_age', 'voice_gender', 'voice_exagg',
       'Happy', 'Amusing', 'Beauty', 'Calm', 'Energizing', 'Angry',
       'Triumphant', 'Electric/Acoustic', 'Distorted/Clear',
       'Many Instruments/Few Instruments', 'Loud/Soft', 'Heavy/Light',
       'High pitch/Low pitch', 'Wide pitch variation/Narrow pitch variation',
       'Punchy/Smooth', 'Harmonious/Disharmonious', 'Clear melody/No melody',
       'Repetitive/Non-repetitive', 'Complex rhythm/Simple rhythm',
       'Fast tempo/Slow tempo', 'Dense/Sparse', 'Strong beat/Weak beat'],
      dtype='object')


Unnamed: 0_level_0,product_category,filming_location,target_of_toy_ad,interaction,voice_type,voice_age,voice_gender,voice_exagg,Happy,Amusing,...,High pitch/Low pitch,Wide pitch variation/Narrow pitch variation,Punchy/Smooth,Harmonious/Disharmonious,Clear melody/No melody,Repetitive/Non-repetitive,Complex rhythm/Simple rhythm,Fast tempo/Slow tempo,Dense/Sparse,Strong beat/Weak beat
stimulus_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ndzo2ZIWfiQ,High-tech Interactive Playmates and Robotics,Non-specific,Girls/women,They do not interact with each other or there ...,BOTH spoken and sung,Adults (including young adults),Feminine,"No, all voices are normal-sounding",4,1,...,2,3,3,1,2,3,3,2,1,3
yRUiwRKk6QM,High-tech Interactive Playmates and Robotics,Indoors,Mixed,They do not interact with each other or there ...,Spoken,Adults (including young adults),Masculine,Yes a masculine voice is gender exaggerated,0,3,...,3,0,1,3,4,4,0,4,0,4
3ysC1-foJT4,"Apparel, Fashion, Accessories, Cosmetics, Cost...",Indoors,Girls/women,They are working or playing together in a coop...,Sung,Adults (including young adults),Feminine,Yes a feminine voice is gender exaggerated,2,2,...,3,2,1,2,2,0,1,1,2,0
cYszuGaptkk,"Action Figures, Battling Toys and Toy Weapons",Non-specific,Mixed,They are working or playing together in a coop...,Spoken,Adults (including young adults),Feminine,"No, all voices are normal-sounding",2,1,...,3,1,3,3,4,4,1,2,2,4
2LZjLBipdfI,Dolls,Indoors,Girls/women,They do not interact with each other or there ...,BOTH spoken and sung,Adults (including young adults),Feminine,Yes a feminine voice is gender exaggerated,3,3,...,1,1,3,2,1,2,1,3,4,3
