In [2]:
import os
import pandas as pd
import numpy as np
import yaml


quantiles_thresholds = [0.33, 0.66]

mm_ratings_df = pd.read_csv("mm_ratings.csv")
mf_ratings_df = pd.read_csv("mf_ratings.csv")
groundtruth_df = pd.read_csv("../files/groundtruth.csv", index_col="stimulus_id")

for ratings_df in [mm_ratings_df, mf_ratings_df]:
    ratings_df = ratings_df.drop(["prolific_id"], axis=1)
    ratings_means_df = ratings_df.groupby('stimulus_id').mean()
    # merge with groundtruth
    groundtruth_df = pd.merge(groundtruth_df, ratings_means_df, left_index=True, right_index=True)

# drop these columns
drop_these = [ 
    'asian','black', 'white', 'other', 'title', 'description', 'upload_date',
    'duration', 'view_count', 'categories', 'tags', 'like_count','requested_subtitles', 
    'download', 'error_logs',
]

groundtruth_df = groundtruth_df.drop(drop_these, axis=1)

print(groundtruth_df.columns)
display(groundtruth_df.head())

# bin the values to 3 categories
for column in groundtruth_df.columns:
    if groundtruth_df[column].dtype == 'float64':
        quantiles = groundtruth_df[column].quantile(quantiles_thresholds).tolist() 
        groundtruth_df[column] = pd.cut(groundtruth_df[column], bins=[-np.inf, quantiles[0], quantiles[1], np.inf], labels=[0, 1, 2])
        groundtruth_df[column] = groundtruth_df[column].astype('int64')

groundtruth_df.head()

# save the groundtruth
groundtruth_df.to_csv('music_groundtruth.csv')

# ROUND 2 (continue)
with open('config_training.yaml', 'r') as f:
    config = yaml.safe_load(f)

fold_music_df = pd.read_csv('music_groundtruth.csv', index_col="stimulus_id")

emo_to_label = {
    0: "not",
    1: "moderately",
    2: "very"
}

for emotion in config["emo_dict"].keys():
    fold_music_df[emotion] = fold_music_df[emotion].map(emo_to_label)

for mid in config["mid_dict"].keys():
    fold_music_df[mid][fold_music_df[mid] == 0] = mid.split("/")[0]
    fold_music_df[mid][fold_music_df[mid] == 1] = "undefined"
    fold_music_df[mid][fold_music_df[mid] == 2] = mid.split("/")[1]
    fold_music_df[mid] = fold_music_df[mid].astype(str)

cols = (
    list(config["emo_dict"].keys()) + list(config["mid_dict"].keys()) + list(config["cls_dict"]) + ["all_genders"]
)

cols.remove("target_of_toy_ad")

fold_music_df = fold_music_df[cols]

# rename column "all_genders" to "target_of_toy_ad"
fold_music_df = fold_music_df.rename(columns={"all_genders": "target_of_toy_ad"})

fold_music_df.to_csv('music_groundtruth.csv')

Index(['product_category', 'filming_location', 'all_genders', 'interaction',
       'voice_type', 'voice_age', 'voice_gender', 'voice_exagg', 'Happy',
       'Amusing', 'Beauty', 'Calm', 'Energizing', 'Angry', 'Triumphant',
       'Electric/Acoustic', 'Distorted/Clear',
       'Many Instruments/Few Instruments', 'Loud/Soft', 'Heavy/Light',
       'High pitch/Low pitch', 'Wide pitch variation/Narrow pitch variation',
       'Punchy/Smooth', 'Harmonious/Disharmonious', 'Clear melody/No melody',
       'Repetitive/Non-repetitive', 'Complex rhythm/Simple rhythm',
       'Fast tempo/Slow tempo', 'Dense/Sparse', 'Strong beat/Weak beat'],
      dtype='object')


Unnamed: 0_level_0,product_category,filming_location,all_genders,interaction,voice_type,voice_age,voice_gender,voice_exagg,Happy,Amusing,...,High pitch/Low pitch,Wide pitch variation/Narrow pitch variation,Punchy/Smooth,Harmonious/Disharmonious,Clear melody/No melody,Repetitive/Non-repetitive,Complex rhythm/Simple rhythm,Fast tempo/Slow tempo,Dense/Sparse,Strong beat/Weak beat
stimulus_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ndzo2ZIWfiQ,High-tech Interactive Playmates and Robotics,Non-specific,Girls/women,They do not interact with each other or there ...,BOTH spoken and sung,Adults (including young adults),Feminine,"No, all voices are normal-sounding",5.666667,2.666667,...,3.714286,4.714286,4.142857,3.0,3.285714,3.428571,4.857143,3.428571,3.142857,4.0
yRUiwRKk6QM,High-tech Interactive Playmates and Robotics,Indoors,Mixed,They do not interact with each other or there ...,Spoken,Adults (including young adults),Masculine,Yes a masculine voice is gender exaggerated,3.666667,3.833333,...,4.5,2.333333,2.666667,4.166667,5.666667,5.5,3.5,4.333333,2.833333,4.5
3ysC1-foJT4,"Apparel, Fashion, Accessories, Cosmetics, Cost...",Indoors,Girls/women,They are working or playing together in a coop...,Sung,Adults (including young adults),Feminine,Yes a feminine voice is gender exaggerated,4.833333,3.166667,...,4.333333,4.333333,2.666667,3.5,3.333333,1.833333,4.166667,2.833333,3.666667,2.166667
cYszuGaptkk,"Action Figures, Battling Toys and Toy Weapons",Non-specific,Mixed,They are working or playing together in a coop...,Spoken,Adults (including young adults),Feminine,"No, all voices are normal-sounding",4.463576,2.97351,...,4.171053,4.164474,3.723684,4.039474,4.901316,3.986842,4.5,3.440789,3.802632,4.526316
2LZjLBipdfI,Dolls,Indoors,Girls/women,They do not interact with each other or there ...,BOTH spoken and sung,Adults (including young adults),Feminine,Yes a feminine voice is gender exaggerated,5.5,3.666667,...,3.666667,4.166667,3.833333,3.5,3.0,3.166667,4.5,3.5,4.833333,4.166667


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fold_music_df[mid][fold_music_df[mid] == 0] = mid.split("/")[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fold_music_df[mid][fold_music_df[mid] == 0] = mid.split("/")[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fold_music_df[mid][fold_music_df[mid] == 0] = mid.split("/")[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning