In [1]:
import ast
import bz2
import json
import os
import pickle
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [2]:
# Root folder holding every input/output file of this notebook.
# The ONION_DATA_ROOT environment variable overrides the default location so the
# notebook can run on machines with a different directory layout.
DATA_ROOT = os.environ.get('ONION_DATA_ROOT', '/media/gustavo/Storage/Datasets/Onion20k')
# Path view of DATA_ROOT, used for `/`-style joins in the cells below.
DATA_ROOT_URL = Path(DATA_ROOT)

# Exploring songs from LFM-2b with Spotify features

In [3]:
# Track table stored as line-delimited JSON (one record per line).
LFM_DATA_FILE = Path(DATA_ROOT).joinpath('m4av2.json.bz2')
lfm_data = pd.read_json(LFM_DATA_FILE, lines=True)

In [22]:
# Preview the first rows of the track table.
lfm_data.head()

Unnamed: 0,_id,artist,track,album,lfm,spotify
0,0009fFIM1eYThaPg,Cheryl,Rain on Me,3 Words,"{'track_id': 32553818, 'artist_id': 850845, 'a...","{'uri': '3eObKIfHKJ1nAPh0wTxFCc', 'features': ..."
1,0010xmHR6UICBOYT,Oddisee,After Thoughts,The Beauty in All,"{'track_id': 3947719, 'artist_id': 3049364, 'a...","{'uri': '27szvF97Tu95GxN98N52fy', 'features': ..."
2,002Jyd0vN4HyCpqL,Blue Öyster Cult,ME 262,Secret Treaties,"{'track_id': 25225652, 'artist_id': 648169, 'a...","{'uri': '273lBFpxUCwisTpdnF9cVb', 'features': ..."
3,006TYKNjNxWjfKjy,Rhapsody,Flames of Revenge,Legendary Years (Re-Recorded),"{'track_id': 15264706, 'artist_id': 3410635, '...","{'uri': '7FARxq6SoJNKByv82HFXs3', 'features': ..."
4,007LIJOPQ4Sb98qV,The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remas...,"{'track_id': 28794945, 'artist_id': 3985133, '...","{'uri': '6rVxJ3sN3Cz40MSLavbG1K', 'features': ..."


In [59]:
# Count the tracks whose 'lfm' entry is missing.
lfm_data['lfm'].isna().sum()

99

In [12]:
# Index the track table by its '_id' column.
# Guarded and non-in-place: once '_id' is the index, running the cell again is a
# no-op instead of raising KeyError because the column no longer exists.
if lfm_data.index.name != '_id':
    lfm_data = lfm_data.set_index('_id')

In [25]:
# Expand the per-track 'lfm' dictionaries into one column per key and append them.
# The column is selected by name: the positional `iloc[:, 3]` only pointed at 'lfm'
# when '_id' had already been moved into the index, and at 'album' otherwise.
lfm_data = pd.concat([lfm_data, lfm_data['lfm'].apply(pd.Series)], axis=1)


In [124]:
# Load both GEMS-INN annotation exports and discard their first column.
# NOTE(review): the first column is presumably a saved row index - confirm.
gems_files = ['GEMS-INN_2023-04-18.csv', 'GEMS-INN_2023-04-18-2.csv']
gems = pd.concat(
    [pd.read_csv(Path(DATA_ROOT) / name, encoding='latin-1') for name in gems_files],
    ignore_index=True,
).iloc[:, 1:]

# A column-on-column merge discards the right-hand index, so '_id' must be a regular
# column here; otherwise the join on '_id' further down fails with KeyError.
lfm_tracks = lfm_data.reset_index() if lfm_data.index.name == '_id' else lfm_data

# Match annotations to tracks on (artist, title) and keep the first row of every
# (artist, track) pair.
emma_database_merge = gems.merge(
    lfm_tracks,
    left_on=['artist', 'title'],
    right_on=['artist', 'track'],
    how='inner',
).drop_duplicates(subset=['artist', 'track'])

# Attach the unpacked emotion labels by track id and save the result.
emo_label = pd.read_csv(Path(DATA_ROOT) / 'emotions_familiarity_unpacked_labels.tsv')
emma_database_merge = emma_database_merge.merge(emo_label, left_on='_id', right_on='id', how='inner')
emma_database_merge.to_csv(DATA_ROOT_URL / 'v3_emma_database_merge.csv')

# Exploring the emotions file

In [4]:
# Per-track emotion annotations (tab-separated file).
emo_files = pd.read_csv(Path(DATA_ROOT) / 'all_possible_gems_scalings.tsv', sep='\t')
# Mapping used to look up an lfm id from a track id. The file is opened through a
# context manager so the bz2 handle is closed as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with bz2.open(DATA_ROOT_URL / 'track_ids-to-index.pkl.bz2', mode='rb') as id_map_file:
    id2lfm = pickle.load(id_map_file)


In [5]:
# Look up every track id in id2lfm; ids without an entry come back as missing.
emo_files['lfm_id'] = emo_files['id'].map(id2lfm.get)

In [9]:
# Track ids for which no lfm_id was found.
emo_files.loc[emo_files['lfm_id'].isna(), 'id']

0        0010xmHR6UICBOYT
33       04FGELz6IwgzTFKA
87       0BNlUFeemdBKefcG
145      0L0vUMdOuWdSFSor
161      0N7zZFuKu5m1cAMq
               ...       
24787    zmzsCLNS0NZIToY1
24806    zqglVIowjob0jRtl
24845    zvX6etE4yEfO9KuU
24850    zwDtHFrJD3APwkL6
24862    zxvLS6V7f4Qd9CqR
Name: id, Length: 471, dtype: object

In [10]:
# Number of tracks without an lfm_id.
emo_files['lfm_id'].isna().sum()

471

In [11]:
# Remove (in place) the tracks that have no lfm_id ...
emo_files.dropna(subset=['lfm_id'],inplace=True)
# ... so the remaining ids can be stored as integers.
emo_files['lfm_id'] =emo_files['lfm_id'].astype(int)

In [42]:
# Join the emotion annotations with the track table, keep only rows that carry
# Spotify data, and index the result by track id.
audio_feat = (
    emo_files
    .merge(lfm_data, left_on='id', right_on='_id', how='inner')
    .dropna(subset="spotify")
    .set_index('id')
)
# Expand the nested Spotify dictionaries: the top level first, then its 'features' entry.
spotify_features = audio_feat['spotify'].apply(pd.Series)
spotify_features_audio = spotify_features['features'].apply(pd.Series)
# Each release date is a nested record; only its '$date' field is used.
release_dates = spotify_features['release_date'].apply(pd.Series)['$date']
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is now the
# default), so it is no longer passed. Values that cannot be parsed become NaT.
parsed_release_dates = pd.to_datetime(release_dates, errors='coerce')

In [43]:
# Pull the 'listening_count' entry out of every track's 'lfm' dictionary and save it.
lfm_fields = audio_feat['lfm'].apply(pd.Series)
lfm_listening_counts = lfm_fields['listening_count']
lfm_listening_counts.to_csv(DATA_ROOT_URL.joinpath('v3_lfm_listening_counts.csv'))

In [44]:
# Place annotations, Spotify audio features and parsed release dates side by side,
# aligned on the shared track-id index, then write the combined table to disk.
frames_to_join = [audio_feat, spotify_features_audio, parsed_release_dates]
merged_emotion_file = pd.concat(frames_to_join, axis=1)
merged_emotion_file.to_csv(DATA_ROOT_URL.joinpath('v3_emotion_audiofeatures.csv'))

In [45]:
# Preview the combined annotation / audio-feature table.
merged_emotion_file.head()

Unnamed: 0_level_0,"(tag, weight)",gems,gems_profile_gems_rescaling,gems_profile_no_rescaling,gems_profile_all_rescaling,genre,microgenre,gems_profile_average,lfm_id,_id,...,acousticness,instrumentalness,liveness,valence,tempo,type,duration_ms,time_signature,0,$date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
006TYKNjNxWjfKjy,"{'power metal': 100, 'symphonic metal': 98, 's...",['power'],{'power': 1.0},{'power': 152.0},{'power': 0.30278884462151395},rock,power metal,{'power': 25.333333333333332},0,006TYKNjNxWjfKjy,...,0.0001,0.0213,0.336,0.187,179.952,audio_features,334600.0,4.0,,NaT
007LIJOPQ4Sb98qV,"{'post punk': 7, 'new wave': 40, 'british i li...",['tenderness'],{'tenderness': 1.0},{'tenderness': 7.0},{'tenderness': 0.02834008097165992},rock,new wave,{'tenderness': 7.0},1,007LIJOPQ4Sb98qV,...,0.00101,0.301,0.14,0.47,123.904,audio_features,326067.0,4.0,,2009-11-23 00:00:00+00:00
00CH4HJdxQQQbJfu,"{'experimental': 100, 'indie': 90, 'indie rock...",['transcendence'],{'transcendence': 1.0},{'transcendence': 70.0},{'transcendence': 0.08139534883720931},rock,experimental,{'transcendence': 23.333333333333332},2,00CH4HJdxQQQbJfu,...,0.305,0.0321,0.101,0.583,152.81,audio_features,175347.0,3.0,,2009-10-19 00:00:00+00:00
00LuPWdOccBb09bW,"{'rock': 100, 'hard rock': 72, 'alternative ro...","['wonder', 'power', 'joyful activation']","{'wonder': 0.2857142857142857, 'power': 0.5714...","{'wonder': 2.0, 'power': 4.0, 'joyful activati...","{'wonder': 0.004032258064516129, 'power': 0.00...",rock,hard rock,"{'wonder': 1.0, 'power': 1.3333333333333333, '...",3,00LuPWdOccBb09bW,...,0.000247,0.388,0.363,0.6,132.059,audio_features,212347.0,4.0,,2008-10-28 00:00:00+00:00
00P2bHdWFkghmDqz,"{'soul': 100, 'british': 47, 'fip': 47, 'funk'...","['wonder', 'joyful activation']","{'wonder': 0.5, 'joyful activation': 0.5}","{'wonder': 8.0, 'joyful activation': 8.0}","{'wonder': 0.011461318051575931, 'joyful activ...",funk / soul,soul,"{'wonder': 8.0, 'joyful activation': 8.0}",4,00P2bHdWFkghmDqz,...,0.452,1e-06,0.0896,0.669,65.195,audio_features,252213.0,3.0,,2012-01-01 00:00:00+00:00


In [52]:
# List the columns available in the emotion annotations.
emo_files.columns

Index(['id', '(tag, weight)', 'gems', 'gems_profile_gems_rescaling',
       'gems_profile_no_rescaling', 'gems_profile_all_rescaling', 'genre',
       'microgenre', 'gems_profile_average', 'lfm_id'],
      dtype='object')

In [65]:
# Short suffix -> name of the column that stores that GEMS profile as a dict literal.
gems_cols = {
    "r": "gems_profile_gems_rescaling",
    "nr": "gems_profile_no_rescaling",
    "r_all": "gems_profile_all_rescaling",
    "avg": "gems_profile_average",
}
all_gems = emo_files.copy()
all_gems.set_index('id', inplace=True)
# Expand every profile into one column per emotion, named "<emotion>_<suffix>", with
# 0.0 where a track lacks that emotion. ast.literal_eval is used instead of eval: the
# cells come from a file on disk and only need to be parsed as literals, never executed.
unpacked_profiles = []
for gem, gem_col in gems_cols.items():
    gems_ = all_gems.loc[:, gem_col].apply(lambda x: pd.Series(ast.literal_eval(x))).fillna(0.0)
    gems_ = gems_.add_suffix(f'_{gem}')
    unpacked_profiles.append(gems_)
# Append all expanded profiles in one concat rather than growing the frame per iteration.
all_gems = pd.concat([all_gems, *unpacked_profiles], axis=1)


In [68]:
# Inspect the first track, including its expanded per-emotion columns.
all_gems.iloc[0]


(tag, weight)                  {'power metal': 100, 'symphonic metal': 98, 's...
gems                                                                   ['power']
gems_profile_gems_rescaling                                       {'power': 1.0}
gems_profile_no_rescaling                                       {'power': 152.0}
gems_profile_all_rescaling                        {'power': 0.30278884462151395}
genre                                                                       rock
microgenre                                                           power metal
gems_profile_average                               {'power': 25.333333333333332}
lfm_id                                                                         0
power_r                                                                      1.0
tenderness_r                                                                 0.0
transcendence_r                                                              0.0
wonder_r                    

In [69]:
# Saving unpacked gems_profile df
# NOTE(review): no `sep` is passed, so the file is comma-separated despite its .tsv name.
all_gems.to_csv(Path(DATA_ROOT)/'v3_emotions_familiarity_unpacked_labels.tsv')

In [119]:
# Turn the first ten unscaled GEMS profiles into lists of [emotion, score] pairs.
# The column is picked by name: with the columns listed for emo_files, position 6 is
# 'genre', whose plain strings are not dict literals and cannot be parsed.
# NOTE(review): 'gems_profile_no_rescaling' is presumably the column the positional
# index was written for - confirm.
# ast.literal_eval parses the stored literal without executing file content.
demo = emo_files['gems_profile_no_rescaling'].iloc[:10].apply(
    lambda x: [[k, v] for k, v in ast.literal_eval(x).items()]
)

In [102]:
# Short suffix -> name of the column that stores that GEMS profile as a dict literal.
gems_cols = {
    "r": "gems_profile_gems_rescaling",
    "nr": "gems_profile_no_rescaling",
    "r_all": "gems_profile_all_rescaling",
    "avg": "gems_profile_average",
}
stacked_all_gems = emo_files.copy()
stacked_all_gems.set_index('id', inplace=True)
# One long-format frame (id, gem_label, score) per profile column.
stacked_scores = []
for gem, gem_col in gems_cols.items():
    # Read from this cell's own copy rather than from `all_gems`, which is built in a
    # different cell; ast.literal_eval parses the stored literal without executing it.
    profiles = stacked_all_gems.loc[:, gem_col].apply(lambda x: pd.Series(ast.literal_eval(x)))
    # Emotions a track does not have are NaN after expansion; stack() + dropna()
    # leaves exactly the (track, emotion) pairs that carry a score.
    gems_ = profiles.stack().dropna()
    gems_.index.set_names(['id', 'gem_label'], inplace=True)
    gems_.name = 'score'
    gems_ = gems_.reset_index()
    # Prefix every emotion label with the profile's short suffix, e.g. "r_power".
    gems_['gem_label'] = gems_['gem_label'].apply(lambda x: f'{gem}_{x}')
    stacked_scores.append(gems_)
         

In [117]:
# Stack the per-profile frames into one table, attach the lfm id of every track,
# discard rows with any missing value and store the id as an integer.
merged_stacked_labels = (
    pd.concat(stacked_scores, axis=0, ignore_index=True)
    .assign(lfm_id=lambda frame: frame['id'].map(id2lfm.get))
    .dropna()
    .assign(lfm_id=lambda frame: frame['lfm_id'].astype(int))
)


In [118]:
# Display the long-format label table.
merged_stacked_labels

Unnamed: 0,id,gem_label,score,lfm_id
0,006TYKNjNxWjfKjy,r_power,1.000000,0
1,007LIJOPQ4Sb98qV,r_tenderness,1.000000,1
2,00CH4HJdxQQQbJfu,r_transcendence,1.000000,2
3,00LuPWdOccBb09bW,r_power,0.571429,3
4,00LuPWdOccBb09bW,r_wonder,0.285714,3
...,...,...,...,...
244347,zzx8CWdM7qkxKQpC,avg_power,7.000000,24406
244348,zzx8CWdM7qkxKQpC,avg_wonder,5.500000,24406
244349,zzx8CWdM7qkxKQpC,avg_joyful activation,4.000000,24406
244350,zzx8CWdM7qkxKQpC,avg_sadness,4.000000,24406


In [119]:
# Persist the long-format label table as a pickle file.
merged_stacked_labels.to_pickle(DATA_ROOT_URL/'v3_emotions_familiarity_stacked_labels.pkl')

In [120]:
# Reload both label tables (wide CSV and long pickle) from disk.
unpacked_gems_profile = pd.read_csv(DATA_ROOT_URL.joinpath('v3_emotions_familiarity_unpacked_labels.tsv'))
stacked_gems_profile = pd.read_pickle(DATA_ROOT_URL.joinpath('v3_emotions_familiarity_stacked_labels.pkl'))

In [121]:
# Export the unpacked (wide) label table as a tab-separated file without the index.
# `full_data_tracks` is only another name for `unpacked_gems_profile`; no copy is made.
full_data_tracks = unpacked_gems_profile
# Disabled: derivation of year/month/day and release_date from a 'date' column.
#full_data_tracks['year']=full_data_tracks.loc[:,'date'].apply(lambda x:int(x.split("-")[0]))
#full_data_tracks['month']=full_data_tracks.loc[:,'date'].apply(lambda x:int(x.split("-")[1]) if len(x.split('-'))>1 else 1)
#full_data_tracks['day']=full_data_tracks.loc[:,'date'].apply(lambda x:int(x.split("-")[2]) if len(x.split('-'))>2 else 1)
#full_data_tracks['release_date']=[datetime(row[0],row[1],row[2]) for row in full_data_tracks.loc[:,['year','month','day']].values]
full_data_tracks.to_csv(DATA_ROOT_URL/'v3_emotions_familiarity_full_feat.tsv',sep='\t',index=False)

In [123]:
# Export the stacked (long) label table as a tab-separated file without the index.
# `full_data_tracks` is only another name for `stacked_gems_profile`; no copy is made.
full_data_tracks = stacked_gems_profile
# Disabled: derivation of year/month/day and release_date from a 'date' column.
#full_data_tracks['year']=full_data_tracks.loc[:,'date'].apply(lambda x:int(x.split("-")[0]))
#full_data_tracks['month']=full_data_tracks.loc[:,'date'].apply(lambda x:int(x.split("-")[1]) if len(x.split('-'))>1 else 1)
#full_data_tracks['day']=full_data_tracks.loc[:,'date'].apply(lambda x:int(x.split("-")[2]) if len(x.split('-'))>2 else 1)
#full_data_tracks['release_date']=[datetime(row[0],row[1],row[2]) for row in full_data_tracks.loc[:,['year','month','day']].values]
full_data_tracks.to_csv(DATA_ROOT_URL/'v3_emotions_familiarity_full_feat_stack.tsv',sep='\t',index=False)

In [148]:
# Read the stacked export back from its tab-separated file.
full_data_tracks = pd.read_csv(DATA_ROOT_URL/'v3_emotions_familiarity_full_feat_stack.tsv',sep='\t')

In [149]:
# Count missing values in 'date_prec'.
# NOTE(review): the stacked table displayed earlier has only id, gem_label, score and
# lfm_id, so this lookup presumably raises KeyError on a fresh run - confirm which
# file this cell was meant for.
full_data_tracks['date_prec'].isna().sum()

0

In [147]:
# Summarise columns, non-null counts and dtypes of the loaded table.
full_data_tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16435 entries, 0 to 16434
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   track_uri                  16435 non-null  object 
 1   date                       16435 non-null  object 
 2   date_prec                  16435 non-null  object 
 3   id                         16435 non-null  object 
 4   gems                       16435 non-null  object 
 5   gems_profile               16435 non-null  object 
 6   gems_profile_no_rescaling  16435 non-null  object 
 7   genre                      16435 non-null  object 
 8   microgenre                 16435 non-null  object 
 9   lfm_id                     16435 non-null  int64  
 10  wonder                     16435 non-null  float64
 11  transcendence              16435 non-null  float64
 12  tenderness                 16435 non-null  float64
 13  nostalgia                  16435 non-null  flo

In [234]:
# Load the stacked export whose file name has no 'v3_' prefix.
# NOTE(review): presumably an earlier version of the export - confirm.
full_data_stacked_tracks = pd.read_csv(DATA_ROOT_URL/'emotions_familiarity_full_feat_stack.tsv',sep='\t')

In [74]:
# Load the tab-separated 'emotions_familiarity.tsv' file into its own frame.
emo_filesv1=pd.read_csv(Path(DATA_ROOT)/'emotions_familiarity.tsv',sep='\t')