# Preprocessing functions

## Imports and paths

In [35]:
import pandas as pd
import os 

# Input files are resolved relative to the directory the notebook is run from:
# <working dir>/data/<file name>.
cur_dir = os.getcwd()
data_dir = os.path.join(cur_dir, 'data')

# Raw inputs for the two datasets, both located directly inside data_dir.
classichit_file, geogenre_file = (
    os.path.join(data_dir, raw_name)
    for raw_name in ('ClassicHit_raw.csv', 'GeoGenre_raw.csv')
)

In [36]:
# Load the raw ClassicHit table (track metadata, audio features and a Genre label
# per row) using pandas' default CSV parsing options.
classichit_df = pd.read_csv(classichit_file)
# Bare last expression: the notebook renders the frame (truncated by pandas).
classichit_df

Unnamed: 0,Track,Artist,Year,Duration,Time_Signature,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Popularity,Genre
0,Hey Jack Kerouac,"10,000 Maniacs",1987,206413,4,0.616,0.511,6,-15.894,1,0.0279,0.03840,0.000000,0.1500,0.604,132.015,40,Alt. Rock
1,Like the Weather,"10,000 Maniacs",1987,236653,4,0.770,0.459,1,-17.453,1,0.0416,0.11200,0.003430,0.1450,0.963,133.351,43,Alt. Rock
2,What's the Matter Here?,"10,000 Maniacs",1987,291173,4,0.593,0.816,9,-7.293,1,0.0410,0.00449,0.000032,0.0896,0.519,99.978,12,Alt. Rock
3,Trouble Me,"10,000 Maniacs",1989,193560,4,0.861,0.385,2,-10.057,1,0.0341,0.15400,0.000000,0.1230,0.494,117.913,47,Alt. Rock
4,Candy Everybody Wants,"10,000 Maniacs",1992,185960,4,0.622,0.876,10,-6.310,1,0.0305,0.01930,0.006840,0.0987,0.867,104.970,43,Alt. Rock
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15145,7 Seconds (feat. Neneh Cherry),Youssou N'Dour,1994,306226,4,0.679,0.659,1,-10.826,0,0.0269,0.07710,0.000004,0.3160,0.528,154.171,47,World
15146,Birima,Youssou N'Dour,1995,228373,4,0.692,0.737,10,-7.042,1,0.0512,0.25400,0.017400,0.1120,0.808,105.754,18,World
15147,Set,Youssou N'Dour,2004,226826,4,0.733,0.730,0,-11.056,1,0.0479,0.04240,0.000101,0.0282,0.840,146.213,25,World
15148,Africa Remembers,Youssou N'Dour,2007,560466,4,0.696,0.378,5,-16.181,0,0.0797,0.44400,0.019400,0.0850,0.551,97.677,8,World


In [37]:
# Load the raw GeoGenre table: a Country column plus one numeric column per genre group.
# NOTE(review): the file also carries an 'Unnamed: 0' column whose values skip numbers;
# presumably a saved index from an earlier export. Later cells remove it by name.
geogenre_df = pd.read_csv(geogenre_file)
# Bare last expression: the notebook renders the frame (truncated by pandas).
geogenre_df

Unnamed: 0.1,Unnamed: 0,Country,Hip hop/Rap/R&b,EDM,Pop,Rock/Metal,Latin/Reggaeton,Other
0,0,australia,31,15.0,65,17.0,0.0,8
1,1,canada,39,12.0,64,14.0,0.0,10
2,2,china,11,23.0,108,4.0,0.0,21
3,3,india,25,1.0,39,0.0,0.0,55
4,4,mexico,38,0.0,18,2.0,57.0,43
...,...,...,...,...,...,...,...,...
62,66,czech_republic,42,21.0,36,17.0,2.0,9
63,68,mongolia,108,0.0,113,16.0,0.0,17
64,69,pakistan,32,1.0,39,1.0,0.0,28
65,71,saudi_arabia,11,7.0,57,12.0,0.0,19


## Matching genres between both datasets

In [38]:
# Distinct genre labels used by the ClassicHit dataset, in order of first appearance;
# these are the labels to map onto the GeoGenre genre groups.
pd.unique(classichit_df['Genre'])

array(['Alt. Rock', 'Blues', 'Country', 'Disco', 'EDM', 'Folk', 'Funk',
       'Gospel', 'Jazz', 'Metal', 'Pop', 'Punk', 'R&B', 'Rap', 'Reggae',
       'Rock', 'SKA', 'Today', 'World'], dtype=object)

In [39]:
# Genre columns = every column of geogenre_df that is not an identifier or
# bookkeeping column. Filtering against a set (instead of list.remove) keeps this
# cell re-runnable: remove() raises ValueError once 'Unnamed: 0' is no longer in
# the frame, and a leftover 'tot' helper column would be mistaken for a genre.
non_genre_columns = {'Country', 'Unnamed: 0', 'tot'}
genre_list = [col for col in geogenre_df.columns if col not in non_genre_columns]

genre_list

['Hip hop/Rap/R&b', 'EDM', 'Pop', 'Rock/Metal', 'Latin/Reggaeton', 'Other']

### Genre proportions per country

In [40]:
# Turn each row's genre values into shares of that row's total across genre_list.
# The total lives in a local Series rather than a temporary column, so nothing is
# left behind in the frame if the cell is interrupted or re-run.
# NOTE(review): a row whose genre values sum to 0 produces NaN/inf shares; confirm
# the raw data contains no such row.
genre_totals = geogenre_df[genre_list].sum(axis=1)

# Row-wise division of all genre columns at once (axis=0 aligns totals on the index).
geogenre_df[genre_list] = geogenre_df[genre_list].div(genre_totals, axis=0)

# Drop the leftover index column; errors="ignore" keeps a re-run from raising
# KeyError when the column has already been removed.
geogenre_df = geogenre_df.drop(columns=["Unnamed: 0"], errors="ignore")

geogenre_df

Unnamed: 0,Country,Hip hop/Rap/R&b,EDM,Pop,Rock/Metal,Latin/Reggaeton,Other
0,australia,0.227941,0.110294,0.477941,0.125000,0.000000,0.058824
1,canada,0.280576,0.086331,0.460432,0.100719,0.000000,0.071942
2,china,0.065868,0.137725,0.646707,0.023952,0.000000,0.125749
3,india,0.208333,0.008333,0.325000,0.000000,0.000000,0.458333
4,mexico,0.240506,0.000000,0.113924,0.012658,0.360759,0.272152
...,...,...,...,...,...,...,...
62,czech_republic,0.330709,0.165354,0.283465,0.133858,0.015748,0.070866
63,mongolia,0.425197,0.000000,0.444882,0.062992,0.000000,0.066929
64,pakistan,0.316832,0.009901,0.386139,0.009901,0.000000,0.277228
65,saudi_arabia,0.103774,0.066038,0.537736,0.113208,0.000000,0.179245


### Drop duplicate countries

In [41]:
# Keep a single row per Country: the first occurrence wins, later repeats are discarded.
geogenre_df = geogenre_df.drop_duplicates(subset='Country', keep='first')

In [42]:
# Write the cleaned table next to the raw inputs; index=False keeps the row
# labels out of the CSV.
cleaned_geogenre_file = os.path.join(data_dir, 'GeoGenre_cleaned.csv')
geogenre_df.to_csv(cleaned_geogenre_file, index=False)