## Preprocessing data

In [100]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(style="ticks", color_codes=True)

import warnings
warnings.filterwarnings('ignore')

from numpy import random

In [176]:
# Read the renamed raw dataset (latin-1 encoded csv) and preview its first rows.
file = 'data/002_intermediate_data/raw_renamed_data.csv'
raw_data = pd.read_csv(filepath_or_buffer=file, encoding='latin-1')
print('shape raw:', raw_data.shape)
raw_data.head(n=3)

shape raw: (603, 15)


Unnamed: 0,id,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,3,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80


## Classes by `top genre`: balance

In [8]:
# Number of distinct values in the `top genre` column.
raw_data['top genre'].nunique()

50

In [13]:
# Count the songs of every genre, largest genre first.
# Named aggregation (`songs=('id', 'count')`) replaces the dict-renaming form
# `['id'].agg({'songs': 'count'})`, which pandas >= 1.0 rejects with
# "SpecificationError: nested renamer is not supported".
songs_by_genre = (
    raw_data
    .groupby('top genre', as_index=False)
    .agg(songs=('id', 'count'))
    .sort_values('songs', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'top genre': 'genre'})
)
songs_by_genre

Unnamed: 0,genre,songs
0,dance pop,327
1,pop,60
2,canadian pop,34
3,boy band,15
4,barbadian pop,15
5,electropop,13
6,british soul,11
7,big room,10
8,canadian contemporary r&b,9
9,neo mellow,9


In [14]:
# Genres with at least 5 songs
songs_by_genre.loc[songs_by_genre['songs'].ge(5)]

Unnamed: 0,genre,songs
0,dance pop,327
1,pop,60
2,canadian pop,34
3,boy band,15
4,barbadian pop,15
5,electropop,13
6,british soul,11
7,big room,10
8,canadian contemporary r&b,9
9,neo mellow,9


## Group by super genres

The last word of a genre name can indicate its super genre.

In [28]:
# Pop super genre: every genre whose name ends in "pop".
# The match is on the raw suffix, so one-word names such as `electropop` are caught too.
pop = songs_by_genre['genre'].str.endswith('pop')
print(f"Sum of genres: {songs_by_genre.loc[pop, 'genre'].count()}")
print(f"Sum of songs: {songs_by_genre.loc[pop, 'songs'].sum()}")
songs_by_genre.loc[pop]

Sum of genres: 17
Sum of songs: 484


Unnamed: 0,genre,songs
0,dance pop,327
1,pop,60
2,canadian pop,34
4,barbadian pop,15
5,electropop,13
10,art pop,8
11,hip pop,6
14,australian pop,5
20,colombian pop,3
22,acoustic pop,2


In [29]:
# Hip hop super genre: every genre whose name ends in "hip hop".
hiphop = songs_by_genre['genre'].str.endswith('hip hop')
print(f"Sum of genres: {songs_by_genre.loc[hiphop, 'genre'].count()}")
print(f"Sum of songs: {songs_by_genre.loc[hiphop, 'songs'].sum()}")
songs_by_genre.loc[hiphop]

Sum of genres: 5
Sum of songs: 14


Unnamed: 0,genre,songs
15,atl hip hop,5
18,hip hop,4
25,canadian hip hop,2
27,detroit hip hop,2
37,australian hip hop,1


In [30]:
# Indie super genre: every genre whose name ends in "indie".
indie = songs_by_genre['genre'].str.endswith('indie')
print(f"Sum of genres: {songs_by_genre.loc[indie, 'genre'].count()}")
print(f"Sum of songs: {songs_by_genre.loc[indie, 'songs'].sum()}")
songs_by_genre.loc[indie]

Sum of genres: 1
Sum of songs: 1


Unnamed: 0,genre,songs
47,alaska indie,1


In [32]:
# Rock super genre: every genre whose name ends in "rock".
rock = songs_by_genre['genre'].str.endswith('rock')
print(f"Sum of genres: {songs_by_genre.loc[rock, 'genre'].count()}")
print(f"Sum of songs: {songs_by_genre.loc[rock, 'songs'].sum()}")
songs_by_genre.loc[rock]

Sum of genres: 1
Sum of songs: 1


Unnamed: 0,genre,songs
43,celtic rock,1


In [33]:
# Dance super genre: every genre whose name ends in "dance".
dance = songs_by_genre['genre'].str.endswith('dance')
print(f"Sum of genres: {songs_by_genre.loc[dance, 'genre'].count()}")
print(f"Sum of songs: {songs_by_genre.loc[dance, 'songs'].sum()}")
songs_by_genre.loc[dance]

Sum of genres: 1
Sum of songs: 6


Unnamed: 0,genre,songs
13,australian dance,6


In [40]:
# Candidate supergenres: the last word of every genre name.
# `.str[-1]` is the vectorised accessor; unlike `apply(lambda x: x[-1])` it yields NaN
# instead of raising when a genre name is missing or contains no words.
last_genre_name = songs_by_genre['genre'].str.split().str[-1]
last_genre_name.drop_duplicates().reset_index(drop=True)

0                   pop
1                  band
2            electropop
3                  soul
4                  room
5                   r&b
6                mellow
7            complextro
8                 dance
9                   hop
10                  edm
11                 wave
12                latin
13                house
14             folk-pop
15              brostep
16            downtempo
17              electro
18                 trap
19         metropopolis
20    singer-songwriter
21                  rap
22            hollywood
23                 rock
24                indie
25              country
Name: genre, dtype: object

> **Conclusion:** taking the last word of each genre name as its super genre roughly halves the number of classes (from 50 genres to 26 candidate super genres).

## Using supergenre

In [42]:
# Attach the last-word supergenre computed above as a new column.
songs_by_genre = songs_by_genre.assign(supergenre=last_genre_name)
songs_by_genre.head(n=10)

Unnamed: 0,genre,songs,supergenre
0,dance pop,327,pop
1,pop,60,pop
2,canadian pop,34,pop
3,boy band,15,band
4,barbadian pop,15,pop
5,electropop,13,electropop
6,british soul,11,soul
7,big room,10,room
8,canadian contemporary r&b,9,r&b
9,neo mellow,9,mellow


In [44]:
# Total number of songs per last-word supergenre, largest first.
# Named aggregation replaces `['songs'].agg({'songs': 'sum'})`, a dict-renaming call
# that pandas >= 1.0 rejects ("nested renamer is not supported").
(
    songs_by_genre
    .groupby('supergenre', as_index=False)
    .agg(songs=('songs', 'sum'))
    .sort_values('songs', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,supergenre,songs
0,pop,469
1,band,15
2,hop,14
3,electropop,13
4,room,12
5,soul,11
6,r&b,10
7,mellow,9
8,edm,7
9,complextro,6


#### Attention to

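`hip hop` -> the name cannot be split: keeping only its last word would give `hop`, so genres ending in `hip hop` must be mapped as a whole.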

Some pop genres have enough songs to remain as individual genres:

* dance pop: 327
* pop: 60
* canadian pop: 34
* barbadian pop: 15

### Function to map genre to supergenre

In [51]:
def map_supergenre(genre: str) -> str:
    """Map a genre name to its supergenre.

    Rules, applied in order:

    1. Genres listed as exceptions (``dance pop``, ``pop``, ``canadian pop``,
       ``barbadian pop``) are returned unchanged, so they stay as classes of their own.
    2. Any genre ending in ``hip hop`` maps to ``hip hop``, because the last word
       alone (``hop``) is not a meaningful name.
    3. Every other genre maps to its last whitespace-separated word.

    Parameters
    ----------
    genre : str
        Genre name, e.g. ``'detroit hip hop'``.

    Returns
    -------
    str
        The supergenre. An empty or whitespace-only ``genre`` is returned unchanged
        instead of raising ``IndexError``.
    """
    exceptions = ('dance pop', 'pop', 'canadian pop', 'barbadian pop')
    if genre in exceptions:
        return genre
    if genre.endswith('hip hop'):
        return 'hip hop'
    words = genre.split()
    # No words at all: there is nothing to reduce, so hand the input back.
    return words[-1] if words else genre
    
# Quick check of the mapping on kept-as-is genres, hip hop variants and other "... pop" genres.
[map_supergenre(x) for x in ['dance pop', 'pop', 'hip hop', 'new hip hop', 'indie pop', 
                             'australian pop','colombian pop','acoustic pop','folk-pop']]

['dance pop',
 'pop',
 'hip hop',
 'hip hop',
 'pop',
 'pop',
 'pop',
 'pop',
 'folk-pop']

In [62]:
# Adjusted supergenre: the result of `map_supergenre` for every genre.
songs_by_genre = songs_by_genre.assign(
    supergenre_adj=songs_by_genre['genre'].map(map_supergenre)
)
songs_by_genre.head(n=15)

Unnamed: 0,genre,songs,supergenre,supergenre_adj
0,dance pop,327,pop,dance pop
1,pop,60,pop,pop
2,canadian pop,34,pop,canadian pop
3,boy band,15,band,band
4,barbadian pop,15,pop,barbadian pop
5,electropop,13,electropop,electropop
6,british soul,11,soul,soul
7,big room,10,room,room
8,canadian contemporary r&b,9,r&b,r&b
9,neo mellow,9,mellow,mellow


In [65]:
# Total number of songs per adjusted supergenre, largest first.
# Named aggregation replaces `['songs'].agg({'songs': 'sum'})`, a dict-renaming call
# that pandas >= 1.0 rejects ("nested renamer is not supported").
songs_by_supergenre = (
    songs_by_genre
    .groupby('supergenre_adj', as_index=False)
    .agg(songs=('songs', 'sum'))
    .sort_values('songs', ascending=False)
    .reset_index(drop=True)
)
songs_by_supergenre

Unnamed: 0,supergenre_adj,songs
0,dance pop,327
1,pop,93
2,canadian pop,34
3,band,15
4,barbadian pop,15
5,hip hop,14
6,electropop,13
7,room,12
8,soul,11
9,r&b,10


### Criteria for selecting songs of each class

1. Only genres with at least 5 songs will be allowed.
2. For genres with more than 15 songs, only 15 will be chosen randomly.

#### First criterion

In [71]:
# Names of the supergenres that have at least 5 songs.
minimum_songs = songs_by_supergenre.loc[songs_by_supergenre['songs'].ge(5), 'supergenre_adj']
minimum_songs

0         dance pop
1               pop
2      canadian pop
3              band
4     barbadian pop
5           hip hop
6        electropop
7              room
8              soul
9               r&b
10           mellow
11              edm
12       complextro
13            dance
14            latin
15            house
Name: supergenre_adj, dtype: object

In [68]:
# Preview of the raw data before the supergenre is added to it.
raw_data.head()

Unnamed: 0,id,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,3,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
3,4,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
4,5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78


In [69]:
# Use the shorter column name `genre` instead of `top genre`.
raw_data = raw_data.rename(columns={'top genre': 'genre'})
raw_data.head()

Unnamed: 0,id,title,artist,genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,3,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
3,4,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
4,5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78


In [70]:
# Label every song with the supergenre of its genre.
raw_data = raw_data.assign(supergenre=raw_data['genre'].map(map_supergenre))
raw_data.head()

Unnamed: 0,id,title,artist,genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,supergenre
0,1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83,mellow
1,2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82,hip hop
2,3,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80,dance pop
3,4,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79,dance pop
4,5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78,pop


In [93]:
# Shape of the raw frame and number of distinct supergenres it contains.
print('raw - shape:', raw_data.shape)
print('raw - unique supergenre:', raw_data.supergenre.nunique())

raw - shape: (603, 16)
raw - unique supergenre: 29


In [159]:
# First criterion: keep only the songs whose supergenre has at least 5 songs.
minimum_crit = raw_data['supergenre'].isin(minimum_songs.tolist())
preprocessed_data = raw_data.loc[minimum_crit]

print('preprocessed - shape:', preprocessed_data.shape)
print('preprocessed - unique supergenre:', preprocessed_data['supergenre'].nunique())
preprocessed_data['supergenre'].drop_duplicates()

preprocessed - shape: (582, 16)
preprocessed - unique supergenre: 16


0             mellow
1            hip hop
2          dance pop
4                pop
5       canadian pop
9      barbadian pop
50              room
52              soul
108             band
154       complextro
225            house
240       electropop
252            dance
270              r&b
334              edm
440            latin
Name: supergenre, dtype: object

In [160]:
# Preview of the rows that passed the first criterion.
preprocessed_data.head()

Unnamed: 0,id,title,artist,genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,supergenre
0,1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83,mellow
1,2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82,hip hop
2,3,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80,dance pop
3,4,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79,dance pop
4,5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78,pop


#### Second criterion

In [99]:
# Supergenres with more than 15 songs: these are the ones that get down-sampled.
maximum_songs = songs_by_supergenre.loc[songs_by_supergenre['songs'].gt(15)]
maximum_songs

Unnamed: 0,supergenre_adj,songs
0,dance pop,327
1,pop,93
2,canadian pop,34


In [119]:
# Scratch check of the sampling call: draw 10 integers below 20 with `replace=False`,
# i.e. without repeating any value. The generator is created without a seed.
# (alternative import: from numpy.random import default_rng)

rng = random.default_rng()
numbers = rng.choice(20, size=10, replace=False)
print(numbers)

[19  8  9  6 12 15 18  3  1 11]


In [124]:
# Second criterion: for each oversized supergenre draw `nsongs` distinct positions
# (0 .. number_of_songs - 1); later cells use them to index that supergenre's id list.
# `choice(..., replace=False)` guarantees non-repeating positions.
# The generator is seeded so a re-run selects the same songs; outputs recorded with
# the earlier unseeded run will differ until the notebook is executed again.
rng = random.default_rng(42)
nsongs = 15

rand_positions = {}
for supergenre in ('dance pop', 'pop', 'canadian pop'):
    # The population size is always read from `maximum_songs` (the dance pop draw used
    # a hard-coded 327); `.item()` extracts the single matching count as a plain int
    # and raises if the supergenre does not match exactly one row.
    value = maximum_songs.loc[maximum_songs['supergenre_adj'] == supergenre, 'songs'].item()
    rand_positions[supergenre] = rng.choice(value, size=nsongs, replace=False)
    print(value)
    print(rand_positions[supergenre])

# Names used by the following cells.
rand_dance_pop = rand_positions['dance pop']
rand_pop = rand_positions['pop']
rand_canadian_pop = rand_positions['canadian pop']

327
[126 212 259 200  26 257 233 100  99  55 296 319 250 248 193]
93
[55  3 53 33  2 82 39 83 15 74 10 13 48 46 37]
34
[20 24 21 26 14 22  8  7 16 10 28 19 12 25 11]


In [129]:
# The drawn dance pop positions as a plain list.
list(rand_dance_pop)

[126, 212, 259, 200, 26, 257, 233, 100, 99, 55, 296, 319, 250, 248, 193]

In [145]:
# Translate the random positions drawn for `dance pop` into song ids.
ids = list(preprocessed_data.loc[preprocessed_data['supergenre'] == 'dance pop', 'id'])
print(len(ids))
selected_ids = [ids[position] for position in rand_dance_pop]
print(selected_ids)

# The overall selection starts with the dance pop ids.
final_selected_ids = list(selected_ids)
print(final_selected_ids)

327
[213, 381, 467, 359, 41, 461, 411, 172, 171, 87, 534, 572, 440, 438, 349]
[213, 381, 467, 359, 41, 461, 411, 172, 171, 87, 534, 572, 440, 438, 349]


In [146]:
# Translate the random positions drawn for `pop` into song ids.
ids = list(preprocessed_data.loc[preprocessed_data['supergenre'] == 'pop', 'id'])
print(len(ids))
selected_ids = [ids[position] for position in rand_pop]
print(selected_ids)

# Append them to the overall selection.
final_selected_ids += selected_ids
print(final_selected_ids)

93
[293, 22, 275, 175, 12, 570, 216, 571, 69, 511, 44, 60, 259, 235, 212]
[213, 381, 467, 359, 41, 461, 411, 172, 171, 87, 534, 572, 440, 438, 349, 293, 22, 275, 175, 12, 570, 216, 571, 69, 511, 44, 60, 259, 235, 212]


In [147]:
# Translate the random positions drawn for `canadian pop` into song ids.
ids = list(preprocessed_data.loc[preprocessed_data['supergenre'] == 'canadian pop', 'id'])
print(len(ids))
selected_ids = [ids[position] for position in rand_canadian_pop]
print(selected_ids)

# Append them to the overall selection.
final_selected_ids += selected_ids
print(final_selected_ids)

34
[397, 434, 412, 451, 321, 416, 276, 272, 331, 281, 513, 379, 315, 447, 302]
[213, 381, 467, 359, 41, 461, 411, 172, 171, 87, 534, 572, 440, 438, 349, 293, 22, 275, 175, 12, 570, 216, 571, 69, 511, 44, 60, 259, 235, 212, 397, 434, 412, 451, 321, 416, 276, 272, 331, 281, 513, 379, 315, 447, 302]


In [148]:
# Number of selected ids: `nsongs` for each of the three down-sampled supergenres.
len(final_selected_ids)

45

In [150]:
# Rows of the songs sampled from the oversized supergenres, re-indexed from 0.
is_selected = preprocessed_data['id'].isin(final_selected_ids)
maximum_data = preprocessed_data.loc[is_selected].reset_index(drop=True)
print('shape:', maximum_data.shape)
maximum_data

shape: (45, 16)


Unnamed: 0,id,title,artist,genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,supergenre
0,12,Marry You,Bruno Mars,pop,2010,145,83,62,-5,10,48,230,33,4,73,pop
1,22,Whataya Want from Me,Adam Lambert,australian pop,2010,186,68,44,-5,6,45,227,1,5,66,pop
2,41,Something's Got A Hold On Me - Burlesque Origi...,Christina Aguilera,dance pop,2010,150,85,51,-4,12,72,185,47,27,58,dance pop
3,44,Loca,Shakira,colombian pop,2010,112,87,80,-4,9,85,193,19,7,56,pop
4,60,Grenade,Bruno Mars,pop,2011,110,56,71,-7,12,23,223,15,6,75,pop
5,69,Price Tag,Jessie J,australian pop,2011,175,83,64,-4,27,68,223,3,18,72,pop
6,87,Marry The Night,Lady Gaga,dance pop,2011,131,88,61,-4,46,38,265,0,6,61,dance pop
7,171,If I Lose Myself - Alesso vs OneRepublic,OneRepublic,dance pop,2013,126,75,52,-6,14,16,215,26,4,69,dance pop
8,172,The Way,Ariana Grande,dance pop,2013,82,88,65,-3,8,86,227,29,11,68,dance pop
9,175,I Love It (feat. Charli XCX),Icona Pop,candy pop,2013,126,91,71,-3,15,82,157,1,3,67,pop


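Drop the rows belonging to the three largest supergenres (`dance pop`, `pop` and `canadian pop`); they are replaced afterwards by the randomly selected songs of each.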

In [165]:
# Shape of the filtered data before the oversized supergenres are removed.
print('before shape:', preprocessed_data.shape)

before shape: (582, 16)


In [166]:
# Remove every song of the three oversized supergenres in one step.
# `drop` returns a new frame, so `preprocessed_data` itself stays untouched.
drop_indexes = preprocessed_data.index[
    preprocessed_data['supergenre'].isin(['dance pop', 'pop', 'canadian pop'])
]
supressed_data = preprocessed_data.drop(index=drop_indexes)

In [167]:
# Shape and remaining supergenres once the oversized ones have been removed.
print('shape after:', supressed_data.shape)
print('unique after:', supressed_data.supergenre.unique())

shape after: (128, 16)
unique after: ['mellow' 'hip hop' 'barbadian pop' 'room' 'soul' 'band' 'complextro'
 'house' 'electropop' 'dance' 'r&b' 'edm' 'latin']


In [172]:
# Stack the untouched supergenres and the sampled songs into one frame with a fresh 0..n-1 index.
merged_processed_data = pd.concat([supressed_data, maximum_data], ignore_index=True)
print('shape merged (expected 128 + 45 lines):', merged_processed_data.shape)
merged_processed_data.head()

shape merged (expected 128 + 45 lines): (173, 16)


Unnamed: 0,id,title,artist,genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,supergenre
0,1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83,mellow
1,2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82,hip hop
2,10,Only Girl (In The World),Rihanna,barbadian pop,2010,126,72,79,-4,7,61,235,13,4,73,barbadian pop
3,16,OMG (feat. will.i.am),Usher,atl hip hop,2010,130,75,78,-6,36,33,269,20,3,72,hip hop
4,43,Hard,Rihanna,barbadian pop,2010,182,75,31,-4,65,16,251,1,11,57,barbadian pop


In [173]:
# Number of songs per supergenre in the final dataset, largest first.
# Named aggregation replaces `['title'].agg({'songs': 'count'})`, a dict-renaming call
# that pandas >= 1.0 rejects ("nested renamer is not supported").
(
    merged_processed_data
    .groupby('supergenre', as_index=False)
    .agg(songs=('title', 'count'))
    .sort_values('songs', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,supergenre,songs
0,band,15
1,barbadian pop,15
2,canadian pop,15
3,dance pop,15
4,pop,15
5,hip hop,14
6,electropop,13
7,room,12
8,soul,11
9,r&b,10


## Saving preprocessed file

The new dataset is a subset of the original that satisfies the following two criteria.

1. Only genres with at least 5 songs will be allowed. 16 of the 29 supergenres passed this criterion.
2. For genres with more than 15 songs, only 15 will be chosen randomly. Those genres were `dance pop`, `pop` and `canadian pop`.

In [177]:
# Last look at the dataset that is written to disk.
print('final shape:', merged_processed_data.shape)
merged_processed_data.head(3)

final shape: (173, 16)


Unnamed: 0,id,title,artist,genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,supergenre
0,1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83,mellow
1,2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82,hip hop
2,10,Only Girl (In The World),Rihanna,barbadian pop,2010,126,72,79,-4,7,61,235,13,4,73,barbadian pop


In [175]:
# Write the preprocessed dataset as csv without the index column, using the same
# latin-1 encoding the raw file was read with.
merged_processed_data.to_csv('data/002_intermediate_data/preprocessed_data.csv', index=False, encoding='latin-1')