# Create mp3 list for genre classes

### Imports

In [1]:
import pickle
import pandas as pd

### Load data

In [2]:
# Read the track metadata CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv('track_data_trimmed.csv')

In [3]:
# Print a summary of the frame: row count, columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147100 entries, 0 to 147099
Data columns (total 9 columns):
id                 147100 non-null int64
release_artist     147100 non-null object
release_cat_num    147097 non-null object
release_date       147100 non-null object
release_genre      147100 non-null object
release_label      147100 non-null object
release_title      147100 non-null object
track_name         147100 non-null object
track_url          147100 non-null object
dtypes: int64(1), object(8)
memory usage: 10.1+ MB


### Genre counts & map to parent genres

In [4]:
# Number of tracks per release genre, most common genre first.
track_genres = (
    df.groupby('release_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
track_genres

Unnamed: 0,release_genre,id
0,Minimal/Tech House,29415
1,Progressive House,14515
2,Funky/Club House,13517
3,Deep House,13098
4,Techno,12394
5,Uplifting Trance,8606
6,Electro House,7169
7,Drum And Bass,6381
8,Dirty Dubstep/Trap/Grime,4164
9,Breakbeat,3829


There are a lot of genres, and many of them have too few songs.

I'm going to map some of these smaller genres to a bigger parent genre and then remove ones that I'm not going to use.

In [5]:
# Lookup table from a `release_genre` value to its broader parent genre.
# 'Not Needed' is a placeholder for genres whose rows are removed later in
# the notebook. Built once here instead of on every `parent_genre` call,
# since the function is applied to every row of the DataFrame.
_PARENT_GENRES = {
    'Minimal/Tech House': 'Minimal House',
    'Progressive House': 'Progressive House',
    'Funky/Club House': 'Funky House',
    'Deep House': 'Deep House',
    'Techno': 'Techno',
    'Uplifting Trance': 'Trance',
    'Electro House': 'Electro House',
    'Drum And Bass': 'Drum And Bass',
    'Dirty Dubstep/Trap/Grime': 'Dubstep/Grime',
    'Breakbeat': 'Breakbeat',
    'Disco/Nu-Disco': 'Disco',
    'Balearic/Downtempo': 'Downtempo',
    'Euro Dance/Pop Dance': 'Euro Dance',
    'Hip Hop/R&B': 'Hip Hop/R&B',
    'Hardstyle': 'Not Needed',
    'Psy/Goa Trance': 'Not Needed',
    'Dancehall/Ragga': 'Dancehall/Ragga',
    'Hard Trance': 'Hard Trance',
    'Indie': 'Rock/Indie',
    'UK Hardcore': 'Not Needed',
    'Hard House': 'Hard House',
    'Experimental/Electronic': 'Not Needed',
    'Pop Trance': 'Not Needed',
    'Bass': 'Not Needed',
    'Broken Beat/Nu Jazz': 'Not Needed',
    'Rock': 'Rock/Indie',
    'Gabba': 'Not Needed',
    'Pop': 'Pop',
    'UK Garage': 'UK Garage',
    'Electro': 'Not Needed',
    'Deep Dubstep': 'Dubstep/Grime',
    'Roots/Lovers Rock': 'Not Needed',
    'Hard Techno': 'Not Needed',
    'Ambient/Drone': 'Not Needed',
    'Funk': 'Not Needed',
    'Scouse House': 'Not Needed',
    'Dub': 'Not Needed',
    'Coldwave/Synth': 'Not Needed',
    'Jazz': 'Not Needed',
    'DJ Tools': 'Not Needed',
    'Industrial/Noise': 'Not Needed',
    'Footwork/Juke': 'Not Needed',
    'Classics/Ska': 'Not Needed',
    'International': 'Not Needed',
    'Soul': 'Not Needed',
    'Soundtracks': 'Not Needed',
    'Leftfield': 'Not Needed',
    '50s/60s': 'Not Needed',
    'Rock (All)': 'Rock/Indie',
}


def parent_genre(s):
    """Return the parent genre for a release genre label.

    Parameters
    ----------
    s : str
        A release genre exactly as it appears in the mapping table
        (the lookup is case- and whitespace-sensitive).

    Returns
    -------
    str
        The parent genre, or 'Not Needed' for genres that are not used.

    Raises
    ------
    KeyError
        If `s` has no entry in the mapping table.
    """
    return _PARENT_GENRES[s]

In [6]:
# Derive the parent genre for every row from its release genre.
df['parent_genre'] = df['release_genre'].map(parent_genre)

In [7]:
# Preview the first rows to check the newly added 'parent_genre' column.
df.head()

Unnamed: 0,id,release_artist,release_cat_num,release_date,release_genre,release_label,release_title,track_name,track_url,parent_genre
0,1,CLEAR VIEW feat JESSICA,SB 215-0,10 Sep 08,Progressive House,Songbird Holland,Tell Me,Tell Me - (6:43),http://www.junodownload.com/MP3/SF1354749-02-0...,Progressive House
1,3,_LINDEN,AED 0027DL,05 Feb 16,Rock,AED,Bones/Broken Glass,Bones - (3:17),http://www.junodownload.com/MP3/SF3007568-02-0...,Rock/Indie
2,5,_NYQUIST,AMREC 008,23 Jan 17,Techno,Ambidextrous,Phenoxy,The Golden Sea - (5:16),http://www.junodownload.com/MP3/SF3258293-02-0...,Techno
3,9,_UNSUBSCRIBE_,BNR 154D,22 Apr 16,Techno,Boysnoize Germany,Penultimate,The Opener - (6:45),http://www.junodownload.com/MP3/SF3052441-02-0...,Techno
4,14,-PF,PF 152,23 Dec 15,Indie,Stereo Parrot,Swearing,Swearing - (2:38),http://www.junodownload.com/MP3/SF3222101-02-0...,Rock/Indie


Look at the genres based on newly created parent genre...

In [8]:
# Number of tracks per parent genre, largest first.
(
    df.groupby('parent_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,parent_genre,id
0,Minimal House,29415
1,Progressive House,14515
2,Not Needed,14276
3,Funky House,13517
4,Deep House,13098
5,Techno,12394
6,Trance,8606
7,Electro House,7169
8,Drum And Bass,6381
9,Dubstep/Grime,4760


Remove the rows that were mapped to the 'Not Needed' placeholder genre.

In [9]:
# Select the rows tagged with the 'Not Needed' placeholder and drop them by index label.
not_needed = df.loc[df['parent_genre'].eq('Not Needed')]
df = df.drop(not_needed.index, axis=0)

# Show the per-genre track counts after the removal.
(
    df.groupby('parent_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,parent_genre,id
0,Minimal House,29415
1,Progressive House,14515
2,Funky House,13517
3,Deep House,13098
4,Techno,12394
5,Trance,8606
6,Electro House,7169
7,Drum And Bass,6381
8,Dubstep/Grime,4760
9,Breakbeat,3829


### Sample 1,000 songs for each genre

I want to take a sample of 1000 songs for each genre.

In [10]:
# Leave 'Pop' and 'UK Garage' out of the sampling pool
# (presumably they have fewer than 1,000 tracks each; TODO confirm).
df_trimmed = df[~df['parent_genre'].isin(['Pop', 'UK Garage'])]

# Per-genre track counts for the genres that remain.
(
    df_trimmed.groupby('parent_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,parent_genre,id
0,Minimal House,29415
1,Progressive House,14515
2,Funky House,13517
3,Deep House,13098
4,Techno,12394
5,Trance,8606
6,Electro House,7169
7,Drum And Bass,6381
8,Dubstep/Grime,4760
9,Breakbeat,3829


In [11]:
# Draw an equally sized random sample from every remaining parent genre.
SAMPLES_PER_GENRE = 1000
SAMPLE_SEED = 42  # fixed seed so a fresh Restart & Run All yields the same sample

# group_keys=False keeps the original row index on the combined result, so
# there are no auto-named 'level_0'/'level_1' columns to drop afterwards;
# reset_index(drop=True) then replaces that index with a clean 0..N-1 range.
# NOTE: DataFrame.sample raises ValueError if a genre has fewer rows than
# SAMPLES_PER_GENRE (sampling is without replacement).
grouped = df_trimmed.groupby('parent_genre', group_keys=False)
equal_sample_df = grouped.apply(
    lambda genre_rows: genre_rows.sample(SAMPLES_PER_GENRE, random_state=SAMPLE_SEED)
).reset_index(drop=True)

In [12]:
# Confirm that every parent genre contributes the same number of tracks.
(
    equal_sample_df.groupby('parent_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,parent_genre,id
0,Trance,1000
1,Techno,1000
2,Dancehall/Ragga,1000
3,Deep House,1000
4,Disco,1000
5,Downtempo,1000
6,Drum And Bass,1000
7,Dubstep/Grime,1000
8,Electro House,1000
9,Euro Dance,1000


### Save data

In [18]:
# Write the equal-per-genre sample to disk as a pickle file.
equal_sample_df.to_pickle('final_data.pkl')