https://github.com/mdeff/fma

In [1]:
import os
import pandas as pd
import sys
import seaborn as sns

In [2]:
# Folder holding the FMA metadata CSV files (tracks, genres, features, echonest).
PATH = r'/home/max/Documents/data/fma_metadata'

# Work from that folder so the relative file names used below resolve.
os.chdir(PATH)

class Data():
    """Load the FMA metadata CSV files and split them into flat per-group frames.

    After construction the instance exposes four pandas DataFrames:
    ``tracks``, ``genres``, ``features`` and ``echonest``.  Calling
    :meth:`segment` additionally sets ``segmented_dfs``.
    """

    def __init__(self, nrows=5000, path=None):
        """Read the four CSV files and move their id column into the index.

        Parameters
        ----------
        nrows : int or None
            Forwarded to ``pandas.read_csv`` for every file; ``None`` reads
            the complete files.
        path : str or None
            Directory containing the CSV files.  ``None`` (the default) reads
            the bare file names, i.e. from the current working directory.
        """
        def locate(name):
            # Keep the bare relative name when no directory was given.
            return name if path is None else os.path.join(path, name)

        self.tracks = pd.read_csv(locate('tracks.csv'), header=[0, 1], nrows=nrows)
        self.genres = pd.read_csv(locate('genres.csv'), header=[0], nrows=nrows)
        self.features = pd.read_csv(locate('features.csv'), header=[0, 1, 2], nrows=nrows)
        self.echonest = pd.read_csv(locate('echonest.csv'), header=[0, 1, 2], nrows=nrows)

        # clean up track index and columns
        # Row 0 is discarded; presumably it only carries the 'track_id' label
        # that follows the two header rows -- confirm against tracks.csv.
        self.tracks = self.tracks.drop(0)
        ix = self.tracks['Unnamed: 0_level_0']['Unnamed: 0_level_1'].rename('track_id')
        self.tracks = self.tracks.set_index(ix).drop('Unnamed: 0_level_0', axis=1)

        # clean genre
        self.genres = self.genres.set_index('genre_id')

        # clean features: the id column becomes the index, then the row whose
        # index label is the literal 'track_id' is removed.
        ix = self.features['feature']['statistics']['number'].rename('track_id')
        self.features = self.features.set_index(ix).drop('feature', axis=1).drop('track_id')

        # clean echonest: same treatment as features.
        ix = self.echonest['Unnamed: 0_level_0']['Unnamed: 0_level_1']['Unnamed: 0_level_2'].rename('track_id')
        self.echonest = self.echonest.set_index(ix).drop('Unnamed: 0_level_0', axis=1).drop('track_id')

    @staticmethod
    def _top_level_labels(df):
        """Return the distinct first-level column labels actually used by *df*.

        ``columns.levels`` is deliberately avoided: it still lists labels of
        columns that were dropped, which previously had to be skipped with a
        positional ``[1:]`` slice.
        """
        return df.columns.get_level_values(0).unique()

    def segment(self):
        """Split the multi-index frames into dictionaries of simpler frames.

        The result is stored in ``self.segmented_dfs`` and also returned.  It
        has four entries:

        * ``'tracks'``   -- dict: first-level column label -> sub-frame
        * ``'genres'``   -- the genres frame, unchanged
        * ``'features'`` -- dict: ``'<group>__<sub_group>'`` -> sub-frame
        * ``'echonest'`` -- dict: second-level label under ``'echonest'``
          -> sub-frame

        Dictionary keys follow the column order of the underlying frames.
        """
        # segmenting tracks df
        tracks_groups = {
            group: self.tracks[group]
            for group in self._top_level_labels(self.tracks)
        }

        # segmenting features df down to the sub-group level
        features_subgroups = {}
        for group in self._top_level_labels(self.features):
            group_df = self.features[group]
            for sub_group in self._top_level_labels(group_df):
                features_subgroups[group + '__' + sub_group] = group_df[sub_group]

        # segmenting echonest df
        echonest = self.echonest['echonest']
        echonest_groups = {
            group: echonest[group]
            for group in self._top_level_labels(echonest)
        }

        self.segmented_dfs = {
            'tracks': tracks_groups,
            'genres': self.genres,
            'features': features_subgroups,
            'echonest': echonest_groups
        }
        return self.segmented_dfs
        
    
# write all the segmented dataframes to csv's
def segment2csv(d):
    """Write every leaf of a (possibly nested) dict to ``'<key>.csv'``.

    Values that are dicts are walked recursively; any other value has its
    ``to_csv`` method called with the key plus ``'.csv'`` as the path, so the
    files land in the current working directory.  Only the innermost key is
    used for the file name.
    """
    for name in d:
        entry = d[name]
        if not isinstance(entry, dict):
            entry.to_csv(name + '.csv')
            continue
        segment2csv(entry)
    


In [3]:
# nrows=None is forwarded to pd.read_csv, so the complete files are read.
data = Data(nrows=None)

  if (yield from self.run_code(code, result)):
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [6]:
# Create the output folder if it is missing; exist_ok makes the cell safe to
# re-run without a try/except around the call.
os.makedirs('segmented', exist_ok=True)

# Relative CSV paths written afterwards land inside the new folder.
os.chdir('segmented')

In [7]:
# Build data.segmented_dfs, then write every frame it holds to '<key>.csv'
# in the current working directory.
data.segment()
segment2csv(data.segmented_dfs)

In [10]:
# Count the entries in the current working directory.
len(os.listdir())

87

### GOAL: split dataframe into smaller dataframes to avoid multi-index

- tracks_groups := dictionary of groups in tracks DF
- genre is already one group
- features_subgroups := split up to subgroup level. Each subgroup is part of one of several larger groups. These are summary statistics of the raw audio data.
    - spectral_centroid
    - chroma_stft
    - rmse
    - spectral_contrast
    - tonnetz
    - spectral_rolloff
    - spectral_bandwidth
    - chroma_cqt
    - zcr
    - chroma_cens
    - mfcc
    - I need to do some research to understand what the above categories mean
    

In [110]:
# first up is tracks

In [44]:
# One sub-frame per first-level column label of the tracks frame; the first
# entry of columns.levels[0] is skipped.
tracks_groups = {
    group: data.tracks[group]
    for group in data.tracks.columns.levels[0][1:]
}

In [54]:
# genre is already done

In [68]:
# Distinct first-level labels of the features columns.
groups = {column[0] for column in data.features.columns}

# One sub-frame per first-level label.
features_groups = {}
for group in groups:
    features_groups[group] = data.features[group]

In [94]:
# Split each feature group once more, keyed as '<group>__<sub_group>'.
features_subgroups = {}

for key, group_df in features_groups.items():
    sub_groups = {column[0] for column in group_df.columns}

    for sub_group in sub_groups:
        features_subgroups['__'.join((key, sub_group))] = group_df[sub_group]
    

In [106]:
# One sub-frame per second-level label under 'echonest'; the first entry of
# columns.levels[1] is skipped.
echonest_groups = {}
for group in data.echonest.columns.levels[1][1:]:
    echonest_groups[group] = data.echonest['echonest'][group]

In [108]:
# Show the names of the echonest groups collected above.
echonest_groups.keys()

dict_keys(['audio_features', 'metadata', 'ranks', 'social_features', 'temporal_features'])

In [111]:
# Gather every segmented frame (or dict of frames) under one mapping.
segmented_dfs = dict(
    tracks=tracks_groups,
    genres=data.genres,
    features=features_subgroups,
    echonest=echonest_groups,
)

In [118]:
### likely target:
# segmented_dfs['tracks']['track']['genre_top']

#### Save each df as csv in new folder to upload to AWS ubuntu server

In [12]:
# Show the current working directory before creating the output folder.
os.getcwd()

'/home/max/Documents/data/fma_metadata'

In [15]:
# exist_ok avoids a FileExistsError when this cell is run a second time.
os.makedirs('segmented', exist_ok=True)
# List the directory to confirm the folder is there.
os.listdir()

['raw_echonest.csv',
 'raw_genres.csv',
 'not_found.pickle',
 'features.csv',
 'checksums',
 'segmented',
 'raw_tracks.csv',
 'genres.csv',
 'tracks.csv',
 'README.txt',
 'echonest.csv',
 'raw_albums.csv',
 'raw_artists.csv']

In [16]:
# Switch into the new folder so relative CSV paths written next land there.
os.chdir('segmented')

In [30]:
# write all the segmented dataframes to csv's
def segment2csv(d):
    """Recursively export the leaves of a nested dict as CSV files.

    A value that is itself a dict is descended into; every other value is
    expected to offer ``to_csv`` and is written to ``key + '.csv'`` relative
    to the current working directory.
    """
    for name in d:
        entry = d[name]
        if not isinstance(entry, dict):
            entry.to_csv(name + '.csv')
            continue
        segment2csv(entry)

In [29]:
# Check whether the 'genres' entry is a dict; segment2csv only recurses into
# dict values and calls to_csv on everything else.
isinstance(data.segmented_dfs['genres'], dict)

False

In [31]:
# Export every segmented frame as a CSV file in the current directory.
d = data.segmented_dfs

segment2csv(d)

In [25]:
# Keep only the tracks whose top genre is Punk or Folk.
genre_top = data.tracks['track']['genre_top']
is_punk = genre_top.eq('Punk')
is_folk = genre_top.eq('Folk')

mask = is_punk | is_folk

folkpunk = data.tracks.loc[mask]

In [26]:
# (rows, columns) of the folk/punk selection.
folkpunk.shape

(2803, 52)

In [27]:
fpix = folkpunk.index  # fpix: index labels of the folk/punk rows

In [28]:
# True for every features row whose index label also appears in fpix.
mask = data.features.index.isin(fpix)

# Preview the first matching feature rows.
data.features[mask].head()

Unnamed: 0_level_0,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
Unnamed: 0_level_1,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
Unnamed: 0_level_2,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
139,-0.020869,0.43233,0.331278,0.829845,2.625593,2.00566,0.907704,0.748191,1.560385,2.565748,...,0.090518,0.017428,0.02149,1.157352,0.26123,0.07076,0.066895,0.000977,0.769163,0.030017
140,0.533579,-0.623885,-1.086205,-1.081079,-0.765151,-0.072282,-0.882913,-0.582376,-0.884749,-0.645214,...,0.157683,0.02807,0.025946,11.052547,0.379395,0.052379,0.036621,0.001953,3.143968,0.057712
141,0.172898,-0.284804,-1.169662,-1.062855,-0.706868,-0.708281,-0.204884,0.023624,-0.64277,-0.786291,...,0.145994,0.024342,0.032111,32.994659,0.415527,0.040267,0.034668,0.00293,4.204097,0.028665
142,-0.58127,3.199484,1.298346,-0.681253,-0.935093,-0.960304,-0.748014,-0.715798,-0.905853,-0.798362,...,0.086817,0.02136,0.022975,4.865969,0.30957,0.06652,0.061035,0.001953,1.625022,0.036644
188,-1.223855,-1.486636,-0.915784,-1.035627,-1.376008,-0.740556,-1.232752,-0.88404,-0.817353,-1.220685,...,0.162705,0.029727,0.031392,12.255019,0.1875,0.036287,0.031738,0.0,2.916365,0.020919


In [9]:
# Draw a random row from the 'tonnetz' feature columns.
data.features.tonnetz.sample()

Unnamed: 0_level_0,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,max,max,max,max,...,skew,skew,skew,skew,std,std,std,std,std,std
Unnamed: 0_level_1,01,02,03,04,05,06,01,02,03,04,...,03,04,05,06,01,02,03,04,05,06
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
112282,0.107922,0.418343,1.2561,0.528885,0.014112,0.883011,0.09955,0.111765,0.45056,0.330993,...,0.055679,-0.276025,0.145894,0.01764,0.025516,0.024768,0.091384,0.085488,0.020367,0.020438


In [102]:
# Show the last rows of the features frame.
data.features.tail()

Unnamed: 0_level_0,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
Unnamed: 0_level_1,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
Unnamed: 0_level_2,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
9554,-0.156214,-0.850736,-0.73355,-0.33982,-0.735142,-1.031866,-1.207657,1.595891,-1.051816,-0.68014,...,0.15045,0.024482,0.026694,33.624317,0.404297,0.038651,0.037109,0.0,3.900559,0.027025
9555,-0.669046,-1.168807,-0.679099,-0.875675,0.4576,-0.05106,-0.233837,0.094572,-0.411417,-0.963449,...,0.109007,0.021616,0.021401,61.777828,0.545898,0.054977,0.047852,0.004883,6.913949,0.047982
9556,1.594526,-1.023309,-1.251871,-1.002714,-0.355053,-0.853487,-0.89438,0.242823,-0.908156,-0.827719,...,0.140481,0.021547,0.023205,44.809021,0.557617,0.053423,0.046387,0.00293,6.209476,0.060019
9557,-1.118935,-0.869945,-0.543931,-0.470382,-0.533076,-1.098251,-0.614707,-1.321757,-1.272636,-1.029226,...,0.189529,0.020813,0.021121,4.537343,0.234863,0.030243,0.027344,0.002441,1.229841,0.019639
9558,-1.220509,-1.11919,-0.643716,-0.279869,0.855649,-0.498396,-0.437475,-0.650538,-0.023109,-0.103927,...,0.136201,0.018579,0.021693,32.297672,0.284668,0.027547,0.023926,0.007812,4.630627,0.017431


In [128]:
# (rows, columns) of the full tracks frame.
data.tracks.shape

(106574, 52)

In [132]:
# (rows, columns) of the 'artist' group split out of the tracks frame.
segmented_dfs['tracks']['artist'].shape

(106574, 17)