In [49]:
%matplotlib inline

import os

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import librosa
import librosa.display
import ast
from sklearn import svm

import utils

plt.rcParams['figure.figsize'] = (17, 5)

In [50]:
# Directory where mp3 are stored.
AUDIO_DIR = os.environ.get('AUDIO_DIR')

# Load metadata and features.
tracks = utils.load('data/fma_metadata/tracks.csv')
genres = utils.load('data/fma_metadata/genres.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, genres.shape, features.shape, echonest.shape

((106574, 53), (163, 4), (106574, 518), (13129, 249))

In [51]:
smalltracks  = tracks.loc[tracks['subset']=='small']
smalltracks.shape

(8000, 53)

In [52]:
smallfeatures = features.loc[features.index.isin(smalltracks.index)]
smallfeatures.shape

(8000, 518)

In [53]:
smallechonest = echonest.loc[echonest.index.isin(smalltracks.index)]
smallechonest.shape

(1294, 249)

In [54]:
genres['title'].value_counts()

Symphony                 1
House                    1
Black-Metal              1
Sound Art                1
N. Indian Traditional    1
                        ..
Europe                   1
Free-Folk                1
International            1
Ambient Electronic       1
Disco                    1
Name: title, Length: 163, dtype: int64

In [55]:
smalltracks.columns

MultiIndex([( 'album',          'comments'),
            ( 'album',      'date_created'),
            ( 'album',     'date_released'),
            ( 'album',          'engineer'),
            ( 'album',         'favorites'),
            ( 'album',                'id'),
            ( 'album',       'information'),
            ( 'album',           'listens'),
            ( 'album',          'producer'),
            ( 'album',              'tags'),
            ( 'album',             'title'),
            ( 'album',            'tracks'),
            ( 'album',              'type'),
            ('artist', 'active_year_begin'),
            ('artist',   'active_year_end'),
            ('artist', 'associated_labels'),
            ('artist',               'bio'),
            ('artist',          'comments'),
            ('artist',      'date_created'),
            ('artist',         'favorites'),
            ('artist',                'id'),
            ('artist',          'latitude'),
          

In [56]:
smalltracks['genre'] = smalltracks['track','genre_top']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [57]:
smalltracks['testtrainval'] = smalltracks['set','split']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [58]:
cols = ['genre','testtrainval']

In [73]:
genres = smalltracks[cols]

In [74]:
genres

Unnamed: 0_level_0,genre,testtrainval
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Hip-Hop,training
5,Hip-Hop,training
10,Pop,training
140,Folk,training
141,Folk,training
...,...,...
154308,Hip-Hop,test
154309,Hip-Hop,test
154413,Pop,training
154414,Pop,training


In [75]:
genres.head()

Unnamed: 0_level_0,genre,testtrainval
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Hip-Hop,training
5,Hip-Hop,training
10,Pop,training
140,Folk,training
141,Folk,training


In [76]:
genres.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 2 to 155066
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   (genre, )         8000 non-null   category
 1   (testtrainval, )  8000 non-null   object  
dtypes: category(1), object(1)
memory usage: 133.6+ KB


In [63]:
genres_list = set(list(smalltracks['track','genre_top'].values))

In [64]:
genres_list

{'Electronic',
 'Experimental',
 'Folk',
 'Hip-Hop',
 'Instrumental',
 'International',
 'Pop',
 'Rock'}

In [67]:
smallgenres

Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,5271,0,International,2
10,13845,0,Pop,10
12,32923,0,Rock,12
15,34413,0,Electronic,15
17,12706,0,Folk,17
21,8389,0,Hip-Hop,21
38,38154,0,Experimental,38
1235,14938,0,Instrumental,1235


In [77]:
smalltracks.to_csv('data/fma_metadata/smalltracks.csv')
smallfeatures.to_csv('data/fma_metadata/smallfeatures.csv')
smallechonest.to_csv('data/fma_metadata/smallechonest.csv')
smallgenres.to_csv('data/fma_metadata/smallgenres.csv')
genres.to_csv('data/fma_metadata/genrelist.csv')