In [1]:
%matplotlib inline

import os

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm, sklearn.model_selection


# Default size for every matplotlib figure: figsize is (width, height) in inches.
plt.rcParams['figure.figsize'] = (17, 5)


# Tracks

In [2]:
# Load the track table: the first CSV column becomes the row index and the
# first two CSV rows become a two-level column header.
tracks = pd.read_csv(
    'tracks.csv',
    header=[0, 1],
    index_col=0,
)

FileNotFoundError: [Errno 2] File b'tracks.csv' does not exist: b'tracks.csv'

In [0]:
# Per-track columns: first rows of the 'track' column group.
tracks.head()['track']

In [0]:
# Per-album columns: first rows of the 'album' column group.
tracks.head()['album']

In [0]:
# Per-artist columns: count the distinct values in the artist 'name' column
# (a missing value, if present, counts as one distinct value).
df = tracks['artist']
is_name_column = df.columns.get_level_values(0) == 'name'
art = df.loc[:, is_name_column]
art['name'].nunique(dropna=False)

In [0]:
# Per-set columns: first rows of the 'set' column group.
tracks.head()['set']

# Genres

In [0]:
# Load the genre table, using the first CSV column as the row index.
genres = pd.read_csv('genres.csv', index_col=0)

# Report how many distinct 'top_level' values exist, then show the rows they
# point at, ordered by the '#tracks' column from largest to smallest.
top_level_ids = genres['top_level'].unique()
print('{} top-level genres'.format(len(top_level_ids)))
genres.loc[top_level_ids].sort_values(by='#tracks', ascending=False)


# Features

In [0]:
# Load the feature table: the first CSV column becomes the row index and the
# first three CSV rows become a three-level column header.
features = pd.read_csv(
    'features.csv',
    header=[0, 1, 2],
    index_col=0,
)

In [0]:
# A single track's features can be accessed by its index label, e.g. features.loc[5].

features

In [0]:
# Distinct labels found in the second level of the feature column header.
second_level_labels = features.columns.get_level_values(1).unique()
print(second_level_labels)

# Echonest

In [0]:
# Load the Echonest table (first column as row index, first three rows as a
# three-level column header) and display it.
echonest = pd.read_csv(
    'echonest.csv',
    header=[0, 1, 2],
    index_col=0,
)
echonest

## Audio Features

In [0]:
# Preview the columns whose second header level is 'audio_features'.
is_audio_feature = echonest.columns.get_level_values(1) == 'audio_features'
echonest.loc[:, is_audio_feature].head()

## Metadata

In [0]:
# Preview the columns whose second header level is 'metadata'.
is_metadata = echonest.columns.get_level_values(1) == 'metadata'
echonest.loc[:, is_metadata].head()

## Social Features

In [0]:
# Preview the columns whose second header level is 'social_features'.
is_social_feature = echonest.columns.get_level_values(1) == 'social_features'
echonest.loc[:, is_social_feature].head()

## Ranks

In [0]:
# Preview the columns whose second header level is 'ranks'.
is_rank = echonest.columns.get_level_values(1) == 'ranks'
echonest.loc[:, is_rank].head()

## Temporal Features

In [0]:
# Preview the columns whose second header level is 'temporal_features'.
is_temporal_feature = echonest.columns.get_level_values(1) == 'temporal_features'
echonest.loc[:, is_temporal_feature].head()

In [0]:
# Leave the frame as the cell's last expression so the notebook renders it as
# a rich table; print() would flatten it to wrapped plain text.
features.head()

In [0]:
# Columns of `features` whose first header level is 'chroma_cens' ...
feature_groups = features.columns.get_level_values(0)
chroma_cens = features.loc[:, feature_groups == 'chroma_cens']
# ... narrowed to those whose second header level is 'mean'.
chroma_stats = chroma_cens.columns.get_level_values(1)
cmean = chroma_cens.loc[:, chroma_stats == 'mean']

# Rows of the 'set' column group whose 'subset' value is 'small'.
sets = tracks['set']
subset = sets.loc[:, sets.columns.get_level_values(0) == 'subset']
small = subset.loc[subset['subset'] == 'small']
small

In [0]:
# Restrict the chroma_cens means to the rows whose index appears in `small`.
in_small_subset = cmean.index.isin(small.index)
cmean = cmean.loc[in_small_subset]
cmean_df = cmean.copy()

# Same restriction on the track table; `tsdf` is its 'track' column group.
tracks_small = tracks.loc[tracks.index.isin(small.index)]
tracks_sdf = tracks_small.copy()
tsdf = tracks_sdf['track']

# Add each row's 'genre_top' value as a 'genre' column, then show the result.
cmean_df['genre'] = tsdf['genre_top']
cmean_df

In [0]:
# Pairwise plot of the '01' and '02' chroma_cens mean columns, coloured by genre.
sns.set(style='white', color_codes=True)
sns.pairplot(
    cmean_df,
    hue='genre',
    vars=[('chroma_cens', 'mean', '01'), ('chroma_cens', 'mean', '02')],
)

In [0]:
#"get" small subset df(track ids for all songs in small subset)
subset = sets.loc[:, sets.columns.get_level_values(0) == 'subset']
small = subset.loc[subset['subset'] == 'small']


def _columns_where(frame, level, label):
    """Return the columns of `frame` whose header label at `level` equals `label`."""
    return frame.loc[:, frame.columns.get_level_values(level) == label]


# Main feature groups; each one contains a set of statistics.
# chroma energy normalized
chroma_cens = _columns_where(features, 0, 'chroma_cens')
# logarithmically spaced frequency axis
chroma_cqt = _columns_where(features, 0, 'chroma_cqt')
chroma_stft = _columns_where(features, 0, 'chroma_stft')
mfcc = _columns_where(features, 0, 'mfcc')
rmse = _columns_where(features, 0, 'rmse')
tonnetz = _columns_where(features, 0, 'tonnetz')

# Use the 'mean' statistic only.
ccens_mean = _columns_where(chroma_cens, 1, 'mean')
ccqt_mean = _columns_where(chroma_cqt, 1, 'mean')
cstft_mean = _columns_where(chroma_stft, 1, 'mean')
mfcc_mean = _columns_where(mfcc, 1, 'mean')
# For tonnetz, keep just the '06' mean column.
tonnetz_mean = _columns_where(_columns_where(tonnetz, 1, 'mean'), 2, '06')

# Combine the three chroma groups and keep the rows whose index is in `small`.
chroma_mean = ccens_mean.join(ccqt_mean, lsuffix='_caller', rsuffix='_other')
chroma_mean = chroma_mean.join(cstft_mean, lsuffix='_caller', rsuffix='_other')
# .copy(): the genre column below is then added to an independent frame rather
# than to a filtered slice, which is what triggers SettingWithCopyWarning.
chroma_mean = chroma_mean[chroma_mean.index.isin(small.index)].copy()
chroma_mean['root_genre'] = tsdf['genre_top']

sns.set(style='white', color_codes=True)
# Just looking at chroma.
sns.pairplot(chroma_mean, vars=[('chroma_cens', 'mean', '01'), ('chroma_cens', 'mean', '04'), ('chroma_cqt', 'mean', '01'), ('chroma_cqt', 'mean', '04'), ('chroma_stft', 'mean', '01'), ('chroma_stft', 'mean', '04')], hue='root_genre')

# MFCC means for the same rows, joined onto the chroma frame.
mfcc_mean = mfcc_mean[mfcc_mean.index.isin(small.index)]
feat_df = chroma_mean.join(mfcc_mean, lsuffix='_caller', rsuffix='_other')

sns.pairplot(feat_df, vars=[('chroma_cens', 'mean', '01'), ('chroma_cens', 'mean', '04'), ('chroma_cens', 'mean', '07'), ('mfcc', 'mean', '01'), ('mfcc', 'mean', '04'), ('mfcc', 'mean', '07')], hue='root_genre')

# Trailing ';' keeps the PairGrid repr out of the cell output.
sns.pairplot(chroma_mean, vars=[('chroma_cens', 'mean', '01'), ('chroma_cens', 'mean', '02'), ('chroma_cens', 'mean', '03'), ('chroma_cens', 'mean', '04'), ('chroma_cens', 'mean', '05'), ('chroma_cens', 'mean', '06'), ('chroma_cens', 'mean', '07')], hue='root_genre');


In [0]:
def makeKfold(df, k, random_state=None):
    """Split `df` into `k` shuffled cross-validation folds.

    Parameters
    ----------
    df : pandas object supporting `.iloc`
        Data whose rows are split; rows are selected positionally.
    k : int
        Number of folds, passed to `KFold` as `n_splits`.
    random_state : optional, default None
        Forwarded to `KFold` so the shuffle can be made reproducible. The
        default (None) keeps the earlier behaviour of an unseeded shuffle.

    Returns
    -------
    dict
        For each fold number `i` (counting from 0), the key `(i, "training")`
        maps to the training rows of `df` and `(i, "testing")` to the held-out
        rows of that fold.
    """
    kf = skl.model_selection.KFold(n_splits=k, shuffle=True, random_state=random_state)
    df_collection = {}
    for i, (train_index, test_index) in enumerate(kf.split(df)):
        df_collection[i, "training"] = df.iloc[train_index]
        df_collection[i, "testing"] = df.iloc[test_index]
    return df_collection