In [6]:
import os

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast

#import utils

# Directory prefix for the metadata CSVs; the loaders below append "tracks.csv" / "features.csv" to it.
METADATA_DIR = "./fma_metadata/"

def load(filepath):
    """
    Load one metadata CSV into a DataFrame, choosing the parser from the file name.

    Based off code from the fma github.

    The base name of ``filepath`` selects the layout:

    * contains ``"features"`` or ``"echonest"``: three header rows, first
      column as index.
    * contains ``"genres"``: a single header row, first column as index.
    * contains ``"tracks"``: two header rows, first column as index, followed
      by per-column type conversion (lists, datetimes, categoricals).

    Parameters
    ----------
    filepath : str
        Path to the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when the file name matches none of the
        known layouts (the file is not read in that case).
    """
    filename = os.path.basename(filepath)

    # Both files share the same three-level column header layout.
    if "features" in filename or "echonest" in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if "genres" in filename:
        return pd.read_csv(filepath, index_col=0)

    if "tracks" in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        # These columns hold Python list literals stored as text; literal_eval
        # parses them without executing arbitrary code.
        COLUMNS = [("track", "tags"), ("album", "tags"), ("artist", "tags"),
                   ("track", "genres"), ("track", "genres_all")]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [("track", "date_created"), ("track", "date_recorded"),
                   ("album", "date_created"), ("album", "date_released"),
                   ("artist", "date_created"), ("artist", "active_year_begin"),
                   ("artist", "active_year_end")]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        # astype("category", categories=..., ordered=...) was removed from
        # pandas; an explicit CategoricalDtype is the supported way to request
        # an ordered categorical with a fixed category list.
        SUBSETS = ("small", "medium", "large")
        subset_dtype = pd.CategoricalDtype(categories=SUBSETS, ordered=True)
        tracks["set", "subset"] = tracks["set", "subset"].astype(subset_dtype)

        COLUMNS = [("track", "genre_top"), ("track", "license"),
                   ("album", "type"), ("album", "information"),
                   ("artist", "bio")]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype("category")

        return tracks

    # Unrecognised file name: keep the original behaviour of returning None.
    return None

def extract_features(data):
    """
    Build a per-composer list of feature dictionaries for the given tracks.

    Parameters
    ----------
    data : dict
        Maps a composer name to an iterable of track rows. Each row must
        expose its track id as ``.name`` and its title under the
        ``("track", "title")`` key (as the rows produced by ``load_data`` do).

    Returns
    -------
    dict
        Maps each composer to a list with one dict per track. Every dict has
        a ``"title"`` entry plus one entry per ``(feature, statistic)`` pair
        in ``keywords``; an entry is ``None`` when the lookup in the features
        table raises ``KeyError`` (e.g. the track id is not present).
    """
    features = load(METADATA_DIR + "features.csv")

    keywords = [("spectral_centroid", "mean"), ("spectral_centroid", "std"),
                ("chroma_stft", "mean"), ("chroma_stft", "std")]
    feature_table = {}
    for composer in data:
        feature_table[composer] = []
        for track in data[composer]:
            tid = track.name
            track_dict = {"title": track["track", "title"]}
            for keyword in keywords:
                try:
                    # Select the (feature, statistic) sub-table, the row for
                    # this track id, then the "01" column as a scalar.
                    track_dict[keyword] = features[keyword].loc[[tid]]["01"].item()
                except KeyError:
                    # The original wrote to an undefined name here, turning a
                    # missing track into a NameError; record None instead.
                    track_dict[keyword] = None
            feature_table[composer].append(track_dict)
    return feature_table

def load_data(composers_to_learn=None):
    """
    Group the Classical tracks that have a composer by composer name.

    Only ``tracks.csv`` is read; rows are kept when ``("track", "genre_top")``
    equals ``"Classical"`` and ``("track", "composer")`` is not null.

    Parameters
    ----------
    composers_to_learn : sequence of str, optional
        When given (and non-empty), a track is relabelled with any requested
        name that occurs as a substring of its composer field, and tracks
        whose resulting label is not one of the requested names are dropped.
        When omitted or empty, every composer is kept under its raw name.

    Returns
    -------
    dict
        Maps composer label to the list of matching track rows (one
        ``pandas.Series`` per track, named by the track's index value).
    """
    tracks = load(METADATA_DIR + "tracks.csv")
    #genres = load(METADATA_DIR + "genres.csv")
    # features.csv was previously loaded here into a local that was never
    # used; the read is dropped because nothing in this function needs it.

    is_classical = tracks["track", "genre_top"] == "Classical"
    has_composer = tracks["track", "composer"].notnull()
    tracks = tracks[is_classical & has_composer]

    composer_dict = {}
    for _, row in tracks.iterrows():
        composer = row["track", "composer"]
        if composers_to_learn:
            # No early exit: each requested name is tested in order against
            # the current label, exactly as before.
            for wanted in composers_to_learn:
                if wanted in composer:
                    composer = wanted
            if composer not in composers_to_learn:
                continue
        composer_dict.setdefault(composer, []).append(row)

    return composer_dict

def count_data(composer_dict):
    """Return a dict mapping each key of ``composer_dict`` to the length of its value."""
    return {name: len(entries) for name, entries in composer_dict.items()}

# Driver: group tracks by composer, print how many tracks each has, then extract and print features.
if __name__=="__main__":
    # Names handed to load_data to restrict which composers are kept.
    composers_to_learn = ["Bach", "Haydn", "Alkan", "Orff"]
    composer_data = load_data(composers_to_learn)
    # Number of collected tracks per composer.
    count_dict = count_data(composer_data)
    print(count_dict)
    # `features` stays bound at module level; a later cell displays it again.
    features = extract_features(composer_data)
    print(features)



{'Alkan': 25, 'Orff': 24, 'Haydn': 44, 'Bach': 348}
{'Alkan': [{('chroma_stft', 'std'): 0.22373536229000002, ('chroma_stft', 'mean'): 0.23956035078000001, ('spectral_centroid', 'mean'): 343.8142395, ('spectral_centroid', 'std'): 382.49560547, 'title': 'Alkan, 1ere Suite, Op 31, 1 Lentement'}, {('chroma_stft', 'std'): 0.34158027172, ('chroma_stft', 'mean'): 0.3837146461, ('spectral_centroid', 'mean'): 341.66360474, ('spectral_centroid', 'std'): 341.1151123, 'title': 'Alkan, 1ere Suite, Op 31, 2 Assez lentement'}, {('chroma_stft', 'std'): 0.29972076416, ('chroma_stft', 'mean'): 0.30122366548, ('spectral_centroid', 'mean'): 317.69174194, ('spectral_centroid', 'std'): 327.80584717, 'title': 'Alkan, 1ere Suite, Op 31, 3 Dans le genre ancien, Tres lentement'}, {('chroma_stft', 'std'): 0.37772509456000003, ('chroma_stft', 'mean'): 0.54739999771, ('spectral_centroid', 'mean'): 337.7086792, ('spectral_centroid', 'std'): 369.09136963, 'title': 'Alkan, 1ere Suite, Op 31, 4 Priere du soir, Assez l

In [7]:
#import sklearn as skl
#import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
#import librosa
#import librosa.display

In [8]:
# Show the per-composer feature table assigned by the driver block above.
features

{'Alkan': [{'title': 'Alkan, 1ere Suite, Op 31, 1 Lentement',
   ('chroma_stft', 'mean'): 0.23956035078000001,
   ('chroma_stft', 'std'): 0.22373536229000002,
   ('spectral_centroid', 'mean'): 343.8142395,
   ('spectral_centroid', 'std'): 382.49560547},
  {'title': 'Alkan, 1ere Suite, Op 31, 2 Assez lentement',
   ('chroma_stft', 'mean'): 0.3837146461,
   ('chroma_stft', 'std'): 0.34158027172,
   ('spectral_centroid', 'mean'): 341.66360474,
   ('spectral_centroid', 'std'): 341.1151123},
  {'title': 'Alkan, 1ere Suite, Op 31, 3 Dans le genre ancien, Tres lentement',
   ('chroma_stft', 'mean'): 0.30122366548,
   ('chroma_stft', 'std'): 0.29972076416,
   ('spectral_centroid', 'mean'): 317.69174194,
   ('spectral_centroid', 'std'): 327.80584717},
  {'title': 'Alkan, 1ere Suite, Op 31, 4 Priere du soir, Assez lentement',
   ('chroma_stft', 'mean'): 0.54739999771,
   ('chroma_stft', 'std'): 0.37772509456000003,
   ('spectral_centroid', 'mean'): 337.7086792,
   ('spectral_centroid', 'std'): 3

In [12]:
# `features` is keyed by composer name; each value is the list of per-track
# feature dicts (which carry the "title" entry), so look up by composer.
features['Alkan']

1. Visualize
 - PCA to reduce dimensionality - use scipy
 - scatter plot of features
 - to visualize the data
 - to see performance of the classifier
2. AdaBoost
3. Random Forest