In [15]:
import os

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
from sklearn.preprocessing import MinMaxScaler

# Directory holding the FMA metadata CSV files. File names are appended with
# plain string concatenation below, so the trailing slash is required.
METADATA_DIR = "./fma_metadata/"

In [16]:
def load(filepath):
    """
    Load one FMA metadata CSV file into a pandas DataFrame.

    Based off code from the fma github.

    The kind of file is decided by substring match on the base name of
    ``filepath``, checked in this order:

    * ``features`` / ``echonest``: read with a three-row column header.
    * ``genres``: read with a single header row.
    * ``tracks``: read with a two-row column header, then post-processed:
      list-like columns are parsed with ``ast.literal_eval``, date columns
      are converted with ``pd.to_datetime``, ``("set", "subset")`` becomes an
      ordered categorical (small < medium < large) and a few text columns
      become plain categoricals.

    Parameters
    ----------
    filepath : str
        Path to the CSV file. The first column is used as the index.

    Returns
    -------
    pandas.DataFrame or None
        The loaded table, or ``None`` when the file name matches none of the
        known kinds (no file is read in that case).
    """
    filename = os.path.basename(filepath)

    # features.csv and echonest.csv are read exactly the same way.
    if "features" in filename or "echonest" in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if "genres" in filename:
        return pd.read_csv(filepath, index_col=0)

    if "tracks" in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        # These columns hold Python literals (e.g. "[1, 2]") stored as text.
        COLUMNS = [("track", "tags"), ("album", "tags"), ("artist", "tags"),
                   ("track", "genres"), ("track", "genres_all")]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [("track", "date_created"), ("track", "date_recorded"),
                   ("album", "date_created"), ("album", "date_released"),
                   ("artist", "date_created"), ("artist", "active_year_begin"),
                   ("artist", "active_year_end")]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        SUBSETS = ("small", "medium", "large")
        try:
            # ``astype("category", categories=..., ordered=...)`` was removed
            # from pandas; an explicit CategoricalDtype is the supported form.
            subset_dtype = pd.CategoricalDtype(categories=list(SUBSETS),
                                               ordered=True)
        except AttributeError:
            # Very old pandas without pd.CategoricalDtype: keep legacy call.
            tracks["set", "subset"] = tracks["set", "subset"].astype(
                    "category", categories=SUBSETS, ordered=True)
        else:
            tracks["set", "subset"] = tracks["set", "subset"].astype(
                    subset_dtype)

        COLUMNS = [("track", "genre_top"), ("track", "license"),
                   ("album", "type"), ("album", "information"),
                   ("artist", "bio")]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype("category")

        return tracks

    # Unknown file kind: nothing is loaded.
    return None

In [17]:
def extract_features(data, features=None):
    """
    Look up a fixed set of audio features for every track in ``data``.

    Parameters
    ----------
    data : dict
        Maps a composer name to a list of track rows. Each row must expose
        its track id as ``.name`` and its title under ``["track", "title"]``
        (the rows produced by ``DataFrame.iterrows`` in ``load_data`` do).
    features : pandas.DataFrame, optional
        Feature table indexed by track id with three-level columns
        (feature, statistic, number). When omitted, ``features.csv`` is
        loaded from ``METADATA_DIR``; pass a preloaded table to avoid
        re-reading the file on every call.

    Returns
    -------
    dict
        Maps each composer to a list of dicts, one per track. Every dict has
        a ``"title"`` entry plus one entry per (feature, statistic) keyword
        holding the value found in the ``"01"`` sub-column. A keyword whose
        lookup raises ``KeyError`` (e.g. the track id is absent from the
        feature table) is stored as ``None``.
    """
    if features is None:
        features = load(METADATA_DIR + "features.csv")

    keywords = [("spectral_centroid", "mean"), ("spectral_centroid", "std"),
                ("chroma_stft", "mean"), ("chroma_stft", "std")]
    feature_table = {}
    for composer, tracks in data.items():
        track_dicts = []
        for track in tracks:
            tid = track.name
            track_dict = {"title": track["track", "title"]}
            for keyword in keywords:
                try:
                    track_dict[keyword] = features[keyword].loc[[tid]]["01"].item()
                except KeyError:
                    # Record the gap under the same key instead of failing;
                    # the original handler referenced an undefined name here.
                    track_dict[keyword] = None
            track_dicts.append(track_dict)
        feature_table[composer] = track_dicts
    return feature_table

In [18]:
def load_data(composers_to_learn=None):
    """
    Group the classical tracks that have a composer by composer name.

    ``tracks.csv`` is loaded from ``METADATA_DIR`` and restricted to rows
    whose top genre is ``"Classical"`` and whose composer field is not null.

    Parameters
    ----------
    composers_to_learn : sequence of str, optional
        When given (and non-empty), a track's composer field is replaced by
        a listed name that occurs in it as a substring, and tracks matching
        no listed name are dropped. The replacement runs through the whole
        list without stopping, each test being made against the current
        (possibly already replaced) value, so the last match wins.
        When omitted, every composer string is kept verbatim.

    Returns
    -------
    dict
        Maps each composer name to the list of matching track rows
        (``pandas.Series`` objects yielded by ``DataFrame.iterrows``).
    """
    tracks = load(METADATA_DIR + "tracks.csv")

    is_classical = tracks["track", "genre_top"] == "Classical"
    has_composer = tracks["track", "composer"].notnull()
    tracks = tracks[is_classical & has_composer]

    composer_dict = {}
    for _, row in tracks.iterrows():
        composer = row["track", "composer"]
        if composers_to_learn:
            # Collapse e.g. "Johann Sebastian Bach" to the short name "Bach".
            for c in composers_to_learn:
                if c in composer:
                    composer = c
            if composer not in composers_to_learn:
                continue
        composer_dict.setdefault(composer, []).append(row)

    return composer_dict

In [19]:
def get_learning_data(data):
    """
    Make data usable by sklearn.

    Parameters
    ----------
    data : dict
        Maps a composer name to a list of per-track dicts (as returned by
        ``extract_features``).

    Returns
    -------
    tuple of (list, list)
        ``x`` holds one feature vector per track: the values of every entry
        of the track dict except ``"title"``, in the dict's iteration order.
        ``y`` holds the composer name for the track at the same position.
    """
    x = []
    y = []
    for composer, track_dicts in data.items():
        for track_dict in track_dicts:
            y.append(composer)
            # The title is a label for humans, not a numeric feature.
            x.append([value for name, value in track_dict.items()
                      if name != "title"])
    return x, y

In [20]:
def count_data(composer_dict):
    """
    Return a dict mapping each key of ``composer_dict`` to the length of
    the value stored under it.
    """
    return {name: len(entries) for name, entries in composer_dict.items()}

In [21]:
# Restrict the data set to four composers and print how many rows each one has.
composers_to_learn = ["Bach", "Haydn", "Alkan", "Orff"]
composer_data = load_data(composers_to_learn=composers_to_learn)
count_dict = count_data(composer_data)
for c in count_dict:
    print("{}: {}".format(c, count_dict[c]))

  


Alkan: 25
Orff: 24
Bach: 348
Haydn: 44


In [24]:
# Build the per-composer feature dicts for the selected tracks.
features = extract_features(data=composer_data)

In [27]:
# Turn the feature dicts into sklearn inputs, then rescale the feature
# matrix with a MinMaxScaler fitted on that same matrix.
x, y = get_learning_data(features)
scaler = MinMaxScaler().fit(x)
x = scaler.transform(x)
print(x)
print(len(x), len(y))

[[0.07084949 0.32044548 0.20633779 0.28088247]
 [0.06963015 0.27990858 0.50492548 0.72841942]
 [0.05603891 0.26687064 0.33406139 0.56945052]
 ...
 [0.28225449 0.22307405 0.39757253 0.57968829]
 [0.38078435 0.34880953 0.81390657 0.74882152]
 [0.32410215 0.15655234 0.63408223 0.80394293]]
441 441


1. visualize
 - PCA to reduce dimension - use scipy
 - scatter plot of features
 - to visualize the data
 - to see performance of the classifier
2. AdaBoost
3. Random Forest