# FMA: A Dataset For Music Analysis

Kirell Benzi, Michaël Defferrard, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Dataset generation

This notebook aims at extracting and storing all the tracks of the [Free Music Archive](http://freemusicarchive.org/) to create a new dataset. The database `input` table was filled manually using `fma_crawler.py`. It also downloads and creates 30sec clips from the songs.

In [1]:
import os
import time
import itertools
import numpy as np
import scipy as sp
import pandas as pd
import networkx as nx
import matplotlib as mpl
import matplotlib.pyplot as plt

import rethinkdb as r
import requests

# Customize plot colors for dark backgrounds
%matplotlib inline
# Colour overrides applied one by one to the global matplotlib rcParams:
# grey axes frame, teal grid and ticks, cyan text and axis labels.
for rc_key, rc_color in [
        ('axes.edgecolor', 'grey'),
        ('grid.color', '#66CCCC'),
        ('text.color', '#0EBFE9'),
        ('xtick.color', '#66CCCC'),
        ('ytick.color', '#66CCCC'),
        ('axes.labelcolor', '#0EBFE9'),
]:
    mpl.rcParams[rc_key] = rc_color


import IPython
HOME_DIR = IPython.utils.path.get_home_dir()
ROOT_DIR = os.path.join(HOME_DIR, 'work/mjf')
NOTEBOOK_DIR = os.path.join(ROOT_DIR, 'notebook')
CODE_DIR = os.path.join(ROOT_DIR, 'mjf')

%load_ext autoreload
%autoreload 2
os.sys.path.append(os.path.dirname(os.path.abspath('.')))
os.sys.path.append(CODE_DIR)

import mjf
import mjf.crawlers.fma_crawler
import mjf.crawlers.song_crawler
import mjf.db
import mjf.utils
import mjf.graph
import mjf.plots

import IPython.utils.path
# DATA_DIR = os.path.join(IPython.utils.path.get_home_dir(), 'local/freemusicarchive/')
# Root of the local Free Music Archive data, located under the user's home directory.
DATA_DIR = os.path.join(IPython.utils.path.get_home_dir(), 'data/freemusicarchive/')
# print() as a function call (like every other cell) so the cell is valid on
# both Python 2 and Python 3; the printed line is unchanged.
print('Data directory: ' + DATA_DIR)
# Full-length songs and the short clips cut from them live in sibling folders.
MUSIC_DIR = os.path.join(DATA_DIR, 'music/')
CLIP_DIR = os.path.join(DATA_DIR, 'clips/')

Data directory: /home/kikohs/data/freemusicarchive/


## 1 Large: full dataset

In [2]:
#a = r.db(conf['project']).table('input')
#a.get_all().run(conn)
#a.order_by('fma_id').keys().run(conn)

In [3]:
# Load the crawled track metadata from the RethinkDB `input` table.
config_path = os.path.join(ROOT_DIR, 'resources/config_fma.json')
conf = mjf.utils.parse_config(config_path)
conn = mjf.db.get_connection(conf)
dataset_name = conf['project']
keys = ['artist_name', 'track_title', 'track_listens', 'track_genres']
df = mjf.db.get_all(dataset_name, conn, 'input', keys)

# Map the FMA field names onto the shorter column names used in the rest of
# the notebook.
columns = {
    'artist_name': 'artist',
    'track_title': 'title',
    'track_genres': 'genres',
    'track_listens': 'play_count',
}
df = df.rename(columns=columns)

print('Number of songs: {}'.format(len(df)))
df.head()

Number of songs: 89912


Unnamed: 0_level_0,artist,genres,play_count,title
fma_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,AWOL,"[{u'genre_title': u'Hip-Hop', u'genre_url': u'...",874,Food
3,AWOL,"[{u'genre_title': u'Hip-Hop', u'genre_url': u'...",453,Electric Ave
5,AWOL,"[{u'genre_title': u'Hip-Hop', u'genre_url': u'...",1108,This World
10,Kurt Vile,"[{u'genre_title': u'Pop', u'genre_url': u'http...",42936,Freeway
20,Nicky Cook,"[{u'genre_title': u'Experimental Pop', u'genre...",288,Spiritual Level


In [4]:
# Full tracks on disk, indexed by the integer parsed from the file name
# (the part before the first '.').
tracks = list(mjf.utils.folder_walker(MUSIC_DIR))
print('Number of tracks: {}'.format(len(tracks)))
# Kept under its own name: previously this was stored in `idx` and then
# immediately overwritten by the clip index below.
track_idx = pd.Series(tracks, index=[int(os.path.split(p)[1].split('.')[0]) for p in tracks])

# Clips on disk. The first five characters of the file name (the 'clip_'
# prefix used when clips are copied later in this notebook) are skipped before
# parsing the id.
clips = list(mjf.utils.folder_walker(CLIP_DIR))
print('Number of clips: {}'.format(len(clips)))
# A list comprehension (instead of map) gives a concrete index on Python 3 too.
idx = pd.Series(clips, index=[int(os.path.split(p)[1].split('.')[0][5:]) for p in clips])

# Filter songs that have a clip on disk
df = df[df.index.isin(idx.index)]
print('Number of songs: {}'.format(len(df)))

Number of tracks: 89068
Number of clips: 86992
Number of songs: 86886


In [5]:
# Remove songs without genre.
print('Number of songs without genre: {}'.format(len(df)-len(df.genres.dropna())))
# NOTE: dropna() without `subset` discards rows with a missing value in ANY
# column, not only 'genres'.
df = df.dropna()
print('Number of songs: {}'.format(len(df)))

# Extract canonical genre: reduce each genre record to its title and take the
# first title as the song's top genre. A list comprehension is used so that
# 'genres' holds real lists on Python 3 as well (map() is lazy there and cannot
# be indexed with [0]).
df['genres'] = df['genres'].map(lambda row: [g['genre_title'] for g in row])
df['top_genre'] = df['genres'].map(lambda titles: titles[0])
df.head()

Number of songs without genre: 6939
Number of songs: 79947


Unnamed: 0_level_0,artist,genres,play_count,title,top_genre
fma_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,AWOL,[Hip-Hop],874,Food,Hip-Hop
3,AWOL,[Hip-Hop],453,Electric Ave,Hip-Hop
5,AWOL,[Hip-Hop],1108,This World,Hip-Hop
10,Kurt Vile,[Pop],42936,Freeway,Pop
20,Nicky Cook,"[Experimental Pop, Singer-Songwriter]",288,Spiritual Level,Experimental Pop


In [6]:
# Drop rare genres: only genres with strictly more than n songs are kept.
n = 100
top_genres = df['top_genre'].value_counts()
print('Number of genres: {}'.format(len(top_genres)))
top_genres = top_genres[top_genres > n]
print('Number of genres: {}'.format(len(top_genres)))

# Keep only the songs whose top genre survived the cut.
df = df[df['top_genre'].isin(top_genres.index)]
print('Number of songs: {}'.format(len(df)))

#top_genres

Number of genres: 138
Number of genres: 68
Number of songs: 77643


In [7]:
# Export the metadata of the full ("large") dataset as JSON.
path = os.path.join(DATA_DIR, 'datasets_mdeff', 'fma_large')
# to_json does not create missing folders, so make sure the target exists
# (keeps the cell runnable on a fresh data directory).
if not os.path.isdir(path):
    os.makedirs(path)
df.to_json(os.path.join(path, 'fma_large.json'))

## 2 Medium: echonest features, 20 unbalanced genres

In [8]:
if False:
    # Disabled check: collect the ids logged with status_code 0 and drop the
    # corresponding songs from the dataset.
    res = r.db('fma').table('log').filter({'status_code': 0}).pluck('id').run(conn)
    # Materialised as a list so len() also works on Python 3.
    idx = [x['id'] for x in res]
    print('Number of songs without echonest features: {}'.format(len(idx)))

    # NOTE(review): this used to filter on `error_ids`, a name that is never
    # defined in the notebook; the ids gathered just above are presumably what
    # was meant -- confirm before enabling this block.
    df = df[~df.index.isin(idx)]
    print('Number of songs: {}'.format(len(df)))

In [9]:
# Fetch the Echonest features and keep only the songs that have them.
df_echonest = mjf.db.get_from_songs_and_tracks(dataset_name, conn, mjf.db.ALL_KEYS)
print('Echonest: {}'.format(df_echonest.shape))
print('Dataset: {}'.format(df.shape))

has_features = df.index.isin(df_echonest.index)
# df already carries a 'title' column, so the Echonest one is left out of the join.
echonest_extra = df_echonest.drop('title', axis=1)
df = df[has_features].join(echonest_extra)
print('Dataset: {}'.format(df.shape))
# album_name and album_date are often unknown (NaN)
df.head()

Echonest: (14570, 25)
Dataset: (77643, 5)
Dataset: (14511, 29)


Unnamed: 0_level_0,artist,genres,play_count,title,top_genre,acousticness,album_date,album_name,artist_discovery,artist_discovery_rank,...,liveness,release,song_currency,song_currency_rank,song_hotttnesss,song_hotttnesss_rank,speechiness,tempo,temporal_echonest_features,valence
fma_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,AWOL,[Hip-Hop],874,Food,Hip-Hop,0.416675,,,0.38899,,...,0.177647,AWOL - A Way Of Life,0.0,,0.0,,0.15931,165.922,"[0.87723326683, 0.588911116123, 0.354243010283...",0.576661
3,AWOL,[Hip-Hop],453,Electric Ave,Hip-Hop,0.374408,,,0.38899,,...,0.10588,AWOL - A Way Of Life,0.0,,0.0,,0.461818,126.957,"[0.534429132938, 0.537414252758, 0.44329947233...",0.26924
5,AWOL,[Hip-Hop],1108,This World,Hip-Hop,0.043567,,,0.38899,,...,0.373143,AWOL - A Way Of Life,0.0,,0.0,,0.124595,100.26,"[0.548092544079, 0.720191776752, 0.38925707340...",0.621661
10,Kurt Vile,[Pop],42936,Freeway,Pop,0.95167,2008-03-11,Constant Hitmaker,0.557339,2635.0,...,0.115474,Constant Hitmaker,0.005158,115691.0,0.354516,67609.0,0.032985,111.562,"[0.311404168606, 0.711402356625, 0.32191380858...",0.96359
134,AWOL,[Hip-Hop],880,Street Music,Hip-Hop,0.452217,,,0.38899,,...,0.096567,AWOL - A Way Of Life,0.0,,0.0,,0.525519,114.29,"[0.610849261284, 0.569169461727, 0.42849382758...",0.894072


In [10]:
# Discard the songs whose title begins with 'Untitled'.
is_untitled = df['title'].str.startswith('Untitled')
df = df[~is_untitled]
print('Number of songs: {}'.format(len(df)))

Number of songs: 14511


In [11]:
def filter_bad_genres(df, bad_genres=['Avant-Garde', 
                                      'Experimental', 
                                      'Noise', 'Ambient', 'Garage', 'Sound Collage', 'Singer',
                                      'Audio Collage', 'Glitch', 'Unclassifiable', 'Lo-Fi',
                                      'Spoken', 'Poetry', 'Talk Radio', 'Field Recording']):
    """Drop the songs tagged with at least one unwanted genre.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'genres' column whose cells are iterables of genre titles
        (strings).
    bad_genres : iterable of str
        Genre prefixes to reject. A song is removed as soon as one of its
        genre titles starts with one of these prefixes (so 'Experimental'
        also rejects 'Experimental Pop'). The default list is only read,
        never modified.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose genres all pass the filter; songs with an empty
        genre list are kept.
    """
    # str.startswith accepts a tuple of prefixes, which replaces the inner loop.
    prefixes = tuple(bad_genres)

    def impl(genres):
        return not any(g.startswith(prefixes) for g in genres)

    # Force a boolean mask: on an empty frame apply() yields an empty
    # object-dtype Series, which pandas would interpret as a list of column
    # labels (dropping every column) instead of a row filter.
    keep = df['genres'].apply(impl).astype(bool)
    return df[keep]

# Keep the top n genres.
nb_genres = 20
df = filter_bad_genres(df)

# Restrict the dataset to the nb_genres most represented top genres,
# reporting the sizes before and after.
print('Number of songs: {}'.format(len(df)))
print('Number of genres: {}'.format(len(df['top_genre'].unique())))
top_genres = df['top_genre'].value_counts()
kept_genres = top_genres.index[:nb_genres]
df = df[df['top_genre'].isin(kept_genres)]
print('Number of genres: {}'.format(len(df['top_genre'].unique())))
print('Number of songs: {}'.format(len(df)))

df['top_genre'].value_counts()

Number of songs: 14511
Number of genres: 20
Number of genres: 20
Number of songs: 14511


Electronic             2962
Rock                   2385
Pop                    1636
Hip-Hop                1059
Folk                   1025
Punk                    882
Indie-Rock              601
Jazz                    588
Old-Time / Historic     460
Psych-Rock              456
International           431
Classical               408
Chiptune                254
Blues                   245
Psych-Folk              245
Post-Punk               222
Metal                   191
Soundtrack              172
Trip-Hop                171
Post-Rock               118
Name: top_genre, dtype: int64

In [12]:
#top_genres = ['Electronic', 'Rock', 'Pop', 'Folk', 'Punk', 'Hip-Hop', 'Soundtrack',
#              'Blues', 'Jazz', 'Indie-Rock', 'International', 'Classical', 
#              'Psych-Rock', 'Psych-Folk', 'Old-Time / Historic', 'Post-Rock', 'Metal', 'Chiptune', 'Post-Punk',
#              'Trip-Hop']
#
#top_genres = ['Electronic', 'Rock', 'Pop', 'Hip-Hop', 'Folk', 'Punk', 'Jazz', 'Indie-Rock', 'Psych-Rock',
#              'Old-Time / Historic', 'International', 'Classical', 'Chip Music', 'Country',
#              'Electroacoustic', 'Reggae - Dub', 'Chiptune', 'Blues', 'Psych-Folk', 'Soul-RnB']

In [13]:
def split_train_test(df, train_ratio=0.8):
    """Split between training and testing set. An artist only appears on one side.

    For every top genre the artists are visited from the most to the least
    represented, and each artist (with all of their songs in that genre) is
    sent to whichever side is currently furthest, relatively, from its target
    size. Ties go to the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the 'top_genre' and 'artist' columns. It is modified in place:
        a 'train' column is written (True for training, False for testing).
    train_ratio : float, optional
        Targeted fraction of training songs per genre (default 0.8).

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the 'train' column filled in.
    """
    for genre in df['top_genre'].unique():
        in_genre = df['top_genre'] == genre
        df_genre = df[in_genre]
        train_target = int(len(df_genre) * train_ratio)
        test_target = len(df_genre) - train_target
        train_current, test_current = 0, 0
        artists = df_genre['artist'].value_counts()
        for artist, count in zip(artists.index, artists.values):
            # Compare the remaining fractions
            #   (train_target - train_current) / train_target
            #   (test_target - test_current) / test_target
            # in cross-multiplied form: same ordering when both targets are
            # positive, but no ZeroDivisionError when one target is 0 (e.g. a
            # genre with a single song).
            train_missing = (train_target - train_current) * test_target
            test_missing = (test_target - test_current) * train_target
            train = bool(train_missing >= test_missing)
            if train:
                train_current += int(count)
            else:
                test_current += int(count)
            # .loc replaces the .ix indexer, which no longer exists in pandas.
            df.loc[in_genre & (df['artist'] == artist), 'train'] = train
    return df

def show_split(df):
    """Print the train/test balance for every top genre and for the whole set.

    Each printed record holds the number of training songs, the fraction of
    songs they represent, and a label (the genre, or 'Overall' for the last
    record). Reads the 'top_genre' and 'train' columns of `df`.
    """
    def report(frame, label):
        ntrain = sum(frame['train'] == True)
        print(ntrain, ntrain * 1.0 / len(frame), label)

    for genre in df.top_genre.unique():
        report(df[df['top_genre'] == genre], genre)

    report(df, 'Overall')

# Flag every song as train/test in a 'train' column (an artist stays on one
# side), then print the resulting per-genre and overall training ratios.
df = split_train_test(df)
show_split(df)

(847, 0.79981114258734654, u'Hip-Hop')
(1308, 0.79951100244498774, u'Pop')
(820, 0.80000000000000004, u'Folk')
(470, 0.79931972789115646, u'Jazz')
(94, 0.79661016949152541, u'Post-Rock')
(705, 0.79931972789115646, u'Punk')
(2369, 0.79979743416610394, u'Electronic')
(1908, 0.80000000000000004, u'Rock')
(152, 0.79581151832460728, u'Metal')
(177, 0.79729729729729726, u'Post-Punk')
(196, 0.80000000000000004, u'Blues')
(344, 0.79814385150812062, u'International')
(364, 0.79824561403508776, u'Psych-Rock')
(480, 0.79866888519134771, u'Indie-Rock')
(196, 0.80000000000000004, u'Psych-Folk')
(326, 0.7990196078431373, u'Classical')
(368, 0.80000000000000004, u'Old-Time / Historic')
(136, 0.79532163742690054, u'Trip-Hop')
(137, 0.79651162790697672, u'Soundtrack')
(203, 0.79921259842519687, u'Chiptune')
(11600, 0.79939356350354906, 'Overall')


In [14]:
def create_dataset(df, path, clip_dir=None):
    """Copy the clip of every song of `df` into one folder per genre.

    The clip of a song is read from '<clip_dir>/clip_<index>.mp3' and copied
    to '<path>/<genre>/<index>.mp3', where <index> is the row label of the
    song and <genre> its 'top_genre'.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per song, with a 'top_genre' column.
    path : str
        Root folder of the dataset. It is created, along with the genre
        sub-folders, when missing.
    clip_dir : str, optional
        Folder holding the source clips. Defaults to the notebook-level
        CLIP_DIR.

    Raises
    ------
    IOError / OSError
        If a source clip is missing or a file cannot be written (propagated
        from shutil.copyfile / os.makedirs).
    """
    from shutil import copyfile

    if clip_dir is None:
        clip_dir = CLIP_DIR

    # Folders already checked during this call, so the file system is queried
    # once per genre rather than once per song.
    ready_folders = set()

    for index, row in df.iterrows():
        genre = row['top_genre']
        # Genres starting with 'Old-Time' are stored under a plain 'Old-Time'
        # folder (the full title 'Old-Time / Historic' contains a '/').
        if genre.startswith('Old-Time'):
            genre = 'Old-Time'

        dst_folder = os.path.join(path, genre)
        if dst_folder not in ready_folders:
            # makedirs (not mkdir) so that a missing `path` is created too.
            if not os.path.isdir(dst_folder):
                os.makedirs(dst_folder)
            ready_folders.add(dst_folder)

        src = os.path.join(clip_dir, 'clip_{}.mp3'.format(index))
        dst = os.path.join(dst_folder, '{}.mp3'.format(index))
        copyfile(src, dst)

# Export the medium dataset: metadata as JSON plus the clips sorted by genre.
path = os.path.join(DATA_DIR, 'datasets_mdeff', 'fma_medium')
# The JSON file is written before any genre folder is created, so the dataset
# folder itself has to exist first.
if not os.path.isdir(path):
    os.makedirs(path)
df.to_json(os.path.join(path, 'fma_medium.json'))
create_dataset(df, path)

## 3 Small: 10 balanced genres

In [15]:
nb_genres = 10

# Narrow the dataset down to the nb_genres most represented top genres,
# reporting the sizes before and after.
print('Number of songs: {}'.format(len(df)))
print('Number of genres: {}'.format(len(df['top_genre'].unique())))
top_genres = df['top_genre'].value_counts()
kept_genres = top_genres.index[:nb_genres]
df = df[df['top_genre'].isin(kept_genres)]
print('Number of genres: {}'.format(len(df['top_genre'].unique())))
print('Number of songs: {}'.format(len(df)))

df['top_genre'].value_counts()

Number of songs: 14511
Number of genres: 20
Number of genres: 10
Number of songs: 12054


Electronic             2962
Rock                   2385
Pop                    1636
Hip-Hop                1059
Folk                   1025
Punk                    882
Indie-Rock              601
Jazz                    588
Old-Time / Historic     460
Psych-Rock              456
Name: top_genre, dtype: int64

In [16]:
# Balance the genres: draw the same number of songs from each of them, with a
# fixed seed so the draw is repeatable.
nb_samples = 400
sampled = df.groupby('top_genre').apply(
    lambda genre_df: genre_df.sample(n=nb_samples, random_state=42))
# groupby/apply prepends the genre as an extra index level; drop it again.
df = sampled.reset_index(level=0, drop=True)

print('Dataset: {}'.format(df.shape))
#df.head()

Dataset: (4000, 30)


In [17]:
# Recompute the train/test flag on the sampled subset (the 'train' column is
# overwritten), then print the per-genre and overall training ratios.
df = split_train_test(df)
show_split(df)

(320, 0.80000000000000004, u'Electronic')
(320, 0.80000000000000004, u'Folk')
(320, 0.80000000000000004, u'Hip-Hop')
(320, 0.80000000000000004, u'Indie-Rock')
(320, 0.80000000000000004, u'Jazz')
(320, 0.80000000000000004, u'Old-Time / Historic')
(320, 0.80000000000000004, u'Pop')
(320, 0.80000000000000004, u'Psych-Rock')
(320, 0.80000000000000004, u'Punk')
(320, 0.80000000000000004, u'Rock')
(3200, 0.80000000000000004, 'Overall')


In [18]:
# Export the small dataset: metadata as JSON plus the clips sorted by genre.
path = os.path.join(DATA_DIR, 'datasets_mdeff', 'fma_small')
# The JSON file is written before any genre folder is created, so the dataset
# folder itself has to exist first.
if not os.path.isdir(path):
    os.makedirs(path)
df.to_json(os.path.join(path, 'fma_small.json'))
create_dataset(df, path)