# [CS3244 Project on music genre classification](https://github.com/markusyeo/fma)

Markus Yeo, Sunny Low

## Creation

From `raw_*.csv`, this notebook generates:
* `tracks.csv`: per-track, album and artist metadata, filtered to fma_small.
* `genres.csv`: genre hierarchy.

In [None]:
import os
import ast
import pickle

import IPython.display as ipd
import numpy as np
import pandas as pd

from dotenv import dotenv_values 
import utils
import creation

In [None]:
# Resolve the dataset locations used by the rest of the notebook.
# AUDIO_DIR is read from the values returned by dotenv_values(); the lookup
# fails with KeyError when no 'AUDIO_DIR' entry is present.
AUDIO_DIR = dotenv_values()['AUDIO_DIR']
# Absolute form of os.path.dirname(AUDIO_DIR).
BASE_DIR = os.path.abspath(os.path.dirname(AUDIO_DIR))
# Location of the fma_small audio tree, next to AUDIO_DIR's dirname.
FMA_SMALL = os.path.join(BASE_DIR, 'fma_small')

In [None]:
AUDIO_DIR

In [None]:
# Collect the integer track id of every MP3 found anywhere under AUDIO_DIR.
# File names are expected to look like '<track_id>.mp3'. Matching on the
# suffix (rather than a substring test) keeps names such as 'x.mp3.part'
# from being sliced incorrectly and handed to int().
path = os.walk(AUDIO_DIR)
filelist = []
for root, dirs, files in path:
    for file in files:
        if file.endswith('.mp3'):
            # Drop the 4-character '.mp3' extension; the rest is the numeric id.
            filelist.append(int(file[:-4]))
filelist = tuple(filelist)

In [None]:
filelist[:5]

In [None]:
# ./creation.py metadata
# ./creation.py data /path/to/fma/fma_full
# ./creation.py clips /path/to/fma

#!cat creation.py

In [None]:
# Load the four raw metadata tables; the first CSV column becomes the index
# of each frame (index_col=0).
# converters={'genres': ast.literal_eval}
tracks = pd.read_csv('./data/fma_metadata/raw_tracks.csv', index_col=0)
albums = pd.read_csv('./data/fma_metadata/raw_albums.csv', index_col=0)
artists = pd.read_csv('./data/fma_metadata/raw_artists.csv', index_col=0)
genres = pd.read_csv('./data/fma_metadata/raw_genres.csv', index_col=0)

# not_found = pickle.load(open('not_found.pickle', 'rb'))

In [None]:
# Ids of clips to exclude (the same ids are listed as FAILED, "too short a
# clip", in the data-cleaning section of this notebook).
too_short = {98565, 98567, 98569, 99134, 108925, 133297}
# Single filtering pass: no repeated list/tuple conversion, and no ValueError
# when one of the ids is not present in the audio directory.
filelist = tuple(tid for tid in filelist if tid not in too_short)
# Restrict the raw track table to the tracks whose audio was found on disk.
# .loc raises KeyError if an id has no row in the raw track table.
tracks = tracks.loc[list(filelist)].sort_index()

In [None]:
tracks.head()

In [None]:
tracks.columns

In [None]:
def get_fs_tids(audio_dir):
    """Return the track ids of the MP3 files stored under ``audio_dir``.

    Only leaf directories (directories without sub-directories) are
    inspected. Each MP3 is expected to be named ``<track_id>.mp3``; files
    with any other extension are ignored instead of crashing the integer
    conversion.

    Args:
        audio_dir: Root directory to walk.

    Returns:
        A list of integer track ids, in ``os.walk`` order.

    Raises:
        ValueError: If an ``.mp3`` file name is not an integer id.
    """
    tids = []
    for _, dirnames, files in os.walk(audio_dir):
        if dirnames:
            # Not a leaf directory: files stored next to sub-directories are skipped.
            continue
        tids.extend(int(name[:-4]) for name in files if name.endswith('.mp3'))
    return tids

# Both listings scan the same FMA_SMALL tree, so the "audio" and "clips"
# figures printed in the next cell are computed from identical id lists.
audio_tids = get_fs_tids(FMA_SMALL)
clips_tids = get_fs_tids(FMA_SMALL)

In [None]:
# Summary of what was collected: row counts of each raw table, and how many
# distinct album / artist ids the track table refers to.
print('tracks: {} collected ({} max id)'.format(
    len(tracks), tracks.index.max()))
print('albums: {} collected ({} in tracks)'.format(
    len(albums), len(tracks['album_id'].unique())))
print('artists: {} collected ({} in tracks)'.format(
    len(artists), len(tracks['artist_id'].unique())))
print('genres: {} collected'.format(len(genres)))

# Number of ids found on disk, and how many of them have no row in `tracks`.
print('audio: {} collected ({} not in tracks)'.format(
    len(audio_tids), len(set(audio_tids).difference(tracks.index))))
print('clips: {} collected ({} not in tracks)'.format(
    len(clips_tids), len(set(clips_tids).difference(tracks.index))))
# Consistency checks, currently disabled.
# assert sum(tracks.index.isin(audio_tids)) == len(tracks)
# assert sum(tracks.index.isin(clips_tids)) == sum(tracks.index.isin(audio_tids))
# assert len(clips_tids) == len(tracks)

In [None]:
# Show the first N rows of each raw table.
N = 5
ipd.display(tracks.head(N))
ipd.display(albums.head(N))
ipd.display(artists.head(N))
ipd.display(genres.head(N))

## 2 Format metadata

Todo:
* Sanitize values, e.g. list of words for tags, valid links in `artist_wikipedia_page`, remove html markup in free-form text.
    * Clean tags. E.g. some tags are just artist names.
* Fill metadata about encoding: length, number of samples, sample rate, bit rate, channels (mono/stereo), 16bits?.
* Update duration from audio
    * 2624 is marked as 05:05:50 (18350s) although it is reported as 00:21:15.15 by ffmpeg.
    * 112067: 3714s --> 01:59:55.06, 112808: 3718s --> 01:59:59.56
    * ffmpeg: Estimating duration from bitrate, this may be inaccurate
    * Solution, decode the complete mp3: `ffmpeg -i input.mp3 -f null -`

In [None]:
# Quick look at one column: number of null / non-null entries, then the ten
# most frequent values (displayed as the cell output).
df, column = tracks, 'tags'
null = sum(df[column].isnull())
print('{} null, {} non-null'.format(null, df.shape[0] - null))
df[column].value_counts().head(10)

### 2.1 Tracks

In [None]:
#Helper functions
def convert_datetime(df, column, format=None):
    """Parse ``df[column]`` into datetimes, modifying ``df`` in place.

    Args:
        df: DataFrame holding the column to convert.
        column: Label of the column to parse.
        format: Optional ``strftime``-style format passed to
            ``pd.to_datetime``; when ``None`` the format is left to pandas.

    Returns:
        None. The converted column is written back to ``df``.
    """
    # `infer_datetime_format` is deprecated in pandas 2.x (format inference
    # is the default there), so it is no longer passed.
    df[column] = pd.to_datetime(df[column], format=format)

def convert_genres(genres):
    """Turn the textual genre list of a track into a list of integer ids.

    Args:
        genres: String holding a Python literal: a list of mappings that
            each have a ``'genre_id'`` entry.

    Returns:
        The ``genre_id`` values converted to ``int``, in their original order.
    """
    entries = ast.literal_eval(genres)
    ids = []
    for entry in entries:
        ids.append(int(entry['genre_id']))
    return ids

In [None]:
# Track-level columns that are removed before any further processing; the
# trailing comments give the reason for each group.
drops = [
    'artist_website', 'license_image_file', 'license_image_file_large', 
    'license_parent_id', 'license_title', 'license_url', 'track_copyright_p', 'track_copyright_c', 
    'track_file', 'track_image_file',  # used to download only
    'track_url', 'album_url', 'artist_url',  # only relevant on website
    'track_composer', 'track_lyricist', 'track_publisher',  # present for ~4000, <1000 and <2000 tracks
    'track_disc_number',  # different from 1 for <1000 tracks
    'track_explicit', 'track_explicit_notes',  # present for <4000 tracks
    'track_instrumental'  # ~6000 tracks have a 1, there is an instrumental genre
]
tracks.drop(drops, axis=1, inplace=True)
# Prefix the generic 'tags' column so it stays distinguishable from the album
# and artist tags after the merges below.
tracks.rename(columns={'tags': 'track_tags'}, inplace=True)

In [None]:
# Normalise the track columns: durations via creation.convert_duration,
# dates to datetimes, ids / bit rates to integers, genres to lists of ids.
tracks['track_duration'] = tracks['track_duration'].map(creation.convert_duration)
convert_datetime(tracks, 'track_date_created')
convert_datetime(tracks, 'track_date_recorded')
# Missing values become -1 so the columns can be cast to int. The filled
# column is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment and is not guaranteed to modify
# `tracks` (it no longer does under pandas Copy-on-Write).
tracks['album_id'] = tracks['album_id'].fillna(-1)
tracks['track_bit_rate'] = tracks['track_bit_rate'].fillna(-1)
tracks = tracks.astype({'album_id': int, 'track_bit_rate': int})
# A missing genre list is treated as the empty list literal before parsing.
tracks['track_genres'] = tracks['track_genres'].fillna('[]').map(convert_genres)

In [None]:
tracks.columns

### 2.2 Albums

In [None]:
albums.columns

In [None]:
# Album columns removed before merging into `tracks`; reasons are given in
# the trailing comments.
drop = [
    'artist_name', 'album_url', 'artist_url',  # in tracks already (though it can be different)
    'album_handle',
    'album_image_file', 'album_images',  # todo: shall be downloaded
    #'album_producer', 'album_engineer',  # present for ~2400 albums only
]
albums.drop(drop, axis=1, inplace=True)
# Prefix the generic 'tags' column with the table name.
albums.rename(columns={'tags': 'album_tags'}, inplace=True)
# Parse the two album date columns in place.
convert_datetime(albums, 'album_date_created')
convert_datetime(albums, 'album_date_released')

In [None]:
albums.columns

### 2.3 Artists

In [None]:
artists.columns

In [None]:
# Artist columns removed before merging into `tracks`; the trailing comments
# give the approximate number of populated rows and the reason for dropping.
drop = [
    'artist_website', 'artist_url',  # in tracks already (though it can be different)
    'artist_handle',
    'artist_image_file', 'artist_images',  # todo: shall be downloaded
    'artist_donation_url', 'artist_paypal_name', 'artist_flattr_name',  # ~1600 & ~400 & ~70, not relevant
    'artist_contact',  # ~1500, not very useful data
    'artist_active_year_begin', 'artist_active_year_end',  # ~1400, ~500 only
    'artist_associated_labels',  # ~1000
    'artist_related_projects',  # only ~800, but can be combined with bio
]
artists.drop(drop, axis=1, inplace=True)
# Prefix the generic 'tags' column with the table name.
artists.rename(columns={'tags': 'artist_tags'}, inplace=True)

In [None]:
# convert_datetime(artists, 'artist_date_created')
# for column in ['artist_active_year_begin', 'artist_active_year_end']:
#     artists[column].replace(0.0, np.nan, inplace=True)
#     convert_datetime(artists, column, format='%Y.0')

In [None]:
artists.columns

### 2.4 Merge DataFrames

In [None]:
# Left-join the album table onto the tracks via album_id, so every track row
# is kept. Columns present in both frames keep the track value under the
# original name; the album copy gets the '_dup' suffix.
tracks = tracks.merge(albums, left_on='album_id', right_index=True, sort=False, how='left', suffixes=('', '_dup'))

# Tracks whose album copy of the title is null, reported as tracks without
# extended album information.
n = tracks['album_title_dup'].isnull().sum()
print('{} tracks without extended album information ({} tracks without album_id)'.format(n, sum(tracks['album_id'] == -1)))
# assert sum(tracks['album_id'].isin(not_found['albums'])) == n
# assert sum(tracks['album_title'] != tracks['album_title_dup']) == n

# The duplicated title was only needed for the count above.
tracks.drop('album_title_dup', axis=1, inplace=True)
# assert not any('dup' in col for col in tracks.columns)

In [None]:
# Album artist can be different than track artist. Keep track artist.
#tracks[tracks['artist_name'] != tracks['artist_name_dup']].select(lambda x: 'artist_name' in x, axis=1)

In [None]:
# Left-join the artist table onto the tracks via artist_id, so every track
# row is kept. Columns present in both frames keep the track value; the
# artist copy gets the '_dup' suffix.
tracks = tracks.merge(artists, left_on='artist_id', right_index=True, sort=False, how='left', suffixes=('', '_dup'))

# Tracks whose artist copy of the name is null, reported as tracks without
# extended artist information.
n = tracks['artist_name_dup'].isnull().sum()
print('{} tracks without extended artist information'.format(n))
# assert sum(tracks['artist_id'].isin(not_found['artists'])) == n
# The track-side and artist-side names may only differ on those n rows.
assert sum(tracks['artist_name'] != tracks[('artist_name_dup')]) == n

# The duplicated name was only needed for the checks above.
tracks.drop('artist_name_dup', axis=1, inplace=True)
# assert not any('dup' in col for col in tracks.columns)

In [None]:
# Turn the flat '<prefix>_<rest>' column names into two-level
# (prefix, rest) labels, splitting at the first underscore only; a name
# without an underscore gets an empty second level.
columns = [
    (prefix, rest)
    for prefix, _, rest in (name.partition('_') for name in tracks.columns)
]
tracks.columns = pd.MultiIndex.from_tuples(columns)
# assert all(label in ['track', 'album', 'artist'] for label in tracks.columns.get_level_values(0))

In [None]:
# Todo: fill other columns ?
# Rows without a matching album / artist have nulls in the merged columns.
# The filled columns are assigned back explicitly: fillna(inplace=True) on a
# column selection is chained assignment and is not guaranteed to modify
# `tracks` (it no longer does under pandas Copy-on-Write).
tracks['album', 'tags'] = tracks['album', 'tags'].fillna('[]')
tracks['artist', 'tags'] = tracks['artist', 'tags'].fillna('[]')

# Count-like columns: nulls become -1 so the columns can be cast to int.
columns = [('album', 'favorites'), ('album', 'comments'), ('album', 'listens'), ('album', 'tracks'),
           ('artist', 'favorites'), ('artist', 'comments')]
for column in columns:
    tracks[column] = tracks[column].fillna(-1)
columns = {column: int for column in columns}
tracks = tracks.astype(columns)

## 3 Data cleaning

Todo: duplicates (metadata and audio)

In [None]:
def keep(index, df):
    """Select the rows of ``df`` labelled by ``index`` and report the change.

    Prints how many rows were lost and how many are left, then returns the
    selection; ``df`` itself is not modified.

    Args:
        index: Row labels to keep (anything accepted by ``df.loc``).
        df: DataFrame to select from.

    Returns:
        ``df.loc[index]``.
    """
    kept = df.loc[index]
    before, after = len(df), len(kept)
    print('{} lost, {} left'.format(before - after, after))
    return kept

# Selecting with the frame's own index: prints the current number of tracks.
tracks = keep(tracks.index, tracks)

Errors from the `features.py` script.
* IndexError('index 0 is out of bounds for axis 0 with size 0',)
    * ffmpeg: Header missing
    * ffmpeg: Could not find codec parameters for stream 0 (Audio: mp3, 0 channels, s16p): unspecified frame size. Consider increasing the value for the 'analyzeduration' and 'probesize' options
    * tids: 117759
* NoBackendError()
    * ffmpeg: Format mp3 detected only with low score of 1, misdetection possible!
    * tids: 80015, 115235
* UserWarning('Trying to estimate tuning from empty frequency set.',)
    * librosa error
    * tids: 1440, 26436, 38903, 57603, 62095, 62954, 62956, 62957, 62959, 62971, 86079, 96426, 104623, 106719, 109714, 114501, 114528, 118003, 118004, 127827, 130298, 130296, 131076, 135804, 154923
* ParameterError('Filter pass-band lies beyond Nyquist',)
    * librosa error
    * tids: 152204, 28106, 29166, 29167, 29169, 29168, 29170, 29171, 29172, 29173, 29179, 43903, 56757, 59361, 75461, 92346, 92345, 92347, 92349, 92350, 92351, 92353, 92348, 92352, 92354, 92355, 92356, 92358, 92359, 92361, 92360, 114448, 136486, 144769, 144770, 144771, 144773, 144774, 144775, 144778, 144776, 144777

In [None]:
# Too short a clip, known issue in wiki page
FAILED = [98565, 98567, 98569, 99134, 108925, 133297]
# Keep every track id that is not listed in FAILED.
tracks = keep(tracks.index.difference(FAILED), tracks)

In [None]:
sum(tracks['track', 'title'].duplicated())

## 4 Genres

In [None]:
# Drop the two genre columns that are not used below and shorten the
# remaining column names.
genres.drop(['genre_handle', 'genre_color'], axis=1, inplace=True)
genres.rename(columns={'genre_parent_id': 'parent', 'genre_title': 'title'}, inplace=True)

In [None]:
# A missing parent is encoded as 0, which get_parent() treats as "this is a
# root genre". Assigned back explicitly: fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to modify `genres`
# (it no longer does under pandas Copy-on-Write).
genres['parent'] = genres['parent'].fillna(0)
genres = genres.astype({'parent': int})

In [None]:
# Manual repairs of the genre hierarchy: each assignment below overwrites the
# 'parent' of one genre id (0 marks a root genre).

# 13 (Easy Listening) has parent 126 which is missing
# --> a root genre on the website, although not in the genre menu
genres.at[13, 'parent'] = 0

# 580 (Abstract Hip-Hop) has parent 1172 which is missing
# --> listed as child of Hip-Hop on the website
genres.at[580, 'parent'] = 21

# 810 (Nu-Jazz) has parent 51 which is missing
# --> listed as child of Easy Listening on website
genres.at[810, 'parent'] = 13

# 763 (Holiday) has parent 763 which is itself
# --> listed as child of Sound Effects on website
genres.at[763, 'parent'] = 16

# Todo: should novelty be under Experimental? It is alone on website.

In [None]:
# Genre 806 (hiphop) should not exist. Replace it by 21 (Hip-Hop).
print('{} tracks have genre 806'.format(
    sum(tracks['track', 'genres'].map(lambda genres: 806 in genres))))
def change_genre(genres):
    # Return a copy of the id list in which every 806 is replaced by 21.
    # (The parameter shadows the global `genres` table inside this function.)
    return [genre if genre != 806 else 21 for genre in genres]
tracks['track', 'genres'] = tracks['track', 'genres'].map(change_genre)
# Remove the row for genre 806 from the genre table.
genres.drop(806, inplace=True)

In [None]:
def get_parent(genre, track_all_genres=None):
    """Return the root genre reached by following 'parent' links from ``genre``.

    The walk reads the global ``genres`` table and stops at the first genre
    whose ``'parent'`` is 0.

    Args:
        genre: Genre id to start from; must be a label of ``genres``.
        track_all_genres: Optional list; when given, every genre visited on
            the way (``genre`` itself and the root included) is appended to it.

    Returns:
        The id of the root genre.

    Raises:
        KeyError: If a visited id is not in ``genres``.
        ValueError: If the parent links form a cycle (for example a genre
            that is its own parent), instead of recursing until Python's
            recursion limit is hit.
    """
    visited = set()
    while True:
        if genre in visited:
            raise ValueError(
                'cycle in genre hierarchy at genre {}'.format(genre))
        visited.add(genre)
        parent = genres.at[genre, 'parent']
        if track_all_genres is not None:
            track_all_genres.append(genre)
        if parent == 0:
            return genre
        genre = parent

def get_all_genres(track_genres):
    """Return every genre met when walking from each given genre up to its root.

    Args:
        track_genres: Iterable of genre ids attached to one track.

    Returns:
        A list of the distinct genre ids visited (the given genres and all
        of their ancestors), in no particular order.
    """
    visited = []
    for leaf in track_genres:
        # get_parent() appends every genre on the path to `visited`.
        get_parent(leaf, visited)
    return list(set(visited))

tracks['track', 'genres_all'] = tracks['track', 'genres'].map(get_all_genres)

In [None]:
# Number of tracks per genre.
def count_genres(subset=None):
    """Count, for every genre in ``genres``, the tracks that carry it.

    A track counts towards each id in its ``('track', 'genres_all')`` list.

    Args:
        subset: Track labels to count over. Defaults to all current rows of
            ``tracks``; the default is resolved at call time so that it
            follows later filtering of ``tracks`` rather than the index that
            existed when the function was defined.

    Returns:
        A Series indexed like ``genres`` holding the per-genre track counts.
    """
    if subset is None:
        subset = tracks.index
    count = pd.Series(0, index=genres.index)
    for _, track_all_genres in tracks.loc[subset, ('track', 'genres_all')].items():
        for genre in track_all_genres:
            count[genre] += 1
    return count

genres['#tracks'] = count_genres()
# Cell output: genres that no track carries.
genres[genres['#tracks'] == 0]

In [None]:
def get_top_genre(track_genres):
    """Return the title of the single top-level genre of a track, if any.

    Each genre id is mapped to the title of its ``'top_level'`` genre in the
    global ``genres`` table.

    Args:
        track_genres: Iterable of genre ids attached to one track.

    Returns:
        The top-level title when all ids share exactly one, otherwise
        ``np.nan`` (no genres, or several distinct top-level genres).
    """
    titles = {
        genres.at[genres.at[genre, 'top_level'], 'title']
        for genre in track_genres
    }
    if len(titles) != 1:
        return np.nan
    return titles.pop()

# Top-level genre.
genres['top_level'] = genres.index.map(get_parent)
tracks['track', 'genre_top'] = tracks['track', 'genres'].map(get_top_genre)

In [None]:
genres.head(10)

## 5 Small Dataset

Main characteristic: genre balanced.

Choices:
* 8 genres with 1000 tracks --> 8,000 tracks

In [None]:
# The small subset is built from all tracks that are left at this point;
# only its index is used in the next cell.
fma_small = pd.DataFrame(tracks)

In [None]:
# Add an ordered categorical ('set', 'subset') column, initially all missing,
# then mark every track of fma_small as belonging to the 'small' subset.
SUBSETS = ['small']
# The dtype is passed at construction (as done for ('set', 'split') further
# down) instead of building a dtype-less empty Series and casting it.
tracks['set', 'subset'] = pd.Series(dtype=pd.CategoricalDtype(categories=SUBSETS, ordered=True))
tracks.loc[fma_small.index, ('set', 'subset')] = 'small'

## 6 Splits: training, validation, test

Take into account:
* Artists may only appear on one side.
* Stratification: ideally, all characteristics (#tracks per artist, duration, sampling rate, information, bio) and targets (genres, tags) should be equally distributed.

In [None]:
# NOTE(review): `training` and the ('set', 'split') column are only created in
# later cells of this section, so this inspection cell appears to work only
# after those cells have been run — confirm the intended execution order.
tracks.loc[training.index, ('set', 'split')]

In [None]:
# Alternative 80/10/10 split using fast_ml, with ('track', 'genre_top') passed as target.
# NOTE(review): `temp_tracks` is only assigned inside the split loop further
# down, so this cell depends on that loop having run first — confirm.
from fast_ml.model_development import train_valid_test_split
training, _, validation, _, test, _ = train_valid_test_split(temp_tracks, train_size=0.8, valid_size=0.1, test_size=0.1, target=('track','genre_top'))

In [None]:
# NOTE(review): `splits` is only assigned inside the split loop further down;
# this displays whatever that loop computed last.
splits

In [None]:
# Shuffle temp_tracks (fixed seed) and cut it after splits[0] and
# splits[0] + splits[1] rows; the result is only displayed, not stored.
# NOTE(review): `temp_tracks` and `splits` come from the split loop further down.
np.split(temp_tracks.sample(frac=1, random_state=42), [splits[0], splits[1]+splits[0]])


In [None]:
def get_genre_id(genre):
    """Return the id (index label) of the first genre whose title equals ``genre``.

    Looks the title up in the global ``genres`` table; raises ``IndexError``
    when no row matches.
    """
    title_matches = genres['title'] == genre
    return genres.loc[title_matches].index[0]

# Add one boolean ('genre', <title>) column per top-level genre: True when the
# genre id is part of the track's genres_all list.
# NOTE(review): the loop runs over the genre_top value of every track, so each
# column is recomputed once per track carrying that genre; iterating over the
# unique values would presumably build the same columns — confirm.
for genre in [get_genre_id(g) for g in tracks[('track', 'genre_top')]]:
    tracks['genre', genres.at[genre, 'title']] = tracks['track', 'genres_all'].map(lambda genres: genre in genres)
    
# Split names and their target shares, matched by position
# (training 0.8, test 0.1, validation 0.1).
SPLITS = ('training', 'test', 'validation')
PERCENTAGES = (0.8, 0.1, 0.1)
# Empty categorical column; every track starts without a split.
tracks['set', 'split'] = pd.Series(dtype = pd.CategoricalDtype(categories=SPLITS))

# Top-level genres that still have to be processed.
genre_list = list(tracks[('track', 'genre_top')].unique())

# Process one genre per iteration until none is left.
while True:
    if len(genre_list) == 0:
        break

    # Choose most constrained genre, i.e. genre with the least unassigned artists.
    # NOTE(review): tracks_unsplit is a boolean mask, but only its index is used
    # below, so the intersection selects every track rather than the unassigned
    # ones — confirm whether the mask was meant to be applied.
    tracks_unsplit = tracks['set', 'split'].isnull()
    # Boolean genre columns, indexed by (track index, artist id).
    temp = tracks.loc[tracks.index.intersection(tracks_unsplit.index, sort=False)].set_index(('artist', 'id'), append=True)['genre']
    # Per genre: number of artists (index level 1) with at least one track in it.
    count = temp.groupby(level=1).sum().astype(bool).sum()
    genre = count[genre_list].idxmin()
    if genre in genre_list:
        genre_list.remove(genre)
    
    # Given genre, select artists.
    tracks_genre = tracks[tracks[('genre', genre)] == True]
    
    # Number of tracks of this genre per artist id.
    # NOTE(review): this rebinds the name `artists`, which earlier in the
    # notebook held the artist table; level='track_id' assumes the track index
    # is named 'track_id' — confirm.
    artists = tracks.loc[temp.index.get_level_values(level='track_id').intersection(tracks_genre.index, sort=False), ('artist', 'id')].value_counts()
    #print('-->', genre, len(artists))

    # For each split, the count (np.count_nonzero over the matching track ids)
    # of this genre's tracks that already carry that split.
    current = {split: np.count_nonzero(tracks.index.intersection(tracks_genre.index, sort=False).intersection(tracks[tracks['set', 'split'] == split].index)) for split in SPLITS}
    # print(current)

    # Only artists with at least 10 songs in this genre are split per artist
    # into training, validation, and test sets.
    remainder = []  # never read in this cell
    # The loop variable `count` rebinds the per-genre `count` Series from above.
    for artist, count in artists.items():
        # This artist's tracks within the current genre.
        # NOTE(review): not restricted to unassigned tracks, so tracks assigned
        # in an earlier genre iteration can be re-assigned here — confirm.
        temp_tracks = tracks.loc[tracks.loc[tracks['artist', 'id'] == artist].index.intersection(tracks_genre.index, sort=False)]
        if len(temp_tracks) >= 10:
            # Shuffle with a fixed seed, then cut at 80% and 90%:
            # first part training, second validation, third test.
            training, validation, test = np.split(temp_tracks.sample(frac=1, random_state=42), [int(.8*len(temp_tracks)), int(.9*len(temp_tracks))])
            current['training'] += len(training)
            tracks.loc[training.index, ('set', 'split')] = 'training'
            current['test'] += len(test)
            tracks.loc[test.index, ('set', 'split')] = 'test'
            current['validation'] += len(validation)
            tracks.loc[validation.index, ('set', 'split')] = 'validation'
        else:
            continue
    # Tracks of this genre that still have no split.
    # NOTE(review): the mask is computed on all of `tracks` and applied to the
    # tracks_genre subset, relying on pandas to align it by index — confirm.
    temp_tracks = tracks_genre[tracks[('set','split')].isnull()]
    # Target size of each split for this genre, minus what is already assigned.
    splits = [len(tracks_genre)*percentage for percentage in PERCENTAGES]
    splits = [int(split) - current[SPLITS[i]] for i, split in enumerate(splits)]
    print(splits)
    # Cut points: the first splits[0] rows go to training, the next splits[2]
    # rows (SPLITS[2] is 'validation') to validation, the remainder to test.
    training, validation, test = np.split(temp_tracks.sample(frac=1, random_state=42), [splits[0], splits[2]+splits[0]])
    current['training'] += len(training)
    tracks.loc[training.index, ('set', 'split')] = 'training'
    current['test'] += len(test)
    tracks.loc[test.index, ('set', 'split')] = 'test'
    current['validation'] += len(validation)
    tracks.loc[validation.index, ('set', 'split')] = 'validation'
    
    print(current)
# Not needed any more.
tracks.drop('genre', axis=1, level=0, inplace=True)

In [None]:
tracks['set','split'].value_counts()

## 7 Store

In [None]:
# Switch the working directory so the files written below land in the
# metadata folder. NOTE: the relative './data/...' paths used earlier in the
# notebook resolve differently once this has run.
os.chdir('./data/fma_metadata')

In [None]:
# Write the two final tables to '<name>.csv' in the current working
# directory, with rows and columns sorted first (in place). The frames are
# referenced directly instead of being looked up with eval(), and the
# 'echonest' float-format branch is gone because it could never be taken
# for these two names.
for name, frame in (('tracks', tracks), ('genres', genres)):
    frame.sort_index(axis=0, inplace=True)
    frame.sort_index(axis=1, inplace=True)
    frame.to_csv(name + '.csv')

In [None]:
tracks.columns

In [None]:
# ./creation.py normalize /path/to/fma
# ./creation.py zips /path/to/fma

## 8 Description

In [None]:
# Reload the file that was just written through the project's own loader and
# show the resulting column dtypes.
import utils
tracks = utils.load('tracks.csv')
tracks.dtypes

In [None]:
# Show the first N rows of each top-level column group of the reloaded table.
N = 5
ipd.display(tracks['track'].head(N))
ipd.display(tracks['album'].head(N))
ipd.display(tracks['artist'].head(N))