In [1]:
"""
Tutorial for the Million Song Dataset

by Thierry Bertin-Mahieux (2011) Columbia University
   tb2332@columbia.edu
   Copyright 2011 T. Bertin-Mahieux, All Rights Reserved

This tutorial will walk you through a quick experiment
using the Million Song Dataset (MSD). We will actually be working
on the 10K songs subset for speed issues, but the code should
transpose seamlessly.

In this tutorial, we do simple metadata analysis. We look at
which artist has the most songs by iterating over the whole
dataset and using an SQLite database.

You need to have the MSD code downloaded from GITHUB.
See the MSD website for details:
http://labrosa.ee.columbia.edu/millionsong/

If you have any questions regarding the dataset or this tutorial,
please first take a look at the website. Send us an email
if you haven't found the answer.

Note: this tutorial is developed using Python 2.6
      on an Ubuntu machine. PDF created using 'pyreport'.
"""

# usual imports
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np
from tqdm import tqdm

In [2]:
import sys 
sys.path.append('/Users/alexkirnas/MusicRecommendationService/ML')

In [3]:
# path to the Million Song Dataset subset (uncompressed)
# CHANGE IT TO YOUR LOCAL CONFIGURATION
msd_subset_path='/Users/alexkirnas/MusicRecommendationService/ML'
msd_subset_data_path=os.path.join(msd_subset_path,'data')
msd_subset_addf_path=os.path.join(msd_subset_path,'AdditionalFiles')
assert os.path.isdir(msd_subset_path),'wrong path' # sanity check

In [4]:
# imports specific to the MSD
import utils.hdf5_getters as GETTERS

In [None]:
# the following function simply gives us a nice string for
# a time lag in seconds
def strtimedelta(starttime,stoptime):
    return str(datetime.timedelta(seconds=stoptime-starttime))

# we define this very useful function to iterate the files
def apply_to_all_files(basedir,func=lambda x: x,ext='.h5'):
    """
    From a base directory, go through all subdirectories,
    find all files with the given extension, apply the
    given function 'func' to all of them.
    If no 'func' is passed, we do nothing except counting.
    INPUT
       basedir  - base directory of the dataset
       func     - function to apply to all filenames
       ext      - extension, .h5 by default
    RETURN
       number of files
    """
    cnt = 0
    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        # count files
        cnt += len(files)
        # apply function to all files
        for f in files :
            func(f)       
    return cnt

In [None]:
# we can now easily count the number of files in the dataset
print('number of song files:',apply_to_all_files(msd_subset_data_path))

In [None]:
# let's now get all artist names in a set(). One nice property:
# if we enter many times the same artist, only one will be kept.
all_artist_names = set()

# we define the function to apply to all files
def func_to_get_artist_name(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get artist ID and put it
    - close the file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    for song_index in tqdm(range(len(h5.root.metadata.songs))):
        artist_name = GETTERS.get_artist_name(h5, songidx=song_index)
        all_artist_names.add( artist_name )
    h5.close()
    
# let's apply the previous function to all files
# we'll also measure how long it takes
t1 = time.time()
apply_to_all_files(msd_subset_data_path,func=func_to_get_artist_name)
t2 = time.time()
print ('all artist names extracted in:',strtimedelta(t1,t2))

In [None]:
# let's see some of the content of 'all_artist_names'
print ('found', len(all_artist_names), 'unique artist names')
print (list(all_artist_names)[:5])

In [5]:
h5 = GETTERS.open_h5_file_read('../data/msd_summary_file.h5')

In [None]:
h5.root.metadata.songs

In [None]:
GETTERS.get_title(h5, songidx=100), GETTERS.get_artist_name(h5, songidx=100), GETTERS.get_artist_id(h5, songidx=100)

In [None]:
GETTERS.get_song_id(h5, songidx=100)

In [8]:
all_data = []

# not in h5-flie
skip_function_names = [
    'get_num_songs'
]  

function_names = [name for name in dir(GETTERS) if name.startswith('get_') and not name in skip_function_names]
fields_names = [name[4:] for name in function_names]

for song_index in tqdm(range(len(h5.root.metadata.songs))):
    if song_index == 10000:
        break
        
    song_dict = {}
    for func_index, func_name in enumerate(function_names):
        getter_func = getattr(GETTERS, func_name)
        
        value = getter_func(h5, songidx=song_index)
        key = fields_names[func_index]
        
        song_dict[key] = value

    all_data.append(song_dict)


  0%|          | 0/1000000 [00:00<?, ?it/s][A
  0%|          | 14/1000000 [00:00<2:05:48, 132.47it/s][A
  0%|          | 29/1000000 [00:00<2:02:22, 136.18it/s][A
  0%|          | 43/1000000 [00:00<2:01:53, 136.73it/s][A
  0%|          | 58/1000000 [00:00<2:01:05, 137.63it/s][A
  0%|          | 72/1000000 [00:00<2:02:08, 136.44it/s][A
  0%|          | 86/1000000 [00:00<2:02:14, 136.33it/s][A
  0%|          | 100/1000000 [00:00<2:02:31, 136.02it/s][A
  0%|          | 114/1000000 [00:00<2:02:55, 135.57it/s][A
  0%|          | 128/1000000 [00:00<2:03:48, 134.60it/s][A
  0%|          | 143/1000000 [00:01<2:01:55, 136.67it/s][A
  0%|          | 157/1000000 [00:01<2:02:02, 136.54it/s][A
  0%|          | 171/1000000 [00:01<2:04:05, 134.29it/s][A
  0%|          | 185/1000000 [00:01<2:05:38, 132.63it/s][A
  0%|          | 199/1000000 [00:01<2:10:22, 127.81it/s][A
  0%|          | 212/1000000 [00:01<2:11:14, 126.96it/s][A
  0%|          | 225/1000000 [00:01<2:11:51, 126.37it/s][

In [11]:
import pandas as pd

In [12]:
result_df = pd.DataFrame.from_dict(all_data)
result_df.to_csv('../data/msd_summary_file_10k.csv')