 Source: http://millionsongdataset.com/sites/default/files/tutorial1.py.txt

In [2]:
# imports
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np
import pandas as pd

In [25]:
# path to the Million Song Dataset subset (uncompressed)
# CHANGE IT TO YOUR LOCAL CONFIGURATION
msd_subset_path='MillionSongSubset'
#msd_subset_data_path=os.path.join(msd_subset_path,'data')
msd_subset_addf_path=os.path.join(msd_subset_path,'AdditionalFiles')
assert os.path.isdir(msd_subset_path),'wrong path' # sanity check

In [3]:
# path to the Million Song Dataset code
# CHANGE IT TO YOUR LOCAL CONFIGURATION
msd_code_path='MSongsDB'
assert os.path.isdir(msd_code_path),'wrong path' # sanity check

In [4]:
# we add some paths to python so we can import MSD code
sys.path.append( os.path.join(msd_code_path,'PythonSrc') )

In [5]:
# imports specific to the MSD
import hdf5_getters as GETTERS

In [6]:
# the following function simply gives us a nice string for
# a time lag in seconds
def strtimedelta(starttime,stoptime):
    """
    Format the lag between two timestamps as a readable string.
    INPUT
       starttime - start of the interval, in seconds
       stoptime  - end of the interval, in seconds
    RETURN
       str() of a datetime.timedelta spanning stoptime-starttime seconds,
       e.g. '0:01:05'
    """
    elapsed = datetime.timedelta(seconds=stoptime - starttime)
    return str(elapsed)

In [7]:
# we define this very useful function to iterate the files
def apply_to_all_files(basedir,func=lambda x: x,ext='.h5'):
    """
    From a base directory, go through all subdirectories,
    find all files with the given extension, apply the
    given function 'func' to all of them.
    If no 'func' is passed, we do nothing except counting
    (the default 'func' is the identity and its result is ignored).
    INPUT
       basedir  - base directory of the dataset
       func     - function called with each matching path
                  (its return value is discarded)
       ext      - extension, .h5 by default
    RETURN
       number of files
    """
    cnt = 0
    # iterate over all subdirectories; only the directory path is needed,
    # the actual listing is done by glob below
    for root, _dirs, _files in os.walk(basedir):
        # escape the directory part so that glob metacharacters
        # ('[', ']', '*', '?') appearing in a folder name are matched
        # literally instead of being interpreted as a pattern
        pattern = os.path.join(glob.escape(root),'*'+ext)
        matches = glob.glob(pattern)
        # count files
        cnt += len(matches)
        # apply function to all files
        for f in matches:
            func(f)
    return cnt

In [8]:
# we can now easily count the number of files in the dataset
print('number of song files:',apply_to_all_files(msd_subset_path))

number of song files: 10000


In [10]:
# let's now collect all artist names in a set(). One nice property:
# if we add the same artist many times, only one copy is kept.
all_artist_names = set()

In [11]:
# we define the function to apply to all files
def func_to_get_artist_name(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get the artist name and add it to the set 'all_artist_names'
    - close the file (even if reading the name fails)
    INPUT
       filename - path of the song file to read
    RETURN
       nothing; the module-level set 'all_artist_names' is updated
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        artist_name = GETTERS.get_artist_name(h5)
        all_artist_names.add( artist_name )
    finally:
        # always release the file handle, so that an error while reading
        # one file does not leave it open
        h5.close()

In [14]:
# let's apply the previous function to all files
# we'll also measure how long it takes
# perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can
# jump if the system time is adjusted during the run
t1 = time.perf_counter()
apply_to_all_files(msd_subset_path,func=func_to_get_artist_name)
t2 = time.perf_counter()
print('all artist names extracted in:',strtimedelta(t1,t2))

all artist names extracted in: 0:03:10.560690


In [22]:
# check N artist names (a set has no defined order, so which N names
# are shown is arbitrary)
N = 5
list(all_artist_names)[:N]

[b'Janet Jackson Featuring Kanye West',
 b'Randy Stonehill',
 b'Scott Matthews',
 b'Estrellas Cubanas',
 b'Ray Pillow']

In [24]:
# or 
# let's see some of the content of 'all_artist_names'
print('found',len(all_artist_names),'unique artist names')
# build the list once and slice it: this avoids re-creating the full
# list on every iteration and does not fail when the set holds fewer
# than 5 names
for artist_name in list(all_artist_names)[:5]:
    print(artist_name)

found 4412 unique artist names
b'Janet Jackson Featuring Kanye West'
b'Randy Stonehill'
b'Scott Matthews'
b'Estrellas Cubanas'
b'Ray Pillow'


In [26]:

# this takes too long, and the work of listing artist names has already
# been done. Let's redo the same task using an SQLite database.
# We connect to the provided database: subset_track_metadata.db

#conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
#                                    'subset_track_metadata.db'))

OperationalError: unable to open database file

# Trying to produce User profiles from the train_triplets.txt

The original file contains
- 1,019,318 unique users
- 48,373,586 (user, song, play count) triplets

A subset of 1000 triplets can be found in triplets_1000.txt, where each line is in the format:
    
    userID \tab songID \tab play_count

Read in the data:

In [9]:
user_profiles = pd.read_csv('triplets_1000.txt', sep='\t', names = ['userID','songID', 'play_count'])

In [10]:
user_profiles

Unnamed: 0,userID,songID,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
995,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYPJMP12AF72A901D,1
996,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYRHNG12A8C14002E,1
997,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYVSHP12A6702016E,2
998,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYYYFE12A81C2395E,1


Pivot to transform the data from long to wide format:

In [11]:
user_profiles.pivot(index='userID', columns='songID', values='play_count')

songID,SOAARXR12A8C133D15,SOABRAB12A6D4F7AAF,SOACPBY12A8C13FEF9,SOACWYB12AF729E581,SOADGFH12A8C143D89,SOADQPP12A67020C82,SOAFOBL12AF72A25BA,SOAFPAX12AB0187A17,SOAFTRR12AF72A8D4D,SOAIILB12A58A776F7,...,SOZMJFG12AB017BDAF,SOZMNAX12A58A77F88,SOZNBQP12A6310D8AA,SOZOBWN12A8C130999,SOZPQES12A6D4F8E57,SOZRBOZ12A58A7AD7E,SOZRLJL12A8C14415F,SOZVCRW12A67ADA0B7,SOZWVEH12A6D4F7C37,SOZZHXI12A8C13BF7D
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17aa9f6dbdf753831da8f38c71b66b64373de613,1.0,,1.0,,,,,,,,...,,,,,,,1.0,,,
4bd88bfb25263a75bbdd467e74018f4ae570e5df,,,,,,,,,,,...,,,,,,,,,,
5a905f000fc1ff3df7ca807d57edb608863db05d,,,,,11.0,,12.0,,1.0,3.0,...,,,,,,,,,,
85c1f87fea955d09b4bec2e36aee110927aedf9a,,,,2.0,,,,,,,...,,,,,,,,,,
8937134734f869debcab8f23d77465b4caaa85df,,,,,,,,6.0,,,...,,,,,,,,,,
969cc6fb74e076a68e36a04409cb9d3765757508,,2.0,,,,,,,,,...,,,,,,1.0,,,1.0,
9bb911319fbc04f01755814cb5edb21df3d1a336,,,,,,,,,,,...,,,,,,,,,,
9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,,,,,,,,,,,...,,,,,,,,,,
b64cdd1a0bd907e5e00b39e345194768e330d652,,,,,,,,,3.0,,...,,2.0,,,2.0,,,,,
b80344d063b5ccb3212f76538f3d9e43d87dca9e,,,,,,,,,,,...,,,,1.0,,,,,,1.0
