# PROJECT - *My Way* of seeing music covers
#### Pierre-Antoine Desplaces, Anaïs Ladoy, Lou Richard

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from io import StringIO
import sys

## Data organisation 
- All the additional files were downloaded from the cluster; they provide the metadata for the whole Million Song Dataset (MSD). They will help us elaborate a plan, and a script will then retrieve more information about specific tracks (h5 files on the cluster), possibly using the cluster's CPUs. The path to a track on the cluster is, for example, million-songs/data/A/A/A (the 3 letters at the end being the 3rd, 4th and 5th letters of the track ID).
- The music covers will be detected using another dataset (SecondHandSongs). We can either use the downloadable dataset containing 18,196 tracks (all linked to the MSD dataset), or web-scrape the SHS website (https://secondhandsongs.com/), which offers much more information (522,436 covers) but is not necessarily connected to our MSD dataset. The SHS API is RESTful (it returns JSON objects) and is limited to 100 requests per minute and 1,000 requests per hour, but we can contact them to have the limitation removed.
- Some artists are geolocated (30% of all MSD artists) in the artist_location dataframe.

In [4]:
# Load the MSD additional files: every file is '<SEP>'-delimited and has no header row,
# so the column names are supplied explicitly and one of them is used as the index.
tracks_per_year = pd.read_csv(
    'data/AdditionalFiles/tracks_per_year.txt',
    sep='<SEP>',
    engine='python',
    header=None,
    names=['year', 'trackID', 'artist', 'title'],
    index_col=1,
)
unique_tracks = pd.read_csv(
    'data/AdditionalFiles/unique_tracks.txt',
    sep='<SEP>',
    engine='python',
    header=None,
    names=['trackID', 'songID', 'artist', 'title'],
    index_col=0,
)
unique_artists = pd.read_csv(
    'data/AdditionalFiles/unique_artists.txt',
    sep='<SEP>',
    engine='python',
    header=None,
    names=['artistID', 'artistMID', 'randomTrack', 'name'],
    index_col=0,
)
artist_location = pd.read_csv(
    'data/AdditionalFiles/artist_location.txt',
    sep='<SEP>',
    engine='python',
    header=None,
    names=['artistID', 'lat', 'long', 'name', 'location'],
    index_col=0,
)

In [5]:
# For each loaded dataframe, report whether its index is unique and how many rows it has
print('Dataframe (Unique index, Number of elements)')
for label, frame in (
    ('tracks_per_year ', tracks_per_year),
    ('unique_tracks ', unique_tracks),
    ('unique_artists ', unique_artists),
    ('artist_location ', artist_location),
):
    print(label, (frame.index.is_unique, frame.shape[0]))

Dataframe (Unique index, Number of elements)
tracks_per_year  (True, 515576)
unique_tracks  (True, 1000000)
unique_artists  (True, 44745)
artist_location  (True, 13850)


In [6]:
# Preview the first five rows of tracks_per_year
tracks_per_year.head(n=5)

Unnamed: 0_level_0,year,artist,title
trackID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TRSGHLU128F421DF83,1922,Alberta Hunter,Don't Pan Me
TRMYDFV128F42511FC,1922,Barrington Levy,Warm And Sunny Day
TRRAHXQ128F42511FF,1922,Barrington Levy,Looking My Love
TRFAFTK12903CC77B8,1922,Barrington Levy,Warm And Sunny Day
TRSTBUY128F4251203,1922,Barrington Levy,Mandela You're Free


In [7]:
# Preview the first five rows of unique_tracks
unique_tracks.head(n=5)

Unnamed: 0_level_0,songID,artist,title
trackID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


In [16]:
# Shape of the array of distinct values in the 'artist' column (a 1-tuple holding the count)
unique_tracks['artist'].unique().shape

(72665,)

In [8]:
# Preview the first five rows of unique_artists
unique_artists.head(n=5)

Unnamed: 0_level_0,artistMID,randomTrack,name
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,TRMUOZE12903CDF721,The Bristols
AR003FB1187B994355,1dbd2d7b-64c8-46aa-9f47-ff589096d672,TRWDPFR128F93594A6,The Feds
AR006821187FB5192B,94fc1228-7032-4fe6-a485-e122e5fbee65,TRMZLJF128F4269EAC,Stephen Varcoe/Choir of King's College_ Cambri...
AR009211187B989185,9dfe78a6-6d91-454e-9b95-9d7722cbc476,TRMGURO12903CAE2F0,Carroll Thompson
AR009SZ1187B9A73F4,8cd574c0-b9f7-4998-94f4-654dffaecdf2,TRGWWFP12903CE7E79,Gorodisch


In [9]:
# Preview the first five rows of unique_artists
# NOTE(review): this cell repeats the preceding unique_artists preview verbatim;
# possibly a different preview was intended here -- confirm or remove.
unique_artists.head(n=5)

Unnamed: 0_level_0,artistMID,randomTrack,name
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,TRMUOZE12903CDF721,The Bristols
AR003FB1187B994355,1dbd2d7b-64c8-46aa-9f47-ff589096d672,TRWDPFR128F93594A6,The Feds
AR006821187FB5192B,94fc1228-7032-4fe6-a485-e122e5fbee65,TRMZLJF128F4269EAC,Stephen Varcoe/Choir of King's College_ Cambri...
AR009211187B989185,9dfe78a6-6d91-454e-9b95-9d7722cbc476,TRMGURO12903CAE2F0,Carroll Thompson
AR009SZ1187B9A73F4,8cd574c0-b9f7-4998-94f4-654dffaecdf2,TRGWWFP12903CE7E79,Gorodisch


In [10]:
# Preview the first five rows of artist_location
artist_location.head(n=5)

Unnamed: 0_level_0,lat,long,name,location
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ARZGXZG1187B9B56B6,-16.96595,-61.14804,Endless Blue,Santa Cruz
AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN"
ARHJJ771187FB5B581,51.59678,-0.33556,Screaming Lord Sutch,"Harrow, Middlesex, England"
ARJ8YLL1187FB3CA93,40.69626,-73.83301,Morton Gould,"Richmond Hill, NY"
ARYBAGV11ECC836DAC,43.58828,-79.64372,Crash Parallel,Mississauga


In [11]:
def read_shs_files(pathToFile):
    """Parse a SecondHandSongs (SHS) list file into a flat DataFrame.

    The file is processed line by line:

    * blank (whitespace-only) lines are skipped;
    * a line starting with '%' is a header line: it is remembered, with its
      newline replaced by '<SEP>', and prepended to every data line that
      follows until the next header;
    * every other line is a data line.

    Parameters
    ----------
    pathToFile : str
        Path of the SHS text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with columns 'shsID', 'trackID', 'artistID'
        and 'shsPerf'. 'shsID' holds the raw header line, including its
        leading '%'.

    Notes
    -----
    If a data line is met before any '%' header, 'NO ID found' is printed and
    ``sys.exit(1)`` is called (i.e. ``SystemExit`` is raised).
    """
    s = StringIO()
    cur_ID = None
    # The context manager releases the file handle on every exit path,
    # including the sys.exit() below (the original never closed the file).
    with open(pathToFile) as f:
        for ln in f:
            if not ln.strip():
                continue
            if ln.startswith('%'):
                # Turn the header's newline into a field separator so the
                # header becomes the first field of each following data row.
                cur_ID = ln.replace('\n', '<SEP>', 1)
                continue
            if cur_ID is None:
                print('NO ID found')
                sys.exit(1)
            s.write(cur_ID + ln)
    # Rewind the in-memory buffer so read_csv starts from its beginning
    s.seek(0)
    df = pd.read_csv(s, delimiter='<SEP>', engine='python', header=None,
                     names=['shsID', 'trackID', 'artistID', 'shsPerf'])
    return df

In [12]:
# Parse the SHS test-set file into a dataframe
data = read_shs_files(pathToFile='data/SHS_testset.txt')

In [13]:
# Remove only the leading '%' header marker from shsID; a plain strip('%') would
# also remove a '%' that legitimately ends the value (e.g. a title ending in '%').
data.shsID = data.shsID.str.lstrip('%')
# NOTE: shsID is left as a regular column. `data = data.set_index(..., inplace=True)`
# must not be used: in-place calls return None, so it would rebind `data` to None.

In [14]:
# Display the parsed SHS dataframe
data

Unnamed: 0,shsID,trackID,artistID,shsPerf
0,"115402,74782, Putty (In Your Hands)",TRJVDMI128F4281B99,AR46LG01187B98DB5D,74784
1,"115402,74782, Putty (In Your Hands)",TRNJXCO128F92E1930,ARQD13K1187B98E441,138584
2,"24350, I.G.Y. (Album Version)",TRIBOIS128F9340B19,ARUVZYG1187B9B2809,24350
3,"24350, I.G.Y. (Album Version)",TRGXZDU128F9301E53,AR4LE591187FB3FCFB,24363
4,"79178, When The Catfish Is In Bloom",TRQSIOY128F92FACA7,ARU75JD1187FB38B79,79178
5,"79178, When The Catfish Is In Bloom",TRXEFPP128F92F9D97,ARZDN5A1187B99338B,79186
6,"11012, Sheer Heart Attack",TRXIXMS128F4287B96,ARL4TII1187B9B46E1,11012
7,"11012, Sheer Heart Attack",TRABVTG128F934AB80,AR9BVRM1187FB51139,97131
8,"11012, Sheer Heart Attack",TRRZZZZ128F422F784,ARNFBNR1187B9A25C2,-1
9,"5262, Calling Dr. Love",TRNBZMV128F42B261C,ARGCVZG1187B9B916F,26300
