# PROJECT - *My Way* of seeing music covers
#### Pierre-Antoine Desplaces, Anaïs Ladoy, Lou Richard

In [89]:
# Import libraries
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from io import StringIO
import sys
import requests
from bs4 import BeautifulSoup
import pickle

## Data organisation 
- All the additional files were downloaded from the cluster giving all the metadata of the Million Songs dataset. They will help to elaborate a plan and a script will then search more information about a specific track (h5 files in the cluster) maybe using cluster cpu. The path to access to a track in the cluster is for example million-songs/data/A/A/A (with the 3 letters at the end being the 3rd, 4th and 5th letter on the track id).
- The music covers will be detected using another dataset (SecondHandSongs), we have the choice to use the downloadable dataset containing 18,196 tracks (all with a connection to the MSD dataset), or to web-scrapp the SHS website (https://secondhandsongs.com/) where we have much more information (522 436 covers) but not necessarly connected to our MSD dataset. The SHS API is RESTful (return a JSON object) and we are limited to 100 requests per minute and 1000 requestion per hour but we can contact them to remove limitation.
- Some artist are geolocalised (30% of the MSD total artists) on the artist_location dataframe.

In [32]:
#Load Additional files
tracks_per_year=pd.read_csv('data/AdditionalFiles/tracks_per_year.txt',delimiter='<SEP>',engine='python',header=None,index_col=1,names=['year','trackID','artist','title'])
unique_tracks=pd.read_csv('data/AdditionalFiles/unique_tracks.txt',delimiter='<SEP>',engine='python',header=None,index_col=0,names=['trackID','songID','artist','title'])
unique_artists=pd.read_csv('data/AdditionalFiles/unique_artists.txt',delimiter='<SEP>',engine='python',header=None,index_col=0,names=['artistID','artistMID','randomTrack','name'])
artist_location=pd.read_csv('data/AdditionalFiles/artist_location.txt',delimiter='<SEP>',engine='python',header=None,index_col=0,names=['artistID','lat','long','name','location'])

In [33]:
#Check if indexes is unique and print the number of elements for each dataframe
print('Dataframe (Unique index, Number of elements)')
print('tracks_per_year ',(tracks_per_year.index.is_unique,tracks_per_year.shape[0]))
print('unique_tracks ',(unique_tracks.index.is_unique,unique_tracks.shape[0]))
print('unique_artists ',(unique_artists.index.is_unique,unique_artists.shape[0]))
print('artist_location ',(artist_location.index.is_unique,artist_location.shape[0]))

Dataframe (Unique index, Number of elements)
tracks_per_year  (True, 515576)
unique_tracks  (True, 1000000)
unique_artists  (True, 44745)
artist_location  (True, 13850)


In [34]:
tracks_per_year.head()

Unnamed: 0_level_0,year,artist,title
trackID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TRSGHLU128F421DF83,1922,Alberta Hunter,Don't Pan Me
TRMYDFV128F42511FC,1922,Barrington Levy,Warm And Sunny Day
TRRAHXQ128F42511FF,1922,Barrington Levy,Looking My Love
TRFAFTK12903CC77B8,1922,Barrington Levy,Warm And Sunny Day
TRSTBUY128F4251203,1922,Barrington Levy,Mandela You're Free


In [35]:
unique_tracks.head()

Unnamed: 0_level_0,songID,artist,title
trackID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


In [36]:
unique_tracks.artist.unique().shape

(72665,)

In [37]:
unique_artists.head()

Unnamed: 0_level_0,artistMID,randomTrack,name
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,TRMUOZE12903CDF721,The Bristols
AR003FB1187B994355,1dbd2d7b-64c8-46aa-9f47-ff589096d672,TRWDPFR128F93594A6,The Feds
AR006821187FB5192B,94fc1228-7032-4fe6-a485-e122e5fbee65,TRMZLJF128F4269EAC,Stephen Varcoe/Choir of King's College_ Cambri...
AR009211187B989185,9dfe78a6-6d91-454e-9b95-9d7722cbc476,TRMGURO12903CAE2F0,Carroll Thompson
AR009SZ1187B9A73F4,8cd574c0-b9f7-4998-94f4-654dffaecdf2,TRGWWFP12903CE7E79,Gorodisch


In [38]:
unique_artists.head()

Unnamed: 0_level_0,artistMID,randomTrack,name
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,TRMUOZE12903CDF721,The Bristols
AR003FB1187B994355,1dbd2d7b-64c8-46aa-9f47-ff589096d672,TRWDPFR128F93594A6,The Feds
AR006821187FB5192B,94fc1228-7032-4fe6-a485-e122e5fbee65,TRMZLJF128F4269EAC,Stephen Varcoe/Choir of King's College_ Cambri...
AR009211187B989185,9dfe78a6-6d91-454e-9b95-9d7722cbc476,TRMGURO12903CAE2F0,Carroll Thompson
AR009SZ1187B9A73F4,8cd574c0-b9f7-4998-94f4-654dffaecdf2,TRGWWFP12903CE7E79,Gorodisch


In [39]:
artist_location.head()

Unnamed: 0_level_0,lat,long,name,location
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ARZGXZG1187B9B56B6,-16.96595,-61.14804,Endless Blue,Santa Cruz
AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN"
ARHJJ771187FB5B581,51.59678,-0.33556,Screaming Lord Sutch,"Harrow, Middlesex, England"
ARJ8YLL1187FB3CA93,40.69626,-73.83301,Morton Gould,"Richmond Hill, NY"
ARYBAGV11ECC836DAC,43.58828,-79.64372,Crash Parallel,Mississauga


In [40]:
def read_shs_files(pathToFile):
    f = open(pathToFile)
    s = StringIO()
    cur_ID = None
    for ln in f:
        if not ln.strip():
                continue
        if ln.startswith('%'):
                cur_ID = ln.replace('\n','<SEP>',1)
                continue
        if cur_ID is None:
                print ('NO ID found')
                sys.exit(1)
        s.write(cur_ID + ln)
    s.seek(0)
    df = pd.read_csv(s,delimiter='<SEP>',engine='python',header=None,names=['shsID','trackID','artistID','shsPerf'])
    return df

In [41]:
#Import the two SHS datasets (SHS data splitted in a train and test set to use for ML if wanted)
SHS_testset=read_shs_files('data/SHS_testset.txt')
SHS_trainset=read_shs_files('data/SHS_trainset.txt')
covers=pd.concat([SHS_testset,SHS_trainset])
covers.shape

(18196, 4)

In [42]:
covers.shsID=covers.shsID.str.strip('%')

In [43]:
covers.head()

Unnamed: 0,shsID,trackID,artistID,shsPerf
0,"115402,74782, Putty (In Your Hands)",TRJVDMI128F4281B99,AR46LG01187B98DB5D,74784
1,"115402,74782, Putty (In Your Hands)",TRNJXCO128F92E1930,ARQD13K1187B98E441,138584
2,"24350, I.G.Y. (Album Version)",TRIBOIS128F9340B19,ARUVZYG1187B9B2809,24350
3,"24350, I.G.Y. (Album Version)",TRGXZDU128F9301E53,AR4LE591187FB3FCFB,24363
4,"79178, When The Catfish Is In Bloom",TRQSIOY128F92FACA7,ARU75JD1187FB38B79,79178


In [44]:
#Convert shsID to clique id (first convert to category and get a code)
covers=covers.assign(clique_id=(covers.shsID.astype('category')).cat.codes)
#Remove the shsID and the shsPerf columns (useless)
covers.drop('shsID',axis=1,inplace=True)
#covers.drop('shsPerf',axis=1,inplace=True)
#Merge with unique_artists dataframe to find the artist name for each track (no taking consideration of featuring since we take only the name of the artist assigned with the track)
covers=covers.merge(unique_artists[['name']],how='left',left_on='artistID',right_index=True)
#Take the clique id we defined as id of the dataframe (not unique index for now)
#covers.set_index('id',inplace=True)
#covers.sort_index(inplace=True)
#Merge with unique_tracks dataframe to find the track name
covers=covers.merge(unique_tracks[['title']],how='left',left_on='trackID',right_index=True)
#Merge with tracks_per_year dataframe to find the year of each track
covers=covers.merge(tracks_per_year[['year']],how='left',left_on='trackID',right_index=True)

In [45]:
covers.head()

Unnamed: 0,trackID,artistID,shsPerf,clique_id,name,title,year
0,TRJVDMI128F4281B99,AR46LG01187B98DB5D,74784,1433,The Detroit Cobras,Putty (In Your Hands),1998.0
1,TRNJXCO128F92E1930,ARQD13K1187B98E441,138584,1433,Sylvie Vartan,Ne Le Déçois Pas,1962.0
2,TRIBOIS128F9340B19,ARUVZYG1187B9B2809,24350,2543,Donald Fagen,I.G.Y. (Album Version),1982.0
3,TRGXZDU128F9301E53,AR4LE591187FB3FCFB,24363,2543,Take 6,Beautiful World (Album Version),
4,TRQSIOY128F92FACA7,ARU75JD1187FB38B79,79178,5240,John Fahey,When The Catfish Is In Bloom,1968.0


In [46]:
print('Number of cliques :', max(covers.index)+1) #Number of cliques (+1 because id starts at 0)
print('Number of unique tracks :', len(covers.trackID.unique())) 
print('Number of unique artists :', len(covers.artistID.unique()))
print('Number of missing trackID :', len(covers[covers.trackID.isnull()]))
print('Number of missing artistID :', len(covers[covers.artistID.isnull()]))
print('Number of missing years :', len(covers[covers.year.isnull()]))

Number of cliques : 12960
Number of unique tracks : 18196
Number of unique artists : 5578
Number of missing trackID : 0
Number of missing artistID : 0
Number of missing years : 4796


In [47]:
covers=covers.sort_values(['clique_id', 'year'], ascending=[True, True]).reset_index()

In [48]:
covers.tail()

Unnamed: 0,index,trackID,artistID,shsPerf,clique_id,name,title,year
18191,9538,TRUZAVE128F426E391,AR3DLBB1187B98F3DF,106651,5852,The Statler Brothers,Oh Happy Day,1993.0
18192,9540,TRDNIRY128F425A5C8,ARDRJSP126E2B3BEF8,-1,5852,Spiritualized,Oh Happy Day,1998.0
18193,9539,TRPVDKP128F934956E,ARELPXQ1187FB384FD,138704,5852,Queen Latifah,Oh Happy Day,
18194,5668,TREMVLC128F92EFA95,ARBFDJW1187B9AD27A,29576,5853,The Fabulous Thunderbirds,Tip On In,1987.0
18195,5667,TRMTMWV128F92F6F78,ARDYR3C1187FB461CE,9986,5853,George Thorogood And The Destroyers,Tip On In,


In [49]:
covers[(covers.year.isnull()) & (covers.shsPerf <0)]

Unnamed: 0,index,trackID,artistID,shsPerf,clique_id,name,title,year
3,11504,TRKOINL128F42926C3,ARQ5FSZ1187B98AD74,-1,0,Connee Boswell & Sy Oliver Orchestra,Star Dust,
5,12946,TRYQEDQ128F427917C,ARS4KT21187B9B9438,-1,1,Fito Paez,Yo Vengo A Ofrecer Mi Corazon,
13,12813,TRLLCAL128F428B903,ARP2RHS1187B991595,-1,5,Zen Guerrilla,Moonage Daydream,
14,12810,TREVWUZ128F4263A9B,AR9UYPT1187B9AE833,-1,6,Hear'Say,Monday Monday,
15,12811,TRGYREY128E0791913,ARQ294N1187FB53D2A,-1,6,The Mamas & The Papas,Monday_ Monday,
22,12805,TRQPBQV128F423C1E9,AREH7LP1187B9B7164,-1,9,War,Low Rider,
23,12801,TRVXQXV128F427863A,ARTO3L81187B9ACA8C,-1,10,Jeff Buckley,Lover_ You Should've Come Over,
24,12802,TRZJWSX12903CA89C1,ARC1IHZ1187FB4E920,-1,10,Jamie Cullum,Lover_ You Should Have Come Over,
26,5158,TRFWVIP128EF3444F3,ARE66XI1187B9B8743,-1,11,Ian McCulloch,Lover_ Lover_ Lover,
30,12799,TRJLBUB128F92E293E,ARN3SEQ1187FB47AA8,-1,13,Heart,Love Reign O'er Me,


In [50]:
print('Number of missing years with valid shsPerf (API request on the performance page) :',len(covers[(covers.year.isnull()) & (covers.shsPerf != -1)]))
print('Number of missing years with invalid shsPerf (API request on the search page to find shsPerf) :',len(covers[(covers.year.isnull())])-len(covers[(covers.year.isnull()) & (covers.shsPerf != -1)]))

Number of missing years with valid shsPerf (API request on the performance page) : 4128
Number of missing years with invalid shsPerf (API request on the search page to find shsPerf) : 668


We need to find the missing years in order to rank the cover songs for each clique and thus, find the original song and the following covers. Since year isn't necessarly sufficient informations to discriminate the songs (cover appears sometimes in the same year than the original one), it will be better to have the entire released date for ALL the tracks if the information is available in the SHS website.

Need the shsPerf to access to the song page in SHS website, where we can find informations about the language and the released date of the song. In the dataset, negative values of shsPerf are considered as missing values.

Two ways of doing it :
- For valid SHS performance ID, access to the performance page (e.g. 'https://secondhandsongs.com/performance/1983') and web-scrapping of the Language and Released date informations using the perfInfo() function.
- For invalid SHS performance ID, API request to the search page (e.g. 'https://secondhandsongs.com/search/performance?title=blackbird&performer=beatles'), extract the perf ID with the find_PerfID() and then use the perfInfo() function.

In [51]:
#Compute the order of songs for each clique
#covers['rank']=covers.groupby('clique_id')['year'].rank(method='dense',ascending=True).astype('int')

In [52]:
#covers.set_index('clique_id',inplace=True)
covers.drop('index',axis=1,inplace=True)

In [53]:
covers.tail(20)

Unnamed: 0,trackID,artistID,shsPerf,clique_id,name,title,year
18176,TRITRRU128F423DC76,ARJ59K61187FB5B43E,99662,5846,Afghan Whigs,Sammy (Album),1990.0
18177,TRRCCUQ128F42449F7,ARNUU2R1187B9A316C,99675,5846,Sounds Like Violence,Sammy,2009.0
18178,TRCEMLE128F4281036,ARJ59K61187FB5B43E,99668,5847,Afghan Whigs,Going To Town,1996.0
18179,TRCGGYF128F9338B93,ARYDZKJ1187FB540C6,99680,5847,Marshall_ Susan,Going To Town,2009.0
18180,TRAHFYD128F426BBAD,AR94ZOI1187FB46BDA,103980,5848,Willie Dixon,The Same Thing,1970.0
18181,TRRUFPY128F4261744,ARJWDFW1241B9C69AE,118487,5848,Todd Wolfe Blues Project,Same Thing,
18182,TRTFWRF128F92E378E,ARX1CYE1187FB3A97B,99730,5849,Bill Anderson,The Tip Of My Fingers,2002.0
18183,TRBXSHQ128F4278320,ARLM6I41187B9BA123,99731,5849,Roy Clark,The Tip Of My Fingers,
18184,TRGOMBU128F42895B9,AR14CJ91187FB3A994,99733,5849,Eddy Arnold,The Tip of My Fingers,
18185,TRMCRMX128F4267AC3,ARSPUHE1187B99DB8A,125607,5850,Frank Marino & Mahogany Rush,Norwegian Wood (This Bird Has Flown),


In [90]:
pickle.dump(covers,open('covers.p','wb'))

To handle :
- Find the missing years using API request to SHS website (if no informations about the year on the website, find another solution)
- Find a way to detect the original song if the first cover is the same year? (also the case where two covers are made during the same year and then have the same rank.. problem with multi-index after because not unique)
- API request to SHS website for the location and the language

In [54]:
#API request to SHS website for the page of a specific performance (defined as shsPerf) to extract Language and Date
def perfInfo_SHS(shsPerf):
    if shsPerf<0:
        perfLanguage='Unavailable'
        perfDate='Unavailable'
    elif shsPerf>0:
        r = requests.get('https://secondhandsongs.com/performance/'+str(shsPerf))
        print(r.status_code)
        soup = BeautifulSoup(r.text, 'html.parser')
        perfMeta=soup.find_all('dl')[0] 
        perfLanguage=perfMeta.find('dd',attrs={'itemprop':'inLanguage'})
        if perfLanguage is None :
            perfLanguage='Missing'
        else :
            perfLanguage=perfLanguage.text

        perfDate=perfMeta.find('div',attrs={'class':'media-body'})
        if perfDate is None :
            perfDate='Missing'
        else :
            perfDate=perfDate.find('p').text.split('\n')[2].strip(' ')

    return perfLanguage,perfDate

In [55]:
language=perfInfo_SHS(9986)[0]
date=perfInfo_SHS(9986)[1]

200
200


Web-scraping - Part of the dataframe each hour (1000 requests by hour)

In [77]:
#Work with a subset a the dataframe to create the algorithms
part=covers[2055:3017]
part.head()

Unnamed: 0,trackID,artistID,shsPerf,clique_id,name,title,year
2055,TRWJCBY128F4261C0F,AR11YQ81187FB3C654,100068,940,Dixie Chicks,Tonight The Heartache's On Me,1998.0
2056,TRSSNRG12903CC8518,ARHAL3V1187B9AA462,100066,940,Joy Lynn White,Tonight The Heartache's On Me,
2057,TRBWTHA128E0791A0B,ARLAUED1187B9ACEAF,30050,941,Eric Clapton,Willie And The Hand Jive,1974.0
2058,TRHFTUX128F93010D0,ARB5U6G1187B9A994C,10009,941,Johnny Otis Show,Willie And The Hand Jive,1984.0
2059,TRGDJUM12903CC5CD9,ARRHVVL1187B991E41,-1,941,Johnny Otis,Willie And The Hand Jive,1991.0


In [78]:
print('Number of cliques in the subset :', len(part.clique_id.unique()))
print('Number of tracks in the subset :', part.shape[0])
print('Number of missing years in the subset :', len(part[part.year.isnull()]))
print('Number of invalid shsPerf in the subset :', len(part[part.shsPerf<0]))

Number of cliques in the subset : 298
Number of tracks in the subset : 962
Number of missing years in the subset : 269
Number of invalid shsPerf in the subset : 29


In [None]:
part['language'], \
part['date'] = zip(*part.shsPerf.map(perfInfo_SHS))

In [None]:
part.to_csv('data/partSHS_webscraping/part_0_800.csv')

In [81]:
r = requests.get('https://secondhandsongs.com/performance/30050')
print(r.status_code)
soup = BeautifulSoup(r.text, 'html.parser')
soup.body

200


<body class="">
<div class="loading">
<div class="loader"></div>
</div>
<input name="_ds" type="hidden" value="0"/>
<section id="header">
<header class="container">
<div class="row"> <div class="col-lg-4 col-sm-5 hidden-xs">
<a class="logo" href="/">
<img alt="Second Hand Songs - A Cover Songs Database" src="/art/logo.png"/>
</a>
</div>
<nav class="col-lg-8 col-sm-7" id="top-menu" role="navigation">
<ul class="nav navbar-nav navbar-right" id="top-menu-navbar-collapse">
<li class="visible-xs"><a class="logo" href="/"><img alt="Second Hand Songs - A Cover Songs Database" src="/art/logo.png"/></a>
</li>
<li><a href="/explore">Explore</a></li>
<li><a href="/forum">Discuss
                                                    </a></li>
<li>
<a href="/participate">Participate
                                                    </a>
</li>
<li>
<a href="/quiz">Play </a>
</li>
<li class="accent"><a href="/login?ref=%2Fperformance%2F30050">Sign
                                In</a></li>
<li class

In [87]:
b=soup.find('section',attrs={'class':'work-originals'})

In [88]:
b

<section class="work-originals">
<h2 class="section-title" id="t1">Originals</h2>
<article class="clearfix" itemprop="recordingOf" itemscope="" itemtype="http://schema.org/MusicComposition">
<h2 class="divider"><a class="link-work" href="/work/10009" itemprop="url"><span itemprop="name">Willie and the Hand Jive</span></a></h2>
<section class="row">
<section class="col-sm-3" itemprop="recordedAs" itemscope="" itemtype="http://schema.org/MusicRecording">
<div class="media">
<a class="pull-left" href="/artist/6808">
<img alt="The Johnny Otis Show" src="/picture/150382/tiny"/>
</a>
<div class="media-body">
<p><a href="/performance/10009" itemprop="url"><strong>First release</strong></a>
<em>by</em> <span itemprop="byArtist" itemscope="" itemtype="http://schema.org/MusicGroup"><a class="link-performer" href="/artist/6808" itemprop="url"><span itemprop="name">The Johnny Otis Show</span></a></span>
                                                                                            (Ap