# PROJECT - *My Way* of seeing music covers
#### Pierre-Antoine Desplaces, Anaïs Ladoy, Lou Richard

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from io import StringIO
import sys
import requests
from bs4 import BeautifulSoup

## Data organisation 
- All the additional files giving the metadata of the Million Song Dataset (MSD) were downloaded from the cluster. They will help us to elaborate a plan; a script will then look up more information about specific tracks (h5 files in the cluster), possibly using the cluster CPUs. The path to a track in the cluster is, for example, million-songs/data/A/A/A (the 3 letters at the end being the 3rd, 4th and 5th letters of the track ID).
- The music covers will be detected using another dataset (SecondHandSongs, SHS). We can either use the downloadable dataset containing 18,196 tracks (all linked to the MSD dataset), or web-scrape the SHS website (https://secondhandsongs.com/), which holds much more information (522 436 covers) but is not necessarily linked to our MSD dataset. The SHS API is RESTful (it returns a JSON object) and is limited to 100 requests per minute and 1000 requests per hour, but we can contact them to have the limitation removed.
- Some artists (30% of all MSD artists) are geolocated in the artist_location dataframe.

In [None]:
# Read the Million Song Dataset "additional files".
# Every file is '<SEP>'-separated and has no header row, so the column names are
# supplied here; a multi-character separator needs the python parsing engine.
tracks_per_year = pd.read_csv(
    'data/AdditionalFiles/tracks_per_year.txt',
    sep='<SEP>',
    engine='python',
    header=None,
    names=['year', 'trackID', 'artist', 'title'],
    index_col=1,  # second column, i.e. trackID
)
unique_tracks = pd.read_csv(
    'data/AdditionalFiles/unique_tracks.txt',
    sep='<SEP>',
    engine='python',
    header=None,
    names=['trackID', 'songID', 'artist', 'title'],
    index_col=0,  # trackID
)
unique_artists = pd.read_csv(
    'data/AdditionalFiles/unique_artists.txt',
    sep='<SEP>',
    engine='python',
    header=None,
    names=['artistID', 'artistMID', 'randomTrack', 'name'],
    index_col=0,  # artistID
)
artist_location = pd.read_csv(
    'data/AdditionalFiles/artist_location.txt',
    sep='<SEP>',
    engine='python',
    header=None,
    names=['artistID', 'lat', 'long', 'name', 'location'],
    index_col=0,  # artistID
)

In [None]:
# For every additional-file dataframe, report whether its index is free of
# duplicates and how many rows it holds.
print('Dataframe (Unique index, Number of elements)')
for label, frame in (
    ('tracks_per_year ', tracks_per_year),
    ('unique_tracks ', unique_tracks),
    ('unique_artists ', unique_artists),
    ('artist_location ', artist_location),
):
    print(label, (frame.index.is_unique, frame.shape[0]))

In [None]:
# Preview the first rows of tracks_per_year (indexed by trackID)
tracks_per_year.head()

In [None]:
# Preview the first rows of unique_tracks (indexed by trackID)
unique_tracks.head()

In [None]:
# Shape of the array of distinct values in the 'artist' column of unique_tracks,
# i.e. a 1-tuple holding the number of distinct artist names
unique_tracks.artist.unique().shape

In [None]:
# Preview the first rows of unique_artists (indexed by artistID)
unique_artists.head()

In [None]:
# Preview the first rows of unique_artists
# NOTE(review): this cell repeats the unique_artists preview shown just before it - confirm whether another dataframe was intended
unique_artists.head()

In [None]:
# Preview the first rows of artist_location (indexed by artistID)
artist_location.head()

In [None]:
def read_shs_files(pathToFile):
    """Parse a SecondHandSongs clique file into a dataframe.

    Lines starting with '%' are treated as clique headers. Every other
    non-blank line is treated as a '<SEP>'-separated record and is prefixed
    with the most recent header, so that each record carries its clique.

    Parameters
    ----------
    pathToFile : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record, with columns 'shsID' (the raw header line,
        leading '%' included), 'trackID', 'artistID' and 'shsPerf'.

    Notes
    -----
    If a record is met before any header line, 'NO ID found' is printed and
    ``sys.exit(1)`` is called.
    """
    buffer = StringIO()
    cur_ID = None
    # The context manager guarantees the file handle is released, including
    # on the early-exit path below.
    with open(pathToFile) as f:
        for ln in f:
            # Skip blank lines
            if not ln.strip():
                continue
            # Header line: remember it, turning its newline into a field separator
            if ln.startswith('%'):
                cur_ID = ln.replace('\n', '<SEP>', 1)
                continue
            if cur_ID is None:
                print('NO ID found')
                sys.exit(1)
            buffer.write(cur_ID + ln)
    buffer.seek(0)
    return pd.read_csv(
        buffer,
        delimiter='<SEP>',
        engine='python',
        header=None,
        names=['shsID', 'trackID', 'artistID', 'shsPerf'],
    )

In [None]:
# Load the two SHS files (the SHS data is split into a train and a test set,
# usable for ML if wanted) and stack them into a single dataframe.
SHS_testset = read_shs_files('data/SHS_testset.txt')
SHS_trainset = read_shs_files('data/SHS_trainset.txt')
# No ignore_index here: each part keeps its own row labels in the result.
covers = pd.concat((SHS_testset, SHS_trainset))
covers.shape

In [None]:
# Remove '%' characters from both ends of every shsID string
covers['shsID'] = covers['shsID'].str.strip('%')

In [None]:
# Preview the first rows of covers
covers.head()

In [None]:
# Turn each distinct shsID into an integer clique id (the category codes of
# the column), then drop the now redundant shsID column.
clique_codes = covers.shsID.astype('category').cat.codes
covers = covers.assign(clique_id=clique_codes).drop('shsID', axis=1)
# shsPerf is kept on purpose: later cells filter on it.
# Left-join the lookup tables on their index to attach, in this order:
#   - the artist name (only the artist assigned to the track; featurings are not considered),
#   - the track title,
#   - the year of the track.
for lookup, column, key in (
    (unique_artists, 'name', 'artistID'),
    (unique_tracks, 'title', 'trackID'),
    (tracks_per_year, 'year', 'trackID'),
):
    covers = covers.merge(lookup[[column]], how='left', left_on=key, right_index=True)

In [None]:
# Preview the first rows of covers
covers.head()

In [None]:
# Summary statistics of the covers dataframe.
# The clique count is taken from the clique_id column: the dataframe index
# still holds the row labels inherited from the raw SHS files (clique_id was
# never set as index), so max(index)+1 does not count cliques.
print('Number of cliques :', covers.clique_id.nunique())
print('Number of unique tracks :', len(covers.trackID.unique()))
print('Number of unique artists :', len(covers.artistID.unique()))
print('Number of missing trackID :', covers.trackID.isnull().sum())
print('Number of missing artistID :', covers.artistID.isnull().sum())
print('Number of missing years :', covers.year.isnull().sum())

In [None]:
# Order the rows by clique, then by year (both ascending, the default).
# reset_index() without drop keeps the previous row labels as an 'index' column.
covers = covers.sort_values(by=['clique_id', 'year']).reset_index()

In [None]:
# Preview the last rows of covers
covers.tail()

In [None]:
# Rows that have no year and whose shsPerf is negative
covers.loc[covers['year'].isna() & covers['shsPerf'].lt(0)]

In [None]:
# Split the rows lacking a year by whether their shsPerf is usable.
# A negative shsPerf is treated as invalid, the same criterion used by the
# filter on missing years and by perfInfo_SHS (the previous `!= -1` test
# would have counted any other negative value as valid).
missing_year = covers.year.isnull()
invalid_perf = covers.shsPerf < 0
print('Number of missing years with valid shsPerf (API request on the performance page) :',(missing_year & ~invalid_perf).sum())
print('Number of missing years with invalid shsPerf (API request on the search page to find shsPerf) :',(missing_year & invalid_perf).sum())

We need to find the missing years in order to rank the songs within each clique and thus identify the original song and the covers that followed. Since the year alone is not necessarily sufficient to discriminate between the songs (a cover sometimes appears in the same year as the original), it would be better to have the full release date for ALL the tracks, if that information is available on the SHS website.

We need the shsPerf to access the song page on the SHS website, where we can find information about the language and the release date of the song. In the dataset, negative values of shsPerf are considered missing values.

Two ways of doing it :
- For a valid SHS performance ID, access the performance page (e.g. 'https://secondhandsongs.com/performance/1983') and web-scrape the language and release date using the perfInfo_SHS() function.
- For an invalid SHS performance ID, send an API request to the search page (e.g. 'https://secondhandsongs.com/search/performance?title=blackbird&performer=beatles'), extract the performance ID with find_PerfID() and then use the perfInfo_SHS() function.

In [None]:
#Compute the order of songs for each clique
#covers['rank']=covers.groupby('clique_id')['year'].rank(method='dense',ascending=True).astype('int')

In [None]:
#covers.set_index('clique_id',inplace=True)
#covers.drop('index',axis=1,inplace=True)

In [None]:
# Preview the first rows of covers
covers.head()

To handle :
- Find the missing years using API requests to the SHS website (if the website has no information about the year, find another solution)
- Find a way to detect the original song when the first cover was released in the same year as the original (this also covers the case where two covers were made during the same year and therefore get the same rank, which later causes a problem with the multi-index because it is no longer unique)
- API request to SHS website for the location and the language

In [None]:
#API request to SHS website for the page of a specific performance (defined as shsPerf) to extract Language and Date
def perfInfo_SHS(shsPerf, timeout=30):
    """Scrape the language and the date shown on a SecondHandSongs performance page.

    Parameters
    ----------
    shsPerf : int or float
        SHS performance id. Any value that is not strictly positive
        (negative, zero or NaN) is treated as "no id available".
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises a
        timeout error, so that one stalled request cannot block a whole run.

    Returns
    -------
    tuple of (str, str)
        ``(language, date)``. Both are 'Unavailable' when no usable id is
        given; each is 'Missing' when the page does not contain the
        corresponding element.
    """
    # `not shsPerf > 0` also catches 0 and NaN, which previously matched
    # neither branch and ended in an UnboundLocalError at the return.
    if not shsPerf > 0:
        return 'Unavailable', 'Unavailable'

    # int() keeps a float id such as 1983.0 from producing a '.../1983.0' URL
    r = requests.get('https://secondhandsongs.com/performance/' + str(int(shsPerf)),
                     timeout=timeout)
    print(r.status_code)
    soup = BeautifulSoup(r.text, 'html.parser')

    # The metadata is read from the first <dl> of the page; a page without
    # one (e.g. an error page) yields 'Missing' instead of an IndexError.
    perfMeta = soup.find('dl')
    if perfMeta is None:
        return 'Missing', 'Missing'

    languageTag = perfMeta.find('dd', attrs={'itemprop': 'inLanguage'})
    perfLanguage = 'Missing' if languageTag is None else languageTag.text

    # The date is taken from the third line of the first <p> inside the
    # 'media-body' block; any missing piece yields 'Missing'.
    perfDate = 'Missing'
    mediaBody = perfMeta.find('div', attrs={'class': 'media-body'})
    paragraph = mediaBody.find('p') if mediaBody is not None else None
    if paragraph is not None:
        textLines = paragraph.text.split('\n')
        if len(textLines) > 2:
            perfDate = textLines[2].strip(' ')

    return perfLanguage, perfDate

In [None]:
#language=perfInfo_SHS(-1)[0]
#date=perfInfo_SHS(covers.iloc[18193].shsPerf)[1]

In [None]:
# Draw 20 random rows of covers (no random_state is passed, so the selection changes from run to run)
test=covers.sample(n=20)

In [None]:
#test['language'], \
#test['date'] = zip(*test.shsPerf.map(perfInfo_SHS))

In [None]:
# Display the sampled rows
test

Web-scraping - one part of the dataframe each hour (the limit is 1000 requests per hour)

In [None]:
# First 800 rows of covers, taken as an independent copy so that columns can be
# added to `part` afterwards without writing into a slice of covers.
part = covers.iloc[0:800].copy()

In [None]:
#part['language'], \
#part['date'] = zip(*part.shsPerf.map(perfInfo_SHS))

In [None]:
# Save this part to CSV (the file name carries the row range 0-800)
part.to_csv('data/partSHS_webscraping/part_0_800.csv')