# Project story
### Follow the evolution of "My way"

#### Plan
- 14 covers in our dataset and 289 on SHS website
- Web scraping of the covers we don't have
- Have at least Country/Year/Language/Artist/Title for each song


In [47]:
# Import libraries
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from io import StringIO
import sys
import requests
from bs4 import BeautifulSoup
import pickle
import os
import glob
from pathlib import Path

In [None]:
# Clique for "My Way"
covers.loc[1190]

In [48]:
r=requests.get('https://secondhandsongs.com/work/4250/versions#nav-entity')
soup = BeautifulSoup(r.text, 'html.parser')


In [49]:
len(soup.find_all('table'))

27

In [50]:
soup.find_all('table')[0].find('tbody').find_all('tr')[15]

<tr itemprop="recordedAs" itemscope="" itemtype="http://schema.org/MusicRecording">
<td class="field-icon"></td>
<td class="field-title"><a class="link-performance" href="/performance/491567/versions" itemprop="url"><span itemprop="name">Comme d'habitude</span></a></td>
<td class="field-performer"><span itemprop="byArtist" itemscope="" itemtype="http://schema.org/MusicGroup"><a class="link-performer" href="/artist/26302" itemprop="url"><span itemprop="name">M. Pokora</span></a></span></td>
<td class="field-date">October 21, 2016</td>
<td class="field-info"></td>
</tr>

In [51]:
soup.find_all('table')[17].find('tbody').find_all('tr')[0].find('td',attrs={'class':'field-performer'}).find('a')['href'].split('/')[2]

'41140'

In [52]:
# Functions that look up, for each "My Way" version, its language, its release date and the artist's nationality
def myWay_versions(table_id) :
    """Collect metadata for every version listed in one table of the SHS page.

    Reads the module-level ``soup`` (the parsed versions page), walks the rows
    of table number ``table_id`` and calls ``get_info`` once per row.

    Parameters
    ----------
    table_id : int
        Position of the table in ``soup.find_all('table')``.

    Returns
    -------
    list of list
        One ``[artist_name, song_name, perfLanguage, perfDate,
        artist_nationality]`` entry per table row.
    """
    songs_table=[]
    # Locate the rows once instead of searching the whole document again for each row
    rows=soup.find_all('table')[table_id].find('tbody').find_all('tr')

    for elem_sect in rows : # Go through all the tracks contained in the table
        song_sect=elem_sect.find('td',attrs={'class':'field-title'})
        artist_sect=elem_sect.find('td',attrs={'class':'field-performer'})
        info_sect=elem_sect.find('td',attrs={'class':'field-info'})

        # The info cell text selects which URL prefix get_info will request
        if info_sect.text=='Submitted' :
            status='case'
        else :
            status='performance'

        artist_link=artist_sect.find('a')

        if artist_link is None :
            artist_id='Missing'
        else :
            # href looks like '/artist/<id>'; featurings join several ids with '+',
            # in which case only the first one is kept
            artist_id=int(artist_link['href'].split('/')[2].split('+')[0])

        artist_name=artist_sect.text
        song_id=int(song_sect.find('a')['href'].split('/')[2])
        song_name=song_sect.text
        perfLanguage, perfDate, artist_nationality = get_info(song_id,artist_id,status)
        songs_table.append([artist_name, song_name, perfLanguage, perfDate, artist_nationality])

    return songs_table
    
    
def get_info(song_id,artist_id,status) :
    """Scrape language, release date and artist nationality from secondhandsongs.com.

    Parameters
    ----------
    song_id : int
        Identifier appended to the ``status`` URL prefix to build the song page URL.
    artist_id : int or str
        Artist identifier, or the string ``'Missing'`` to skip the artist lookup.
    status : str
        URL path segment of the song page (``'performance'`` or ``'case'`` as
        passed by ``myWay_versions``).

    Returns
    -------
    tuple of str
        ``(perfLanguage, perfDate, artist_nationality)``; any value that cannot
        be located on the page is the string ``'Missing'``.
    """
    # Web scraping of the song page (language and release date of the track)
    song_request = requests.get('https://secondhandsongs.com/'+status+'/'+str(song_id)) # Access to the song page on SHS

    song_soup = BeautifulSoup(song_request.text, 'html.parser')
    perfMeta=song_soup.find('dl',attrs={'class':'dl-horizontal'})

    if perfMeta is None :
        perfLanguage='Missing'
        perfDate='Missing'
    else :
        # find_all always returns a list (never None), so its length is what
        # must be checked before using the positional fallbacks below
        dd_cells=perfMeta.find_all('dd')

        # Extract language
        language_tag=perfMeta.find('dd',attrs={'itemprop':'inLanguage'})
        if language_tag is not None :
            perfLanguage=language_tag.text
        elif len(dd_cells)<4 :
            perfLanguage='Missing'
        else :
            # Fallback on the <dd> cells at positions 3 and 2: keep whichever
            # is the only non-empty one, otherwise give up
            perfLanguage1=dd_cells[3].text.replace(" ","").strip('\n')
            perfLanguage2=dd_cells[2].text.replace(" ","").strip('\n')

            if perfLanguage1=='' and perfLanguage2!='' :
                perfLanguage=perfLanguage2
            elif perfLanguage2=='' and perfLanguage1!='' :
                perfLanguage=perfLanguage1
            else :
                perfLanguage='Missing'

        # Extract released date
        date_block=perfMeta.find('div',attrs={'class':'media-body'})
        if date_block is not None :
            perfDate=date_block.find('p').text.split('\n')[2].strip(' ')
        elif len(dd_cells)<7 :
            perfDate='Missing'
        else :
            perfDate=dd_cells[6].text

    # Web scraping of the artist page (nationality of the artist)
    if artist_id=='Missing':
        artist_nationality='Missing'
    else :
        artist_request = requests.get('https://secondhandsongs.com/artist/'+str(artist_id)) # Access to the artist page on SHS
        artist_soup = BeautifulSoup(artist_request.text, 'html.parser')
        artist_meta=artist_soup.find('dl',attrs={'class':'dl-horizontal'})

        # A page without the metadata list is treated like a missing nationality
        if artist_meta is None :
            nationality_tag=None
        else :
            nationality_tag=artist_meta.find('dd',attrs={'itemprop':'nationality'})

        if nationality_tag is None :
            artist_nationality='Missing'
        else :
            # Only the last word is kept, so multi-word countries come out
            # truncated (e.g. 'Kingdom', 'States') and are renamed after scraping
            artist_nationality=nationality_tag.text.split(' ')[-1].rstrip('\n')

    return perfLanguage, perfDate, artist_nationality
    

In [53]:
myWay=pd.DataFrame()

In [54]:
#for i in range(0,len(soup.find_all('table'))):
#    myWay=myWay.append(myWay_versions(i))

In [75]:
#pickle.dump(myWay,open('data/song_story.p','wb'))
# Load the previously scraped table; the context manager closes the file handle
with open('song_story.p','rb') as f:
    myWay=pickle.load(f)

In [71]:
# Fix country names truncated by the scraper (only the last word of the nationality was kept)
myWay.replace(['Kingdom','States'],['United Kingdom','United States'],inplace=True)

In [72]:
# Rename columns
myWay.columns=['artist','title','language','date','location']
myWay.reset_index(drop=True,inplace=True)

In [73]:
# The year is the last four characters of the date. Rows whose date is the
# 'Missing' placeholder keep that placeholder (a plain slice would yield 'sing'
# and hide them from the missing-value counts).
myWay['year']=myWay.date.str[-4:].where(myWay.date!='Missing','Missing')
myWay.drop('date',axis=1,inplace=True)

In [78]:
myWay.head()

Unnamed: 0,artist,title,language,location,year
0,Claude François,Comme d'habitude,French,France,1967
1,Michel Pagliaro,Comme d'habitude,French,Canada,1968
2,Michel Sardou,Comme d'habitude,French,France,1978
3,Hervé Vilard,Comme d'habitude,French,France,1984
4,Mireille Mathieu,Comme d'habitude,French,France,1985


In [79]:
# Report, column by column, how many rows still hold the 'Missing' placeholder
for message, column in [
    ('Number of missing artists : ', 'artist'),
    ('Number of missing titles : ', 'title'),
    ('Number of missing languages : ', 'language'),
    ('Number of missing years : ', 'year'),
    ('Number of missing locations : ', 'location'),
]:
    placeholder_rows = myWay[myWay[column]=='Missing']
    print(message, len(placeholder_rows))

Number of missing artists :  0
Number of missing titles :  0
Number of missing languages :  0
Number of missing years :  0
Number of missing locations :  0


In [80]:
myWay[myWay.language=='Missing']

Unnamed: 0,artist,title,language,location,year


In [62]:
myWay.loc[28, 'language'] = 'Instrumental'

In [63]:
#Remove weird covers
myWay.drop(157,axis=0,inplace=True)

In [64]:
#Remove [1] in artist names
# DataFrame.replace only matches cells that are exactly '[1]', so the marker is
# removed as a literal substring of the artist column instead; the final strip
# drops the whitespace left behind next to the removed marker.
myWay['artist']=myWay['artist'].str.replace('[1]','',regex=False).str.strip()

In [None]:
# Manual corrections, addressed by row label: assign a country to the listed
# rows and overwrite the artist name of rows 72 and 156.
myWay.loc[[5,180],'location'] = 'Italy'
myWay.loc[[9,236],'location'] = 'Switzerland'
myWay.loc[[11],'location'] = 'Belgium'
myWay.loc[[25,57,145,153],'location'] = 'United Kingdom'
myWay.loc[[40,277],'location'] = 'France'
myWay.loc[[60],'location'] = 'Poland'
myWay.loc[[70],'location'] = 'Canada'
myWay.loc[[72,188,221,250,278],'location'] = 'United States'
myWay.loc[[72],'artist'] = 'Francky Perez'
myWay.loc[[115,247],'location'] ='Czech Republic'
myWay.loc[[156],'location'] ='Serbia'
myWay.loc[[156],'artist']='Boba Stefanović'
myWay.loc[[174,248],'location'] ='Spain'
myWay.loc[[191],'location'] ='Brazil'
myWay.loc[[208],'location'] ='Australia'
myWay.loc[[218],'location'] ='Hungary'
myWay.loc[[265,270,286],'location'] ='Germany'
myWay.loc[[275],'location'] ='Russia'
myWay.loc[[288],'location'] ='Austria'

In [82]:
myWay.drop(myWay[myWay.location=='Missing'].index,axis=0,inplace=True)

In [83]:
myWay[myWay.location=='Missing']

Unnamed: 0,artist,title,language,location,year


In [138]:
#pickle.dump(myWay,open('song_story.p','wb'))
# Reload the saved table; the context manager closes the file handle
with open('song_story.p','rb') as f:
    myWay=pickle.load(f)

In [171]:
myWay.sort_values('year').tail()

Unnamed: 0_level_0,artist,title,language,location,year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
244,Ben L'Oncle Soul,My Way,English,France,2016
245,Jim Armstrong,My Way,English,United Kingdom,2016
246,Seth MacFarlane,My Way,English,United States,2016
256,Masa Mainds,Minun tieni,Finnish,Finland,2016
15,M. Pokora,Comme d'habitude,French,France,2016


In [140]:
myWay.loc[[65,68,258,250],'location'] = 'Croatia'
myWay.loc[[205,212,233,225],'location'] = 'New Zealand'

In [141]:
myWay.loc[205]

artist        Ray Quinn
title            My Way
language        English
location    New Zealand
year               2007
Name: 205, dtype: object

In [156]:
myWay[['title','year']].groupby('year').agg('count').head()

Unnamed: 0_level_0,title
year,Unnamed: 1_level_1
1967,1
1968,2
1969,21
1970,20
1971,15


In [157]:
myWay[myWay.location=='Zealand']

Unnamed: 0_level_0,artist,title,language,location,year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [158]:
#http://gothos.info/resources/
country_info=pd.read_csv('country_centroids_all.csv',sep='\t')

In [159]:
country_info=country_info[['LAT','LONG','SHORT_NAME']]

In [160]:
country_info.head()

Unnamed: 0,LAT,LONG,SHORT_NAME
0,33.0,66.0,Afghanistan
1,41.0,20.0,Albania
2,28.0,3.0,Algeria
3,-14.333333,-170.0,American Samoa
4,42.5,1.5,Andorra


In [161]:
country_info[country_info.SHORT_NAME.str.contains('Zealand')]

Unnamed: 0,LAT,LONG,SHORT_NAME
163,-42.0,174.0,New Zealand


In [162]:
merged_df=myWay.merge(country_info, how='left',left_on='location',right_on='SHORT_NAME')

In [163]:
merged_df[merged_df.SHORT_NAME.isnull()]

Unnamed: 0,artist,title,language,location,year,LAT,LONG,SHORT_NAME


In [168]:
merged_df.index.rename('index',inplace=True)

In [169]:
# Write inside a context manager so the file is flushed and closed once the dump is done
with open('merged_centroids.p','wb') as f:
    pickle.dump(merged_df,f)

In [170]:
merged_df.to_csv('song_story.csv')