In [1]:
#importing the neccessary libraries we will need to run our data and analysis

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import sys
import csv

from googletrans import Translator 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
import json
from pprint import pprint 

import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.oauth2 as oauth2

from config import lastfm_api_key
from config import CLIENT_ID, CLIENT_SECRET

In [2]:
#list of countries we narrow down to sort through our data

countries = ['hk', 'nl', 'au', 'ca', 'fr', 'jp', 'gb', 'de', 'us' ]

In [3]:
#create a dataframe that takes in all the top 5 songs from each country on our list

song_list = []
artist_list = []
album_list = []
country_list = []
genre_list = []
rank = []

for country in countries:
    top_list = []
    music_url = f"https://rss.itunes.apple.com/api/v1/{country}/apple-music/top-songs/all/5/explicit.json"
    
    try:
        response = requests.get(music_url).json()
        top_list.append(response["feed"]["results"])
        country_df = pd.DataFrame(top_list)
        for x in range (0,5):
            rank.append(x+1)
            song_list.append(country_df[x][0]['name'])
            country_list.append(country)
            artist_list.append(country_df[x][0]['artistName'])
            album_list.append(country_df[x][0]["collectionName"])
            genre_list.append(country_df[x][0]["genres"][0]["name"])
        
    except:
        print(f"Can't find {country}. Skipping... ")
apple_top_df = pd.DataFrame({
    "Country" : country_list,
    "Name" : song_list,
    "Artist" : artist_list,
    "Album" : album_list,
    "Genre" : genre_list,
    "Rank" : rank
})

apple_top_df['Country'] = apple_top_df['Country'].replace({'hk': 'Hong Kong',
                           'nl': 'Netherlands',
                           'au': 'Australia',
                           'ca': 'Canada',
                           'fr': 'France',
                           'jp': 'Japan',
                           'gb': 'UK',
                           'de': 'Germany',
                           'us': 'US'
                          })
apple_top_df['Source'] = 'Apple'
apple_top_df.head()

Unnamed: 0,Country,Name,Artist,Album,Genre,Rank,Source
0,Hong Kong,Señorita,Shawn Mendes & Camila Cabello,Señorita - Single,流行樂,1,Apple
1,Hong Kong,Into the Unknown,Idina Menzel & AURORA,Frozen 2 (Original Motion Picture Soundtrack /...,原聲帶,2,Apple
2,Hong Kong,說好不哭,周杰倫 & 阿信,說好不哭 - Single,國語流行樂,3,Apple
3,Hong Kong,Show Yourself,Idina Menzel & Evan Rachel Wood,Frozen 2 (Original Motion Picture Soundtrack /...,原聲帶,4,Apple
4,Hong Kong,讓愛高飛 (劇集《多功能老婆》片尾曲),周柏豪,讓愛高飛 (劇集《多功能老婆》片尾曲) - Single,廣東歌,5,Apple


In [4]:
#cleaning the dataframe to be more readable 

translator = Translator()

apple_top_df['Genre'] = apple_top_df['Genre'].apply(translator.translate, dest='en').apply(getattr, args=('text',))
apple_top_df['Genre'] = apple_top_df['Genre'].replace({'pop music' : 'Pop', 
                                                                   'Mandarin pop music' : 'Pop',
                                                                   'Alternative Music' : 'Alternative',
                                                                   'R & B, soul' : 'R&B/Soul',
                                                                   'priest' : 'Priest'
                                                                })
apple_top_df

Unnamed: 0,Country,Name,Artist,Album,Genre,Rank,Source
0,Hong Kong,Señorita,Shawn Mendes & Camila Cabello,Señorita - Single,Pop,1,Apple
1,Hong Kong,Into the Unknown,Idina Menzel & AURORA,Frozen 2 (Original Motion Picture Soundtrack /...,Soundtrack,2,Apple
2,Hong Kong,說好不哭,周杰倫 & 阿信,說好不哭 - Single,Pop,3,Apple
3,Hong Kong,Show Yourself,Idina Menzel & Evan Rachel Wood,Frozen 2 (Original Motion Picture Soundtrack /...,Soundtrack,4,Apple
4,Hong Kong,讓愛高飛 (劇集《多功能老婆》片尾曲),周柏豪,讓愛高飛 (劇集《多功能老婆》片尾曲) - Single,Guangdong song,5,Apple
5,Netherlands,Dance Monkey,Tones and I,Dance Monkey - Single,Alternative,1,Apple
6,Netherlands,Lucid Dreams,Juice WRLD,Goodbye & Good Riddance,Hiphop / Rap,2,Apple
7,Netherlands,everything i wanted,Billie Eilish,everything i wanted - Single,Alternative,3,Apple
8,Netherlands,Blinding Lights,The Weeknd,Blinding Lights - Single,R&B/Soul,4,Apple
9,Netherlands,All I Want For Christmas Is You,Mariah Carey,Merry Christmas,Holidays,5,Apple


In [5]:
#start of lastFM data search

jsonFormat = '&format=json'

url = 'http://ws.audioscrobbler.com/2.0/?method=chart.gettoptracks'
url_end = '&api_key=' + lastfm_api_key + jsonFormat

In [6]:
limit = '5'
countries = ["Hong Kong", "Netherlands", "Australia", 
             "Canada", "France", "Japan", "united kingdom", "Germany", "united states"]
song_list = []
artist_list = []
country_list = []
rank = []

for country in countries:
    top_list = []
    urlgeo = 'http://ws.audioscrobbler.com/2.0/?method=geo.gettoptracks&limit=' + limit + '&country=' + country + url_end
    try:
        lastfm_response = requests.get(urlgeo).json()
        top_list.append(lastfm_response["tracks"]["track"])
        temp_df = pd.DataFrame(top_list)
        for x in range (0,5):
            rank.append(x+1)
            song_list.append(temp_df[x][0]['name'])
            country_list.append(country)
            artist_list.append(temp_df[x][0]['artist']['name'])
    except:
        print(f"Couldn't find {country}")

In [7]:
lastfm_top_df=pd.DataFrame({
    "Country" : country_list,
    "Artist" : artist_list,
    "Name" : song_list,
    "Rank" : rank
})

lastfm_top_df["Source"] = "LastFM"

lastfm_top_df.head()

Unnamed: 0,Country,Artist,Name,Rank,Source
0,Hong Kong,Adele,Hello,1,LastFM
1,Hong Kong,The Weeknd,Can't Feel My Face,2,LastFM
2,Hong Kong,Ed Sheeran,Photograph,3,LastFM
3,Hong Kong,Ed Sheeran,Thinking Out Loud,4,LastFM
4,Hong Kong,Ed Sheeran,Shape of You,5,LastFM


In [10]:
albums = []
for index, row in lastfm_top_df.iterrows():
    track_url = f"http://ws.audioscrobbler.com/2.0/?method=track.getInfo&artist={row['Artist']}&track={row['Name']}&autocorrect=1" + url_end
    lastfm_response = requests.get(track_url).json()
    albums.append(lastfm_response['track']['album']['title'])

In [11]:
lastfm_response['track']['album']['title']

'25'

In [12]:
lastfm_top_df["Album"] = albums

In [13]:
lastfm_top_df.replace(to_replace="united kingdom", value="UK", inplace=True)
lastfm_top_df.replace(to_replace="united states", value="US", inplace=True)

In [14]:
#start of spotify data search

spotify = spotipy.Spotify()
credentials = oauth2.SpotifyClientCredentials(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET)

token = credentials.get_access_token()
spotify = spotipy.Spotify(auth=token)

In [15]:
spotify =spotipy.Spotify(auth=token)
sp1=spotify.search(q="year:2019",type = "track",limit=10,market="us")\

song = []
album = []
pop=[]
artist=[]
for i,t in enumerate (sp1["tracks"]["items"]):
    name_1=(i,t["name"])
    album_1=(t["album"]["name"])
    song.append(name_1[1])
    album.append(album_1)
    pop.append(t["popularity"])
    artist.append(t["artists"][0]["name"])
final = {
    "Name": song,
    "Album":album,
    "Popularity":pop,
    "Artist":artist
}

df = pd.DataFrame(final)
new_df=df.sort_values(["Popularity"],ascending=False)

new_df

Unnamed: 0,Name,Album,Popularity,Artist
8,Memories,Memories,100,Maroon 5
0,Circles,Hollywood's Bleeding,99,Post Malone
1,HIGHEST IN THE ROOM,HIGHEST IN THE ROOM,98,Travis Scott
4,everything i wanted,everything i wanted,98,Billie Eilish
5,Lose You To Love Me,Lose You To Love Me,98,Selena Gomez
2,ROXANNE,ROXANNE,97,Arizona Zervas
6,hot girl bummer,hot girl bummer,96,blackbear
3,Bandit (with YoungBoy Never Broke Again),Bandit (with YoungBoy Never Broke Again),93,Juice WRLD
7,BOP,KIRK,91,DaBaby
9,223's (feat. 9lokknine),223's (feat. 9lokknine),90,YNW Melly


In [16]:
playlist_name = []
p_id = []
owners_p=[]

playlists=spotify.search(q="Top 50",type = "playlist",limit=50)
for x,y in enumerate(playlists["playlists"]["items"]):
    play_name = (x,y["name"])
    playlist_id = y["uri"]
    owners = (y["owner"]["display_name"])
    #owner_final=owners[]
    if owners == "spotifycharts":
        playlist_name.append(play_name[1])
        p_id.append(playlist_id)
        owners_p.append(owners)

play_final = {
    "Playlist Name":playlist_name,
    "Playlist_ID":p_id,
    "Owners":owners_p
}
playlist_df = pd.DataFrame(play_final)
playlist_df

Unnamed: 0,Playlist Name,Playlist_ID,Owners
0,Global Top 50,spotify:playlist:37i9dQZEVXbMDoHDwVN2tF,spotifycharts
1,Brazil Top 50,spotify:playlist:37i9dQZEVXbMXbN3EUUhlg,spotifycharts
2,Spain Top 50,spotify:playlist:37i9dQZEVXbNFJfN1Vw8d9,spotifycharts
3,Germany Top 50,spotify:playlist:37i9dQZEVXbJiZcmkrIHGU,spotifycharts
4,Mexico Top 50,spotify:playlist:37i9dQZEVXbO3qyFxbkOE1,spotifycharts
5,Netherlands Top 50,spotify:playlist:37i9dQZEVXbKCF6dqVpDkS,spotifycharts
6,Argentina Top 50,spotify:playlist:37i9dQZEVXbMMy2roB9myp,spotifycharts
7,Australia Top 50,spotify:playlist:37i9dQZEVXbJPcfkRz0wJ0,spotifycharts
8,Italy Top 50,spotify:playlist:37i9dQZEVXbIQnj7RRhdSX,spotifycharts
9,Japan Top 50,spotify:playlist:37i9dQZEVXbKXQ4mDTEBXq,spotifycharts


In [17]:
playlist_name_t = []
p_id_t = []
owners_p_t=[]

playlists=spotify.search(q="Hong Kong Top 50",type = "playlist",limit=50)
for x,y in enumerate(playlists["playlists"]["items"]):
    play_name = (x,y["name"])
    playlist_id = y["uri"]
    owners = (y["owner"]["display_name"])
    #owner_final=owners[]
    if owners == "spotifycharts":
        playlist_name_t.append(play_name[1])
        p_id_t.append(playlist_id)
        owners_p_t.append(owners)

play_final_t = {
    "Playlist Name":playlist_name_t,
    "Playlist_ID":p_id_t,
    "Owners":owners_p_t
}
playlist_df_t = pd.DataFrame(play_final_t)
playlist_df_t.tail()

Unnamed: 0,Playlist Name,Playlist_ID,Owners
0,Hong Kong Top 50,spotify:playlist:37i9dQZEVXbLwpL8TjsxOG,spotifycharts
1,Hong Kong Viral 50,spotify:playlist:37i9dQZEVXbKXd6qahcpCg,spotifycharts


In [18]:
final_playists_df = pd.concat([playlist_df,playlist_df_t])
final_playists_df.reset_index(inplace=True, drop=True)
final_playists_df

Unnamed: 0,Playlist Name,Playlist_ID,Owners
0,Global Top 50,spotify:playlist:37i9dQZEVXbMDoHDwVN2tF,spotifycharts
1,Brazil Top 50,spotify:playlist:37i9dQZEVXbMXbN3EUUhlg,spotifycharts
2,Spain Top 50,spotify:playlist:37i9dQZEVXbNFJfN1Vw8d9,spotifycharts
3,Germany Top 50,spotify:playlist:37i9dQZEVXbJiZcmkrIHGU,spotifycharts
4,Mexico Top 50,spotify:playlist:37i9dQZEVXbO3qyFxbkOE1,spotifycharts
5,Netherlands Top 50,spotify:playlist:37i9dQZEVXbKCF6dqVpDkS,spotifycharts
6,Argentina Top 50,spotify:playlist:37i9dQZEVXbMMy2roB9myp,spotifycharts
7,Australia Top 50,spotify:playlist:37i9dQZEVXbJPcfkRz0wJ0,spotifycharts
8,Italy Top 50,spotify:playlist:37i9dQZEVXbIQnj7RRhdSX,spotifycharts
9,Japan Top 50,spotify:playlist:37i9dQZEVXbKXQ4mDTEBXq,spotifycharts


In [19]:
tracks = []
p_id2=[]
test = []
p_name = []
t_ids = []
tracker=[]
p_list=final_playists_df["Playlist_ID"]
data=p_list.str.split(":")
for x in data:
    id =(x[2])
    p_id2.append(id)

for x in p_id2:
    response_json=spotify.user_playlist_tracks("spotifycharts", x)
    rank=1
    cur_play = final_playists_df.loc[final_playists_df["Playlist_ID"] == f"spotify:playlist:{x}"]["Playlist Name"].unique()[0]
    counter = 0
    for i,t in enumerate (response_json["items"]):
        name=(t["track"]["name"])
        t_id=(t["track"]["id"])
        tracks.append(name)
        t_ids.append(t_id)
        test.append(x)
        p_name.append(cur_play)
        rank = rank + 1
        tracker.append(counter+1)
        counter+=1
        if (rank == 6):
            break

In [20]:
music_df = pd.DataFrame({
    "Playlist Name" : p_name,
    "Playlist ID" : test,
    "Song" : tracks,
    "ID":t_ids,
    "Rank":rank
})
music_df["Playlist Name"] = music_df["Playlist Name"].replace({'United States Top 50' : 'US Top 50', 'United Kingdom Top 50' : 'UK Top 50', 'Hong Kong Viral 50': 'HK Viral 50'})
music_df

Unnamed: 0,Playlist Name,Playlist ID,Song,ID,Rank
0,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,Dance Monkey,1rgnBhdG2JDFTbYkYRZAku,6
1,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,Lucid Dreams,285pBltuF7vW8TeWk8hdRR,6
2,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,ROXANNE,696DnlkuDOXcMAnKlTgXXK,6
3,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,Circles,21jGcNKet2qwijlDFuPiPb,6
4,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,Memories,2b8fOow8UzyDFAE27YhOZM,6
...,...,...,...,...,...
210,HK Viral 50,37i9dQZEVXbKXd6qahcpCg,Lost in the Woods,7namdlOhbtsc8FvoSafOQt,6
211,HK Viral 50,37i9dQZEVXbKXd6qahcpCg,Falling,4TnjEaWOeW0eKTKIEvJyCa,6
212,HK Viral 50,37i9dQZEVXbKXd6qahcpCg,Into the Unknown,3Z0oQ8r78OUaHvGPiDBR3W,6
213,HK Viral 50,37i9dQZEVXbKXd6qahcpCg,All Is Found,61HVbcNeRACZpyvHrc3AnD,6


In [21]:
country = []
Range=music_df["Playlist Name"].str.split(" ")[:,]

for x in Range:
    n_c=(x[0])
    country.append(n_c)
music_df["Country"]=country
music_df
music_df.to_csv("music.csv")

music_df

Unnamed: 0,Playlist Name,Playlist ID,Song,ID,Rank,Country
0,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,Dance Monkey,1rgnBhdG2JDFTbYkYRZAku,6,Global
1,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,Lucid Dreams,285pBltuF7vW8TeWk8hdRR,6,Global
2,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,ROXANNE,696DnlkuDOXcMAnKlTgXXK,6,Global
3,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,Circles,21jGcNKet2qwijlDFuPiPb,6,Global
4,Global Top 50,37i9dQZEVXbMDoHDwVN2tF,Memories,2b8fOow8UzyDFAE27YhOZM,6,Global
...,...,...,...,...,...,...
210,HK Viral 50,37i9dQZEVXbKXd6qahcpCg,Lost in the Woods,7namdlOhbtsc8FvoSafOQt,6,HK
211,HK Viral 50,37i9dQZEVXbKXd6qahcpCg,Falling,4TnjEaWOeW0eKTKIEvJyCa,6,HK
212,HK Viral 50,37i9dQZEVXbKXd6qahcpCg,Into the Unknown,3Z0oQ8r78OUaHvGPiDBR3W,6,HK
213,HK Viral 50,37i9dQZEVXbKXd6qahcpCg,All Is Found,61HVbcNeRACZpyvHrc3AnD,6,HK


In [22]:
m_country=["Hong","Netherlands","Russia","Australia","Canada","France","Japan","Germany","US", "UK"]
final_music_1=music_df.loc[music_df["Country"].isin(m_country)]
spotify_top_df =final_music_1.reset_index()

spotify_top_df

Unnamed: 0,index,Playlist Name,Playlist ID,Song,ID,Rank,Country
0,15,Germany Top 50,37i9dQZEVXbJiZcmkrIHGU,KEIN SCHLAF,6416zJN0FGPmh1Ph4BH2h3,6,Germany
1,16,Germany Top 50,37i9dQZEVXbJiZcmkrIHGU,COLT,76IVpz47q3ghkxoUeTTEKb,6,Germany
2,17,Germany Top 50,37i9dQZEVXbJiZcmkrIHGU,Roller,6hw1Sy9wZ8UCxYGdpKrU6M,6,Germany
3,18,Germany Top 50,37i9dQZEVXbJiZcmkrIHGU,Der Bratan bleibt der gleiche,2M39v73rNJmAXUXIW0oagP,6,Germany
4,19,Germany Top 50,37i9dQZEVXbJiZcmkrIHGU,Dance Monkey,1rgnBhdG2JDFTbYkYRZAku,6,Germany
5,25,Netherlands Top 50,37i9dQZEVXbKCF6dqVpDkS,Dance Monkey,1rgnBhdG2JDFTbYkYRZAku,6,Netherlands
6,26,Netherlands Top 50,37i9dQZEVXbKCF6dqVpDkS,Lucid Dreams,285pBltuF7vW8TeWk8hdRR,6,Netherlands
7,27,Netherlands Top 50,37i9dQZEVXbKCF6dqVpDkS,Pa Olvidarte,1RO8Q3w2Jkir8cvvGEApI0,6,Netherlands
8,28,Netherlands Top 50,37i9dQZEVXbKCF6dqVpDkS,ROXANNE,696DnlkuDOXcMAnKlTgXXK,6,Netherlands
9,29,Netherlands Top 50,37i9dQZEVXbKCF6dqVpDkS,Memories,2b8fOow8UzyDFAE27YhOZM,6,Netherlands


In [25]:
spotify_top_df = spotify_top_df.rename(columns={'Song':'Name'})

In [27]:
spot_album_list = []
spot_genre_list = []
spot_artist_list = []

for song in spotify_top_df["Name"]:
    artist = f"term={song}"
    media = 'media=music'
    entity = 'entity=song'
    limit = 'limit=1'

    url = 'http://itunes.apple.com/search?' + artist + "&" + media + "&" + entity + "&" + limit
    response = requests.get(url).json()

    spot_genre_list.append(response["results"][0]['primaryGenreName'])
    spot_album_list.append(response["results"][0]["collectionName"])
    spot_artist_list.append(response["results"][0]['artistName'])

IndexError: list index out of range

In [None]:
spotify_top_df["Genre"] = spot_genre_list
spotify_top_df["Album"] = spot_album_list
spotify_top_df["Artist"] = spot_artist_list
spotify_top_df["Source"] = 'Spotify'
spotify_top_df['Rank'] = np.arange(len(spotify_top_df))
spotify_top_df['Rank'] = spotify_top5['Rank'] % 5 + 1

spotify_top_df = spotify_top_df[['Country', 'Name', 'Artist', 'Album', 'Genre', 'Rank', 'Source']]
spotify_top_df['Country'] = spotify_top_df['Country'].replace({'Hong' : 'Hong Kong'}) 


spotify_top_df

In [None]:
genre_list = []

for song in lastfm_top_df["Name"]:
    artist = f"term={song}"
    media = 'media=music'
    entity = 'entity=song'
    limit = 'limit=1'

    url = 'http://itunes.apple.com/search?' + artist + "&" + media + "&" + entity + "&" + limit
    response = requests.get(url).json()

    genre_list.append(response["results"][-1]['primaryGenreName'])

In [None]:
lastfm_top_df["Genre"] = genre_list
lastfm_top_df = lastfm_top_df[['Country', 'Name', 'Artist', 'Album', 'Genre', 'Rank', 'Source']]

lastfm_top_df

In [None]:
music = [apple_top_df, lastfm_top_df, spotify_top_df]

merge_df = pd.concat(music)

merge_df

In [None]:
spotify_id = []
for index, row in merge_df.iterrows():
    try:
        res = spotify.search(row["Name"], type="track", limit=1)
        spotify_id.append(res["tracks"]["items"][0]["id"])
    except:
        text = row["Name"]
        head, sep, tail = text.partition('(')
        res = spotify.search(head, type="track", limit=1)
        spotify_id.append(res["tracks"]["items"][0]["id"])
        continue
        
merge_df["Spotify ID"] = spotify_id
merge_df.head()

In [None]:
danceability = []
energy = []
key = []
loudness = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []
duration_ms = []

track_features = spotify.audio_features(tracks='3Z0oQ8r78OUaHvGPiDBR3W')

pprint(track_features)

In [None]:
for index, row in merge_df.iterrows():
    try:
        track_features = spotify.audio_features(tracks=row['Spotify ID'])
        danceability.append(track_features[0]["danceability"])
        energy.append(track_features[0]["energy"])
        key.append(track_features[0]["key"])
        loudness.append(track_features[0]["loudness"])
        speechiness.append(track_features[0]["speechiness"])
        acousticness.append(track_features[0]["acousticness"])
        instrumentalness.append(track_features[0]["instrumentalness"])
        liveness.append(track_features[0]["liveness"])
        valence.append(track_features[0]["valence"])
        tempo.append(track_features[0]["tempo"])
        duration_ms.append(track_features[0]["duration_ms"])
    except:
        print(f"Couldn't find details for {row['Spotify ID']}")
        continue

In [None]:
merge_df

In [None]:
output_file = merge_df.to_csv('track_analysis.csv')