# Introduction
Top 100 songs of the year 2006 - 2020

## Project description

Here is supposed to be project description

## Importing libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import wget
import glob
import os
from twython import Twython
from credentials import * 
import requests
import json

# Downloading data and scraping HTML websites 
Downloading HTML webpages with top 100 songs by year from www.billboard.com
- url - https://www.billboard.com/charts/year-end/hot-100-songs

## Downloading data

Here download data

## Scraping
- Scrape data from downloaded webpages and transform it into a `dataframe`

In [2]:
def get_webpage_data(start_year, end_year):
    
    df_full = pd.DataFrame()
    
    for i in range(start_year,end_year+1):
        year = i
    
        year = str(year)

        web_page = BeautifulSoup(open('../Top-songs-2006-2020/HTMLs/Top_100_'+year+'.html'), 'html.parser')

        all_titles = web_page.select('.ye-chart-item__title')
        all_artist = web_page.select('.ye-chart-item__artist')
        all_ranks = web_page.select('.ye-chart-item__rank')

        ranks_list = []
        for rank in all_ranks:
            ranks_list.append(rank.get_text().replace('\n', ''))

        titles_list = []
        for title in all_titles:
            titles_list.append(title.get_text().replace('\n', ''))

        artist_list = []
        for artist in all_artist:
            artist_list.append(artist.get_text().replace('\n', ''))

        year_list = []
        for year in web_page.select('.ye-chart-item__primary-row'):
            year_list.append(year['data-date'])

        df = pd.DataFrame(data = [ranks_list, titles_list, artist_list, year_list]).T
        df.columns = ['Rank', 'Title', 'Artist', 'Year']

        df_full = df_full.append(df)
        
    return df_full

In [3]:
full_df = get_webpage_data(2006,2020)
full_df.reset_index(drop = True, inplace = True)
full_df

Unnamed: 0,Rank,Title,Artist,Year
0,1,Bad Day,Daniel Powter,2006
1,2,Temperature,Sean Paul,2006
2,3,Promiscuous,Nelly Furtado Featuring Timbaland,2006
3,4,You're Beautiful,James Blunt,2006
4,5,Hips Don't Lie,Shakira Featuring Wyclef Jean,2006
...,...,...,...,...
1493,96,More Than My Hometown,Morgan Wallen,2020
1494,97,Lovin' On You,Luke Combs,2020
1495,98,Said Sum,Moneybagg Yo,2020
1496,99,Slide,H.E.R. Featuring YG,2020


# Spotify API

## Set up API

In [4]:
%run spotify_api_secret-Copy1.ipynb

In [258]:
auth_url = 'https://accounts.spotify.com/api/token'

auth_response = requests.post(auth_url, {
    'grant_type':'client_credentials',
    'client_id': client_id,
    'client_secret': client_secret,
})

auth_response_data = auth_response.json()
access_token = auth_response_data['access_token']
headers = {'Authorization': 'Bearer {token}'.format(token = access_token)}

## Extracitng data from Spotify API

### Extracting track's ids, etc.

In [46]:
full_df.head(1)

Unnamed: 0,Rank,Title,Artist,Year
0,1,Bad Day,Daniel Powter,2006


In [62]:
track_names_spot = []
track_ids_spot = []
track_duration_spot = []
track_explicit_spot = []
track_popularity_spot = []
artist_name_spot = []

for track_name in full_df.Title:
    
    url = 'https://api.spotify.com/v1/search'

    track = requests.get(url, headers = headers, params = {'q':track_name, 'type':'track'}).json()
    
    track_names_spot.append(track['tracks']['items'][0]['name'])
    track_ids_spot.append(track['tracks']['items'][0]['id'])
    artist_name_spot.append(track['tracks']['items'][0]['album']['artists'][0]['name'])
    track_duration_spot.append(track['tracks']['items'][0]['duration_ms'])
    track_explicit_spot.append(track['tracks']['items'][0]['explicit'])
    track_popularity_spot.append(track['tracks']['items'][0]['popularity'])

In [77]:
df_spotify = pd.DataFrame(data = [track_names_spot, artist_name_spot, track_ids_spot, 
                    track_duration_spot, track_popularity_spot, track_explicit_spot]).T
df_spotify.columns = ['Title_spot', 'Artist_spot', 'Track_id', 'Track_duration_ms', 
                      'Track_popularity', 'Track_explicit']
df_spotify.head(3)

Unnamed: 0,Title_spot,Artist_spot,Track_id,Track_duration_ms,Track_popularity,Track_explicit
0,Bad Day,Daniel Powter,0mUyMawtxj1CJ76kn9gIZK,233640,75,False
1,Temperature,Sean Paul,0k2GOhqsrxDTAbFFSdNJjT,218573,79,False
2,Promiscuous,Nelly Furtado,2gam98EZKrF9XuOkU13ApN,242293,82,False


### Merging two DataFrames

In [110]:
df_spotify.head(3)

Unnamed: 0,Title_spot,Artist_spot,Track_id,Track_duration_ms,Track_popularity,Track_explicit
0,Bad Day,Daniel Powter,0mUyMawtxj1CJ76kn9gIZK,233640,75,False
1,Temperature,Sean Paul,0k2GOhqsrxDTAbFFSdNJjT,218573,79,False
2,Promiscuous,Nelly Furtado,2gam98EZKrF9XuOkU13ApN,242293,82,False


In [71]:
full_df.head(3)

Unnamed: 0,Rank,Title,Artist,Year
0,1,Bad Day,Daniel Powter,2006
1,2,Temperature,Sean Paul,2006
2,3,Promiscuous,Nelly Furtado Featuring Timbaland,2006


In [114]:
df = full_df.merge(right = df_spotify, left_index = True, right_index = True)
df

Unnamed: 0,Rank,Title,Artist,Year,Title_spot,Artist_spot,Track_id,Track_duration_ms,Track_popularity,Track_explicit
0,1,Bad Day,Daniel Powter,2006,Bad Day,Daniel Powter,0mUyMawtxj1CJ76kn9gIZK,233640,75,False
1,2,Temperature,Sean Paul,2006,Temperature,Sean Paul,0k2GOhqsrxDTAbFFSdNJjT,218573,79,False
2,3,Promiscuous,Nelly Furtado Featuring Timbaland,2006,Promiscuous,Nelly Furtado,2gam98EZKrF9XuOkU13ApN,242293,82,False
3,4,You're Beautiful,James Blunt,2006,You're Beautiful,James Blunt,0vg4WnUWvze6pBOJDTq99k,209493,77,True
4,5,Hips Don't Lie,Shakira Featuring Wyclef Jean,2006,Hips Don't Lie (feat. Wyclef Jean),Shakira,3ZFTkvIE7kyPt6Nu3PEa7V,218093,84,False
...,...,...,...,...,...,...,...,...,...,...
1493,96,More Than My Hometown,Morgan Wallen,2020,More Than My Hometown,Morgan Wallen,5OELUCYgOHKFAvCERnAvfS,216573,76,False
1494,97,Lovin' On You,Luke Combs,2020,Lovin' On You,Luke Combs,0nYvjcSlCgjcwogQAwIwNp,194866,72,False
1495,98,Said Sum,Moneybagg Yo,2020,Said Sum,Moneybagg Yo,3sKz6Sd72K0ofPWcJPPk6H,155168,72,True
1496,99,Slide,H.E.R. Featuring YG,2020,Slide,H.E.R.,2rTnVB1bvwxHtaIl4uVu7f,238321,73,True


### Extracting Audio Features of the tracks

In [261]:
requests_list = []

for track_id in df.Track_id:
    
    url = 'https://api.spotify.com/v1/audio-features/'+track_id
    
    requests_list.append(requests.get(url, headers = headers).json())

In [361]:
df_audio_feautures = pd.DataFrame()

i = 0

for i in range(len(requests_list)):
    
    temp_df = pd.DataFrame.from_dict(requests_list[i], orient = 'Index').T
    
    df_audio_feautures = df_audio_feautures.append(temp_df)
    
    i += 1

In [362]:
df_audio_feautures.reset_index(drop = True, inplace = True)
df_audio_feautures

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,error
0,0.599,0.785,3,-4.013,1,0.0309,0.448,0.00336,0.151,0.52,140.046,audio_features,0mUyMawtxj1CJ76kn9gIZK,spotify:track:0mUyMawtxj1CJ76kn9gIZK,https://api.spotify.com/v1/tracks/0mUyMawtxj1C...,https://api.spotify.com/v1/audio-analysis/0mUy...,233640,4,
1,0.951,0.6,0,-4.675,0,0.0685,0.106,0,0.0712,0.822,125.04,audio_features,0k2GOhqsrxDTAbFFSdNJjT,spotify:track:0k2GOhqsrxDTAbFFSdNJjT,https://api.spotify.com/v1/tracks/0k2GOhqsrxDT...,https://api.spotify.com/v1/audio-analysis/0k2G...,218573,4,
2,0.808,0.97,10,-6.098,0,0.0506,0.0569,0.000061,0.154,0.868,114.328,audio_features,2gam98EZKrF9XuOkU13ApN,spotify:track:2gam98EZKrF9XuOkU13ApN,https://api.spotify.com/v1/tracks/2gam98EZKrF9...,https://api.spotify.com/v1/audio-analysis/2gam...,242293,4,
3,0.675,0.479,0,-9.87,0,0.0278,0.633,0.000018,0.088,0.454,81.998,audio_features,0vg4WnUWvze6pBOJDTq99k,spotify:track:0vg4WnUWvze6pBOJDTq99k,https://api.spotify.com/v1/tracks/0vg4WnUWvze6...,https://api.spotify.com/v1/audio-analysis/0vg4...,209493,4,
4,0.778,0.824,10,-5.892,0,0.0707,0.284,0,0.405,0.758,100.024,audio_features,3ZFTkvIE7kyPt6Nu3PEa7V,spotify:track:3ZFTkvIE7kyPt6Nu3PEa7V,https://api.spotify.com/v1/tracks/3ZFTkvIE7kyP...,https://api.spotify.com/v1/audio-analysis/3ZFT...,218093,4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1504,0.62,0.869,6,-5.479,1,0.0462,0.615,0,0.131,0.597,126.019,audio_features,5OELUCYgOHKFAvCERnAvfS,spotify:track:5OELUCYgOHKFAvCERnAvfS,https://api.spotify.com/v1/tracks/5OELUCYgOHKF...,https://api.spotify.com/v1/audio-analysis/5OEL...,216573,4,
1505,0.572,0.949,4,-4.865,1,0.06,0.00165,0.000195,0.163,0.53,118.974,audio_features,0nYvjcSlCgjcwogQAwIwNp,spotify:track:0nYvjcSlCgjcwogQAwIwNp,https://api.spotify.com/v1/tracks/0nYvjcSlCgjc...,https://api.spotify.com/v1/audio-analysis/0nYv...,194867,4,
1506,0.929,0.667,8,-6.789,0,0.353,0.0185,0,0.1,0.274,126.998,audio_features,3sKz6Sd72K0ofPWcJPPk6H,spotify:track:3sKz6Sd72K0ofPWcJPPk6H,https://api.spotify.com/v1/tracks/3sKz6Sd72K0o...,https://api.spotify.com/v1/audio-analysis/3sKz...,155168,4,
1507,0.827,0.469,10,-9.141,0,0.341,0.0807,0.000008,0.207,0.196,97.028,audio_features,2rTnVB1bvwxHtaIl4uVu7f,spotify:track:2rTnVB1bvwxHtaIl4uVu7f,https://api.spotify.com/v1/tracks/2rTnVB1bvwxH...,https://api.spotify.com/v1/audio-analysis/2rTn...,238321,4,


### Cleaning Spotify data frame

In [363]:
len(df_audio_feautures)

1509

In [364]:
df_audio_feautures.error.unique()[1:]

array([429, 'API rate limit exceeded', 404, 'analysis not found'],
      dtype=object)

In [367]:
len(df_audio_feautures[df_audio_feautures.error == 'API rate limit exceeded'])

10

In [368]:
len(df_audio_feautures[df_audio_feautures.error == 'analysis not found'])

1

In [369]:
len(df_audio_feautures[df_audio_feautures.error == '429'])

0

In [371]:
len(df_audio_feautures[df_audio_feautures.error == '404'])

0

In [373]:
df_audio_feautures.drop(df_audio_feautures[df_audio_feautures.error == 'API rate limit exceeded'].index, 
                    inplace = True)

df_audio_feautures.drop(df_audio_feautures[df_audio_feautures.error == 'analysis not found'].index, 
                    inplace = True)

In [374]:
len(df_audio_feautures)

1498

# Merge dataframes to the final dataframe

In [379]:
df_audio_feautures.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,error
0,0.599,0.785,3,-4.013,1,0.0309,0.448,0.00336,0.151,0.52,140.046,audio_features,0mUyMawtxj1CJ76kn9gIZK,spotify:track:0mUyMawtxj1CJ76kn9gIZK,https://api.spotify.com/v1/tracks/0mUyMawtxj1C...,https://api.spotify.com/v1/audio-analysis/0mUy...,233640,4,
1,0.951,0.6,0,-4.675,0,0.0685,0.106,0.0,0.0712,0.822,125.04,audio_features,0k2GOhqsrxDTAbFFSdNJjT,spotify:track:0k2GOhqsrxDTAbFFSdNJjT,https://api.spotify.com/v1/tracks/0k2GOhqsrxDT...,https://api.spotify.com/v1/audio-analysis/0k2G...,218573,4,
2,0.808,0.97,10,-6.098,0,0.0506,0.0569,6.1e-05,0.154,0.868,114.328,audio_features,2gam98EZKrF9XuOkU13ApN,spotify:track:2gam98EZKrF9XuOkU13ApN,https://api.spotify.com/v1/tracks/2gam98EZKrF9...,https://api.spotify.com/v1/audio-analysis/2gam...,242293,4,
3,0.675,0.479,0,-9.87,0,0.0278,0.633,1.8e-05,0.088,0.454,81.998,audio_features,0vg4WnUWvze6pBOJDTq99k,spotify:track:0vg4WnUWvze6pBOJDTq99k,https://api.spotify.com/v1/tracks/0vg4WnUWvze6...,https://api.spotify.com/v1/audio-analysis/0vg4...,209493,4,
4,0.778,0.824,10,-5.892,0,0.0707,0.284,0.0,0.405,0.758,100.024,audio_features,3ZFTkvIE7kyPt6Nu3PEa7V,spotify:track:3ZFTkvIE7kyPt6Nu3PEa7V,https://api.spotify.com/v1/tracks/3ZFTkvIE7kyP...,https://api.spotify.com/v1/audio-analysis/3ZFT...,218093,4,


In [380]:
df.head()

Unnamed: 0,Rank,Title,Artist,Year,Title_spot,Artist_spot,Track_id,Track_duration_ms,Track_popularity,Track_explicit
0,1,Bad Day,Daniel Powter,2006,Bad Day,Daniel Powter,0mUyMawtxj1CJ76kn9gIZK,233640,75,False
1,2,Temperature,Sean Paul,2006,Temperature,Sean Paul,0k2GOhqsrxDTAbFFSdNJjT,218573,79,False
2,3,Promiscuous,Nelly Furtado Featuring Timbaland,2006,Promiscuous,Nelly Furtado,2gam98EZKrF9XuOkU13ApN,242293,82,False
3,4,You're Beautiful,James Blunt,2006,You're Beautiful,James Blunt,0vg4WnUWvze6pBOJDTq99k,209493,77,True
4,5,Hips Don't Lie,Shakira Featuring Wyclef Jean,2006,Hips Don't Lie (feat. Wyclef Jean),Shakira,3ZFTkvIE7kyPt6Nu3PEa7V,218093,84,False


### What to extract from spotify:
- release_date
- track id
- artist id
- name (of the artist / band)
- type (artist/band)
- duration (seconds)
- genres
- popularity?
- 

## rough notebook

In [11]:
'''
audio_features=[]
for track_id in artist_track_id:
    audio_url= 'https://api.spotify.com/v1/audio-features/'+track_id
    res=requests.get(audio_url,headers=headers)
    audio_info=res.json()
    audio_features.append(audio_info)
'''

"\naudio_features=[]\nfor track_id in artist_track_id:\n    audio_url= 'https://api.spotify.com/v1/audio-features/'+track_id\n    res=requests.get(audio_url,headers=headers)\n    audio_info=res.json()\n    audio_features.append(audio_info)\n"

In [12]:
artist_name = 'Depeche Mode'
url = 'https://api.spotify.com/v1/search'

In [13]:
artist = requests.get(url, params = {'q':artist_name, 'type':'artist', 'market':'DK'}, headers = headers)

In [14]:
artist

<Response [200]>

In [15]:
url = 'https://api.spotify.com/v1/artists'
requests.get(url, headers = headers, params = {'ids':'762310PdDnwsDxAQxzQkfX'}).json()

{'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/762310PdDnwsDxAQxzQkfX'},
   'followers': {'href': None, 'total': 3931004},
   'genres': ['dance rock',
    'new romantic',
    'new wave',
    'permanent wave',
    'synthpop'],
   'href': 'https://api.spotify.com/v1/artists/762310PdDnwsDxAQxzQkfX',
   'id': '762310PdDnwsDxAQxzQkfX',
   'images': [{'height': 640,
     'url': 'https://i.scdn.co/image/2ec1d1c7a48df4244f0ba708eafd28b7afa6166b',
     'width': 640},
    {'height': 320,
     'url': 'https://i.scdn.co/image/ce06360d3762def7812fc5137e4e79c1467858bc',
     'width': 320},
    {'height': 160,
     'url': 'https://i.scdn.co/image/b46a67a3085884321512a7a759b78c54ddb2bb31',
     'width': 160}],
   'name': 'Depeche Mode',
   'popularity': 76,
   'type': 'artist',
   'uri': 'spotify:artist:762310PdDnwsDxAQxzQkfX'}]}

In [45]:
# extracting main info from the songs
'''
track_name = "More Than My Hometown"
url = 'https://api.spotify.com/v1/search'

track = requests.get(url, headers = headers, params = {'q':track_name, 'type':'track'}).json()

track_name = track['tracks']['items'][0]['name']
artist_name = track['tracks']['items'][0]['album']['artists'][0]['name']
track_id = track['tracks']['items'][0]['id']
track_duration_ms = track['tracks']['items'][0]['duration_ms']
track_explicit = track['tracks']['items'][0]['explicit']
track_popularity = track['tracks']['items'][0]['popularity']
'''

'\ntrack_name = "More Than My Hometown"\nurl = \'https://api.spotify.com/v1/search\'\n\ntrack = requests.get(url, headers = headers, params = {\'q\':track_name, \'type\':\'track\'}).json()\n\ntrack_name = track[\'tracks\'][\'items\'][0][\'name\']\nartist_name = track[\'tracks\'][\'items\'][0][\'album\'][\'artists\'][0][\'name\']\ntrack_id = track[\'tracks\'][\'items\'][0][\'id\']\ntrack_duration_ms = track[\'tracks\'][\'items\'][0][\'duration_ms\']\ntrack_explicit = track[\'tracks\'][\'items\'][0][\'explicit\']\ntrack_popularity = track[\'tracks\'][\'items\'][0][\'popularity\']\n'

In [60]:
'''
    track_name = track['tracks']['items'][0]['name']
    artist_name = track['tracks']['items'][0]['album']['artists'][0]['name']
    track_id = track['tracks']['items'][0]['id']
    track_duration_ms = track['tracks']['items'][0]['duration_ms']
    track_explicit = track['tracks']['items'][0]['explicit']
    track_popularity = track['tracks']['items'][0]['popularity']
    '''

"\n    track_name = track['tracks']['items'][0]['name']\n    artist_name = track['tracks']['items'][0]['album']['artists'][0]['name']\n    track_id = track['tracks']['items'][0]['id']\n    track_duration_ms = track['tracks']['items'][0]['duration_ms']\n    track_explicit = track['tracks']['items'][0]['explicit']\n    track_popularity = track['tracks']['items'][0]['popularity']\n    "

In [166]:
'''

url = 'https://api.spotify.com/v1/audio-features'

danceability = [] #555
energy = [] #461
key = []
loudness = []
mode = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []

for track_id in df.Track_id:
    track_request = requests.get(url, headers = headers, params = {'ids':track_id}).json()
    
    danceability.append(track_request['audio_features'][0]['danceability'])
    energy.append(track_request['audio_features'][0]['energy'])
    
'''

"\n\nurl = 'https://api.spotify.com/v1/audio-features'\n\ndanceability = [] #555\nenergy = [] #461\nkey = []\nloudness = []\nmode = []\nspeechiness = []\nacousticness = []\ninstrumentalness = []\nliveness = []\nvalence = []\ntempo = []\n\nfor track_id in df.Track_id:\n    track_request = requests.get(url, headers = headers, params = {'ids':track_id}).json()\n    \n    danceability.append(track_request['audio_features'][0]['danceability'])\n    energy.append(track_request['audio_features'][0]['energy'])\n    \n"

In [377]:
'''
# sito nereikia
danceability = []
energy = []
key = []
loudness = []
mode = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []

for track_data in requests_list:
    
    i = 0
    
    for i in range(len(requests_list)):
        
        danceability.append(requests_list[i]['danceability'])
        energy.append(requests_list[i]['energy'])
        key.append(requests_list[i]['key'])
        loudness.append(requests_list[i]['loudness'])
        mode.append(requests_list[i]['mode'])
        speechiness.append(requests_list[i]['speechiness'])
        acousticness.append(requests_list[i]['acousticness'])
        instrumentalness.append(requests_list[i]['instrumentalness'])
        liveness.append(requests_list[i]['liveness'])
        valence.append(requests_list[i]['valence'])
        tempo.append(requests_list[i]['tempo'])
        
        i += 1
        
'''

"\n# sito nereikia\ndanceability = []\nenergy = []\nkey = []\nloudness = []\nmode = []\nspeechiness = []\nacousticness = []\ninstrumentalness = []\nliveness = []\nvalence = []\ntempo = []\n\nfor track_data in requests_list:\n    \n    i = 0\n    \n    for i in range(len(requests_list)):\n        \n        danceability.append(requests_list[i]['danceability'])\n        energy.append(requests_list[i]['energy'])\n        key.append(requests_list[i]['key'])\n        loudness.append(requests_list[i]['loudness'])\n        mode.append(requests_list[i]['mode'])\n        speechiness.append(requests_list[i]['speechiness'])\n        acousticness.append(requests_list[i]['acousticness'])\n        instrumentalness.append(requests_list[i]['instrumentalness'])\n        liveness.append(requests_list[i]['liveness'])\n        valence.append(requests_list[i]['valence'])\n        tempo.append(requests_list[i]['tempo'])\n        \n        i += 1\n        \n"

# Notes
- add spotify API key and secret to different jupyter notebook and run it form here