# Parse website

In [None]:
from bs4 import BeautifulSoup
import requests
# Download the page with the global weekly Spotify totals.
url = "https://kworb.net/spotify/country/global_weekly_totals.html"
response = requests.get(url)
# Preview only the first 100 characters of the returned HTML.
response.text[:100] # The HTML is exposed through the text property
# print(response.text)  # uncomment to dump the whole page

In [182]:
# Some code from here https://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/
import pandas as pd
from bs4 import BeautifulSoup
class HTMLTableParser:
    """Convert the ``<table>`` elements of an HTML page into DataFrames.

    Cells in one designated column (``link_column``) additionally get the
    ``href`` of one of their ``<a>`` tags (``link_index``) appended to the
    cell text, joined by ``link_separator``.  When such a cell does not
    contain that link, only its text is stored instead of raising.
    """

    # Index of the column whose cells get a link target appended.
    link_column = 1
    # Position, among the <a> tags of that cell, of the link to append.
    link_index = 1
    # Text placed between the cell text and the appended href.
    link_separator = "||"

    def parse_url(self, url):
        """Fetch *url* and parse every table found in the response.

        Returns a list with one ``(table_tag, DataFrame)`` tuple per
        ``<table>`` element, in the order ``find_all`` yields them.
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        return [(table, self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Build a DataFrame from one table element.

        Column titles come from the first row that contains ``<th>`` tags;
        without any, columns are numbered ``0..n-1``.  The number of columns
        is taken from the first row that contains ``<td>`` tags.  Columns
        whose values all convert to float are converted; the others keep
        their text.

        Raises:
            Exception: if column titles were found but their count differs
                from the number of columns.
        """
        rows = table.find_all('tr')

        n_columns = 0
        n_rows = 0
        column_names = []

        # First pass: size the table and pick up the column titles.
        for row in rows:
            td_tags = row.find_all('td')
            if td_tags:
                n_rows += 1
                if n_columns == 0:
                    # The first data row fixes the number of columns.
                    n_columns = len(td_tags)

            th_tags = row.find_all('th')
            if th_tags and not column_names:
                column_names = [th.get_text() for th in th_tags]

        # Safeguard on column titles
        if column_names and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if column_names else range(0, n_columns)
        df = pd.DataFrame(columns=columns, index=range(0, n_rows))

        # Second pass: fill the pre-sized frame, one data row at a time.
        row_marker = 0
        for row in rows:
            cells = row.find_all('td')
            if not cells:
                # Header-only or empty row: takes no slot in the frame.
                continue
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = self._cell_value(
                    cell, column_marker)
            row_marker += 1

        # Convert to float if possible; columns with non-numeric text stay as is.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

    def _cell_value(self, cell, column_index):
        """Return the value to store for *cell* located at *column_index*."""
        text = cell.get_text()
        if column_index != self.link_column:
            return text

        links = cell.find_all('a')
        if len(links) <= self.link_index:
            # The expected link is absent: keep the text rather than fail
            # with an IndexError on tables that carry no such link.
            return text

        href = links[self.link_index].get('href')
        if href is None:
            return text
        return text + self.link_separator + href


In [183]:
# parse_url returns one (table tag, DataFrame) pair per <table> on the page;
# [0][1] selects the DataFrame of the first table.
hp = HTMLTableParser()
table = hp.parse_url("https://kworb.net/spotify/country/global_weekly_totals.html")[0][1]

  self._sock = None


In [184]:
# Sanity check on the first row: the cell holds "<text>||<href>"; take the
# href, pick its third '/'-separated segment and drop everything from the
# first '.' onwards to obtain the track id.
table['Artist and Title'][0].split("||")[1].split('/')[2].split('.')[0]

'7qiZfU4dY1lWllzX7mPBI3'

In [185]:
# Save the scraped table as UTF-8 CSV, with a header row and without the index.
table.to_csv("new_data.csv", encoding="utf-8", header=True, index=False)

# Integrate with Spotify API

In [186]:
from __future__ import print_function    # (at top of module)
import warnings
warnings.filterwarnings('always')
from spotipy.oauth2 import SpotifyClientCredentials
import json
import spotipy
import time
import sys
import csv
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import seaborn as sns
import config


# Spotify API setup: the client id and secret are read from the local
# `config` module.
client_credentials_manager = SpotifyClientCredentials(config.client_id, config.client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Request tracing is switched off here; presumably setting this to True
# enables verbose JSON request tracing - confirm against the spotipy docs.
sp.trace=False

In [187]:
# File name to write to
fileName = "newdata_combined_2013_2019.csv"

# Columns for my pandas DataFrame in which we will keep the data
columns = ["song_id","song_title", "artist", "popularity", "total_no_streams", "energy", "liveness", "tempo"
          , "speechiness", "acousticness", "instrumentalness", "time_signature", "danceability",
          "key", "duration", "loudness", "valence", "mode"]

# Keys read from the audio-features result, in the same order as the feature
# columns above ("duration_ms" fills the "duration" column).
featureKeys = ["energy", "liveness", "tempo", "speechiness", "acousticness",
               "instrumentalness", "time_signature", "danceability", "key",
               "duration_ms", "loudness", "valence", "mode"]

# Actual data structure for the data
myData = []

count = 0
for index, row in table.iterrows():
    # The cell holds "<text>||<href>"; the track id is the third
    # '/'-separated segment of the href, up to its first '.'.
    trackRef = row['Artist and Title'].split("||")[1].split('/')[2].split('.')[0]
    track = sp.track("spotify:track:" + trackRef)

    trackId = track['uri']
    songTitle = track['name']
    popularity = int(track['popularity'])

    # A song might have more than one artist so we make a list of all of them
    artistName = [artist['name'] for artist in track['artists']]

    # The stream total comes from the scraped table, not from the audio
    # features, so it is computed for every row.
    total_no_streams = int(row['Total'].replace(',', ''))

    # Get features for the track
    features = sp.audio_features([trackId])

    # A missing feature entry usually means something has gone wrong with the
    # request. Store None for every feature in that case so the row neither
    # fails nor silently reuses the values of the previous track.
    if features and features[0] is not None:
        featureValues = [features[0][name] for name in featureKeys]
    else:
        featureValues = [None] * len(featureKeys)

    newRow = [trackId, songTitle, artistName, popularity, total_no_streams] + featureValues
    # Add the new row to our existing data
    myData.append(newRow)
    count += 1
    if count % 100 == 0:
        print("Processed " + str(count) + " songs so far.")
    # Short pause between tracks - presumably to stay under the API rate limit.
    time.sleep(0.1)
print("Finished processing.")
df = pd.DataFrame(myData, columns=columns)
print("Writing file to CSV...")
df.to_csv(fileName, encoding="utf-8", header=True, index=False)
print("Done.")
df.head()

Processed 100 songs so far.
Processed 200 songs so far.
Processed 300 songs so far.
Processed 400 songs so far.
Processed 500 songs so far.
Processed 600 songs so far.
Processed 700 songs so far.
Processed 800 songs so far.
Processed 900 songs so far.
Processed 1000 songs so far.
Processed 1100 songs so far.
Processed 1200 songs so far.
Processed 1300 songs so far.
Processed 1400 songs so far.
Processed 1500 songs so far.
Processed 1600 songs so far.
Processed 1700 songs so far.
Processed 1800 songs so far.
Processed 1900 songs so far.
Processed 2000 songs so far.
Processed 2100 songs so far.
Processed 2200 songs so far.
Processed 2300 songs so far.
Processed 2400 songs so far.
Processed 2500 songs so far.
Processed 2600 songs so far.
Processed 2700 songs so far.
Processed 2800 songs so far.
Processed 2900 songs so far.
Processed 3000 songs so far.
Processed 3100 songs so far.
Finished processing.
Writing file to CSV...
Done.


Unnamed: 0,song_id,song_title,artist,popularity,total_no_streams,energy,liveness,tempo,speechiness,acousticness,instrumentalness,time_signature,danceability,key,duration,loudness,valence,mode
0,spotify:track:7qiZfU4dY1lWllzX7mPBI3,Shape of You,[Ed Sheeran],85,2052905128,0.652,0.0931,95.977,0.0802,0.581,0.0,4,0.825,1,233713,-3.183,0.931,0
1,spotify:track:1xznGGDReH1oQq0xzbwXa3,One Dance,"[Drake, WizKid, Kyla]",25,1555622901,0.619,0.351,103.989,0.0532,0.00784,0.00423,4,0.791,1,173987,-5.886,0.371,1
2,spotify:track:7BKLCZ1jbUBVqRi2FVlTVw,Closer,"[The Chainsmokers, Halsey]",84,1454865291,0.524,0.111,95.01,0.0338,0.414,0.0,4,0.748,8,244960,-5.599,0.661,1
3,spotify:track:7wGoVu4Dady5GV0Sv4UIsx,rockstar,"[Post Malone, 21 Savage]",26,1422808226,0.522,0.142,159.772,0.0984,0.13,9e-05,4,0.577,5,218320,-6.594,0.119,0
4,spotify:track:2XW4DbS6NddZxRPm5rMCeY,God's Plan,[Drake],28,1189072288,0.448,0.558,77.176,0.103,0.0309,8.7e-05,4,0.758,7,198960,-9.441,0.373,1


In [189]:
# Read the data back from CSV to make sure everything is fine.
# NOTE(review): this loads "google_cleaned_data.csv", not the
# "newdata_combined_2013_2019.csv" written earlier - presumably a copy that
# was cleaned outside this notebook; confirm.
data = pd.read_csv("google_cleaned_data.csv")
data.head()

Unnamed: 0,song_id,song_title,artist,popularity,total_no_streams,energy,liveness,tempo,speechiness,acousticness,instrumentalness,time_signature,danceability,key,duration,loudness,valence,mode
0,spotify:track:1nX9KhK3Fff27SnrIor2Yb,04:00,"['2 Chainz', 'Travis Scott']",70,53208336,0.5,0.155,75.012,0.425,0.118,0.0,4,0.796,1,255560,-7.21,0.227,1
1,spotify:track:3uvypVUsiIr1B0BccIcsEh,06:00,"['J Balvin', 'Farruko']",68,50235662,0.746,0.274,175.965,0.102,0.152,0.0,4,0.746,5,243227,-5.046,0.733,0
2,spotify:track:2wOXxtHZgRkkrkEbKLzzqs,1.5,['21 Savage'],76,5894178,0.52,0.454,84.022,0.359,0.000244,1e-06,4,0.885,1,148656,-8.353,0.306,1
3,spotify:track:4uCAKqQQYJaIvyzq4sZkaO,9,['Drake'],46,20334413,0.71,0.237,99.994,0.0437,0.000907,0.000783,4,0.699,2,255787,-7.381,0.0388,1
4,spotify:track:3JjnGLK8IxkNLvo8Lb3KOM,18,['One Direction'],68,6363369,0.582,0.119,124.038,0.0313,0.22,0.0,4,0.681,6,248360,-7.76,0.221,0


In [None]:
# Report how many rows were loaded.
print("Number of entries in original data: " + str(len(data.index)))
# Make sure the popularity column holds numeric values.
data['popularity'] = pd.to_numeric(data['popularity'])