# Machine Learning Extension from Netherlands_SpotifyChart_EDA

__Summary of Dataset__
1. Spotify dataset from bootcamp including the following:
  - name (i.e. Title of song)
  - artists
  - popularity
  - danceability
  - valence
  - energy
  - explicit
  - key
  - liveness
  - loudness
  - speechiness
  - tempo

__Summary of Kaggle Spotify Chart Dataset__
1. Data taken from Kaggle: https://www.kaggle.com/datasets/dhruvildave/spotify-charts
2. CSV file includes Top 200 and Viral 50 data from 1 January 2017; the stated end date is 31 July 2021, although the outputs below contain rows dated as late as 30 September 2021
3. Data organised into following columns:
  - Title of song
  - Rank of song in Top 200 or Viral 50
  - Date
  - Artist name
  - URL of song
  - Region (Country)
  - Chart
  - Whether the song has moved up, down etc. in chart
  - Number of streams


# Load libraries and data

In [1]:
# Standard library
import ast

# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from PIL import Image
import cufflinks as cf
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
cf.go_offline()

import matplotlib.dates as mdates
from unidecode import unidecode

# Pipeline and Column Transformers
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn import set_config
set_config(display = "diagram")

# Scaling
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# Cross Validation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

# Unsupervised Learning
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# STATISTICS
from statsmodels.graphics.gofplots import qqplot
# This function plots sample against a Normal distribution, 
# to see whether sample is normally distributed or not

In [18]:
# Load the bootcamp Spotify dataset described in the summary at the top.
spotify = pd.read_csv('data/ML_spotify_data.csv')

# Preview a random sample of rows. The fixed random_state makes the displayed
# rows identical on every "Restart Kernel -> Run All", so the saved output
# always matches what a reader gets when re-running the notebook.
spotify.sample(10, random_state=42)

Unnamed: 0,name,artists,popularity,danceability,valence,energy,explicit,key,liveness,loudness,speechiness,tempo
3330,Dreamin',"['George Benson', 'Earl Klugh']",30,0.633,0.928,0.504,0,9,0.0337,-19.682,0.0349,101.662
9933,The Difference,['Tyler Rich'],64,0.535,0.705,0.749,0,9,0.108,-5.344,0.0525,169.778
5964,Aladdin: Prince Ali (arr. for orchestra),"['Alan Menken', 'Anders Soldh', 'Gävle Symphon...",37,0.345,0.76,0.284,0,9,0.12,-16.333,0.0489,85.307
7939,Feel Like Making Love,['Bad Company'],45,0.452,0.311,0.635,0,7,0.754,-7.444,0.0321,81.825
9716,All Alone,['Freddie Dredd'],61,0.69,0.813,0.88,0,1,0.126,-3.525,0.284,159.933
5684,Ileso,['Control Machete'],33,0.485,0.507,0.897,0,10,0.64,-8.372,0.494,89.363
9803,Moonwalking in Calabasas,['DDG'],64,0.915,0.736,0.37,1,11,0.171,-8.696,0.0762,130.033
5588,Rock This Town,['The Brian Setzer Orchestra'],37,0.372,0.433,0.96,0,7,0.352,-5.696,0.111,104.29
3452,Kissin' Dynamite,['AC/DC'],35,0.469,0.298,0.843,0,11,0.0696,-5.982,0.0409,122.324
7492,You'll Never Walk Alone - Mono; 2002 Remaster,['Gerry & The Pacemakers'],56,0.484,0.285,0.265,0,0,0.149,-11.101,0.0322,113.564


In [3]:
# Columns of interest from the Kaggle charts file; every other column in the
# CSV is skipped at read time via `usecols`.
cols_int = [
    'title',
    'rank',
    'date',
    'artist',
    'region',
    'chart',
    'streams',
]

# Read only the selected columns and show the first rows.
chart = pd.read_csv(filepath_or_buffer='data/charts.csv', usecols=cols_int)
chart.head(n=5)

Unnamed: 0,title,rank,date,artist,region,chart,streams
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,Argentina,top200,253019.0
1,Vente Pa' Ca (feat. Maluma),2,2017-01-01,Ricky Martin,Argentina,top200,223988.0
2,Reggaetón Lento (Bailemos),3,2017-01-01,CNCO,Argentina,top200,210943.0
3,Safari,4,2017-01-01,"J Balvin, Pharrell Williams, BIA, Sky",Argentina,top200,173865.0
4,Shaky Shaky,5,2017-01-01,Daddy Yankee,Argentina,top200,153956.0


# Clean data

In [19]:
# Give the Spotify frame the same key column names as the chart data
# ('title' / 'artist') so the two frames can be merged on them later.
spotify = spotify.rename(columns=dict(name='title', artists='artist'))
spotify.head(n=2)

Unnamed: 0,title,artist,popularity,danceability,valence,energy,explicit,key,liveness,loudness,speechiness,tempo
0,We're For The Dark - Remastered 2010,['Badfinger'],22,0.678,0.559,0.432,0,3,0.0727,-12.696,0.0334,117.674
1,Sixty Years On - Piano Demo,['Elton John'],25,0.456,0.259,0.368,0,6,0.156,-10.692,0.028,143.783


In [20]:
# Inspect the Python type of a single 'artist' cell (row label 1) to see how
# the list-looking values are actually stored.
print(type(spotify.at[1, 'artist']))

<class 'str'>


In [27]:
def _artists_to_string(raw):
    """Turn a stringified artist list into a plain comma-separated string.

    The 'artist' column stores values such as "['George Benson', 'Earl Klugh']"
    as text. Blindly stripping brackets and deleting every apostrophe corrupts
    names that contain one: "[\"Gigi D'Agostino\"]" would become
    '"Gigi DAgostino"', which can never match the chart data. Parsing the
    literal keeps apostrophes inside names intact.

    Parameters
    ----------
    raw : object
        One cell of the 'artist' column.

    Returns
    -------
    object
        "Name A, Name B" for a parsable list/tuple literal; the legacy
        bracket/apostrophe-stripped text for strings that are not a list
        literal; the input unchanged if it is not a string (e.g. NaN).
    """
    if not isinstance(raw, str):
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(name) for name in parsed)
    # Not a list literal (plain name, numeric-looking name, malformed text):
    # keep the previous best-effort clean-up.
    return raw.strip("[]").replace("'", "")


# Work on a copy so the renamed `spotify` frame stays untouched.
test = spotify.copy()

test['artist'] = test['artist'].map(_artists_to_string)

# Fixed random_state keeps the preview reproducible across re-runs.
test.sample(5, random_state=42)

Unnamed: 0,title,artist,popularity,danceability,valence,energy,explicit,key,liveness,loudness,speechiness,tempo
182,"Tenha Fé, Pois Amanhã um Lindo Dia Vai Nascer",Os Originais Do Samba,42,0.77,0.967,0.838,0,5,0.142,-8.654,0.0411,93.143
7705,Better Believe It (feat. Young Jeezy & Webbie),"Boosie Badazz, Webbie, Jeezy",40,0.661,0.279,0.817,1,2,0.217,-5.783,0.0489,124.978
760,I Am A Clown,David Cassidy,25,0.393,0.226,0.305,0,7,0.148,-14.57,0.0287,105.787
7688,Swimming,Florence + The Machine,40,0.402,0.343,0.862,0,9,0.111,-4.202,0.261,109.23
6690,Take Me to the River - 2003 Remaster,Talking Heads,40,0.651,0.862,0.446,0,6,0.058,-9.571,0.0268,99.896


In [31]:
# Larger preview of the cleaned Spotify frame; seeded so that the rows shown
# are the same on every re-run of the notebook.
test.sample(25, random_state=42)

Unnamed: 0,title,artist,popularity,danceability,valence,energy,explicit,key,liveness,loudness,speechiness,tempo
4252,Secretos,Alejandro Lerner,53,0.633,0.322,0.437,0,3,0.12,-10.175,0.0261,120.075
2983,Save Black Music,Steel Pulse,25,0.758,0.862,0.47,0,7,0.0414,-10.336,0.217,191.962
6721,Clair de Lune,Michael Dulin,38,0.452,0.0647,0.0464,0,1,0.115,-18.639,0.044,72.197
7997,Like It's Her Birthday,Good Charlotte,50,0.592,0.815,0.868,0,11,0.144,-2.983,0.0679,109.985
4435,Ámame Hoy,Tropical Panamá,33,0.648,0.84,0.548,0,11,0.0852,-8.549,0.127,167.26
4293,Víctor Samaniego,Chalino Sanchez,30,0.585,0.746,0.481,0,8,0.0765,-8.409,0.0405,96.462
358,Motorcycle Mama,Sailcat,21,0.687,0.733,0.154,0,4,0.089,-24.336,0.0649,121.293
575,שיר המכולת,Kaveret,37,0.531,0.83,0.763,0,4,0.137,-9.581,0.0375,93.507
642,Walking The Back Streets And Crying,Albert King,27,0.49,0.704,0.352,0,3,0.13,-11.665,0.043,77.439
7996,Olvídala,Conjunto Atardecer,44,0.518,0.745,0.759,0,2,0.133,-3.782,0.0404,155.076


In [29]:
# Preview of the chart frame for comparison with the cleaned Spotify frame;
# seeded so that the rows shown are the same on every re-run of the notebook.
chart.sample(25, random_state=42)

Unnamed: 0,title,rank,date,artist,region,chart,streams
189052,Would You Ever,31,2017-08-10,"Skrillex, Poo Bear",Indonesia,viral50,
11498723,HUMBLE.,53,2017-08-31,Kendrick Lamar,Poland,top200,13681.0
12030141,"Sola (Remix) [feat. Daddy Yankee, Wisin, Farru...",8,2017-06-23,Anuel AA,Nicaragua,top200,1571.0
14375106,Tryabvash mi,23,2019-10-02,Preslava,Bulgaria,viral50,
24754526,Tu Foto Del DNI,36,2021-01-19,"Marmi, Aitana",Andorra,viral50,
1184942,Chained To The Rhythm,16,2017-02-19,"Katy Perry, Skip Marley",United States,top200,571739.0
2383278,In My Mind,174,2018-09-04,"Dynoro, Gigi D'Agostino",Argentina,top200,19501.0
16490278,Se Me Olvidó,81,2021-01-13,Christian Nodal,Nicaragua,top200,1474.0
16648734,Kafamda Kentsel Dönüşümler,14,2021-01-14,İkiye On Kala,Turkey,top200,101143.0
11836096,SWIM,14,2017-11-08,SIRUP,Japan,viral50,


In [42]:
# Inner join: keep only chart rows whose exact (title, artist) pair also
# exists in the cleaned Spotify frame, and attach the audio features.
# NOTE(review): if `test` holds several rows with the same (title, artist),
# each matching chart row is repeated once per duplicate - confirm uniqueness.
merged = chart.merge(test, how='inner', on=['title', 'artist'])

In [44]:
# First five rows of the merged chart + audio-feature frame.
merged.head(n=5)

Unnamed: 0,title,rank,date,artist,region,chart,streams,popularity,danceability,valence,energy,explicit,key,liveness,loudness,speechiness,tempo
0,Vacaciones,20,2017-01-01,Wisin,Argentina,top200,86103.0,67,0.777,0.704,0.911,0,6,0.257,-3.223,0.173,96.017
1,Vacaciones,13,2017-01-01,Wisin,Bolivia,top200,2747.0,67,0.777,0.704,0.911,0,6,0.257,-3.223,0.173,96.017
2,Vacaciones,14,2017-01-01,Wisin,Chile,top200,55053.0,67,0.777,0.704,0.911,0,6,0.257,-3.223,0.173,96.017
3,Vacaciones,27,2017-01-01,Wisin,Colombia,top200,13877.0,67,0.777,0.704,0.911,0,6,0.257,-3.223,0.173,96.017
4,Vacaciones,30,2017-01-01,Wisin,Costa Rica,top200,6797.0,67,0.777,0.704,0.911,0,6,0.257,-3.223,0.173,96.017


In [45]:
# Summarise the merged frame: row count, per-column non-null counts and dtypes.
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406626 entries, 0 to 406625
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   title         406626 non-null  object 
 1   rank          406626 non-null  int64  
 2   date          406626 non-null  object 
 3   artist        406626 non-null  object 
 4   region        406626 non-null  object 
 5   chart         406626 non-null  object 
 6   streams       326801 non-null  float64
 7   popularity    406626 non-null  int64  
 8   danceability  406626 non-null  float64
 9   valence       406626 non-null  float64
 10  energy        406626 non-null  float64
 11  explicit      406626 non-null  int64  
 12  key           406626 non-null  int64  
 13  liveness      406626 non-null  float64
 14  loudness      406626 non-null  float64
 15  speechiness   406626 non-null  float64
 16  tempo         406626 non-null  float64
dtypes: float64(8), int64(4), object(5)
memory usage:

In [35]:
# Summarise the full chart frame (row count, dtypes, memory footprint) for
# comparison with the much smaller merged frame.
chart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26173514 entries, 0 to 26173513
Data columns (total 7 columns):
 #   Column   Dtype  
---  ------   -----  
 0   title    object 
 1   rank     int64  
 2   date     object 
 3   artist   object 
 4   region   object 
 5   chart    object 
 6   streams  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 1.4+ GB


In [46]:
# Number of missing values in each column of the merged frame.
merged.isna().sum(axis=0)

title               0
rank                0
date                0
artist              0
region              0
chart               0
streams         79825
popularity          0
danceability        0
valence             0
energy              0
explicit            0
key                 0
liveness            0
loudness            0
speechiness         0
tempo               0
dtype: int64

In [41]:
# Rows of the cleaned Spotify frame where ANY column equals 'Van Halen' exactly.
# Because every column is compared, a track whose title is 'Van Halen' is
# returned as well as tracks whose artist is 'Van Halen'.
test.loc[lambda frame: frame.eq('Van Halen').any(axis='columns')]

Unnamed: 0,title,artist,popularity,danceability,valence,energy,explicit,key,liveness,loudness,speechiness,tempo
2779,Panama - 45 Version,Van Halen,22,0.525,0.465,0.974,0,8,0.0722,-5.797,0.0775,140.925
4471,Love Walks In - Live,Van Halen,34,0.502,0.414,0.744,0,9,0.909,-8.63,0.0274,88.398
4527,Judgement Day - Live,Van Halen,35,0.163,0.316,0.938,0,2,0.995,-7.181,0.0748,198.173
4574,You Really Got Me / Cabo Wabo - Live,Van Halen,34,0.21,0.257,0.902,0,9,0.398,-8.681,0.0589,88.109
4607,316 - Live,Van Halen,32,0.241,0.248,0.604,0,9,0.887,-17.086,0.0909,122.395
5058,Can't Stop Lovin' You,Van Halen,41,0.592,0.537,0.898,0,9,0.229,-5.194,0.0358,118.006
5204,Van Halen,Nerf Herder,33,0.454,0.657,0.778,0,7,0.293,-5.689,0.0487,137.419


In [48]:
# Rows of the merged frame where ANY column equals 'Ed Sheeran' exactly.
# The comparison is an exact match, so rows whose artist value lists
# 'Ed Sheeran' together with other artists are not returned.
merged.loc[lambda frame: frame.eq('Ed Sheeran').any(axis='columns')]

Unnamed: 0,title,rank,date,artist,region,chart,streams,popularity,danceability,valence,energy,explicit,key,liveness,loudness,speechiness,tempo
119347,The A Team,195,2018-03-02,Ed Sheeran,Australia,top200,24389.0,43,0.642,0.407,0.289,0,9,0.18,-9.918,0.0367,84.996
119348,The A Team,144,2017-02-01,Ed Sheeran,Belgium,top200,2861.0,43,0.642,0.407,0.289,0,9,0.18,-9.918,0.0367,84.996
119349,The A Team,163,2017-02-01,Ed Sheeran,Denmark,top200,8103.0,43,0.642,0.407,0.289,0,9,0.18,-9.918,0.0367,84.996
119350,The A Team,98,2017-02-01,Ed Sheeran,Ireland,top200,4472.0,43,0.642,0.407,0.289,0,9,0.18,-9.918,0.0367,84.996
119351,The A Team,152,2017-02-01,Ed Sheeran,Netherlands,top200,17599.0,43,0.642,0.407,0.289,0,9,0.18,-9.918,0.0367,84.996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120240,The A Team,132,2021-09-29,Ed Sheeran,United Kingdom,top200,55677.0,43,0.642,0.407,0.289,0,9,0.18,-9.918,0.0367,84.996
120241,The A Team,194,2021-08-21,Ed Sheeran,United Kingdom,top200,50530.0,43,0.642,0.407,0.289,0,9,0.18,-9.918,0.0367,84.996
120242,The A Team,112,2021-09-30,Ed Sheeran,Ireland,top200,6931.0,43,0.642,0.407,0.289,0,9,0.18,-9.918,0.0367,84.996
120243,The A Team,134,2021-09-30,Ed Sheeran,United Kingdom,top200,56242.0,43,0.642,0.407,0.289,0,9,0.18,-9.918,0.0367,84.996
