In [1]:
import pandas as pd
from tqdm import tqdm
import os
import re
from math import sqrt
import matplotlib.pyplot as plt

from statsmodels.tsa.api import Holt, ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error

In [2]:
def load_csv_file(filename, chunksize=1000):
    """Load a CSV file into one DataFrame, reading it in chunks with a progress bar.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.
    chunksize : int, default 1000
        Number of rows per chunk requested from ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The concatenation of all chunks, or ``None`` (after printing a
        message) when the file does not exist or reading it fails.
    """
    try:
        if not os.path.exists(filename):
            print(f"The file: {filename} doesn't exist")
            return None
        print("Found File")
        chunks = pd.read_csv(filename, chunksize=chunksize)
        return pd.concat(list(tqdm(chunks, desc=f'Loading {filename}')))
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate, and report the
        # actual cause instead of hiding it.
        print(f"Didn't Work! :( ({type(exc).__name__}: {exc})")
        return None

# Spotify + Hot 100

## Popularity & Chart Position

https://www.kaggle.com/datasets/paradisejoy/top-hits-spotify-from-20002019

https://github.com/HipsterVizNinja/random-data/blob/main/Music/hot-100/Hot%20100.csv

> # Notes

Chart position: create a feature that represents the average chart position for each song in the Billboard Hot 100 dataset.

Peak position: create a feature that represents the peak position for each song in the Billboard Hot 100 dataset.

Times on chart: create a feature that represents the number of times each song appeared on the Billboard Hot 100 chart.

Consecutive weeks: create a feature that represents the number of consecutive weeks each song appeared on the Billboard Hot 100 chart.

Popularity: use the "popularity" feature from the Spotify dataset to represent the overall popularity of each song.

Genre: use the "genre" feature from the Spotify dataset to represent the genre of each song.

Danceability: use the "danceability" feature from the Spotify dataset to represent how suitable each song is for dancing.

Energy: use the "energy" feature from the Spotify dataset to represent the overall energy of each song.

Popularity and chart position: create a new feature that represents the product of the popularity and the average chart position for each song. This feature could capture the relationship between popularity and chart position.

Popularity and times on chart: create a new feature that represents the product of the popularity and the number of times each song appeared on the Billboard Hot 100 chart. This feature could capture the relationship between popularity and how frequently the song has been on the chart.

Popularity and peak position: create a new feature that represents the product of the popularity and the peak position for each song. This feature could capture the relationship between popularity and the highest position the song has achieved on the chart.

Popularity and average of positions: create a new feature that represents the product of the popularity and the average of all the positions the song achieved on the chart. This feature could capture the relationship between popularity and the average performance of the song on the chart.

In [3]:
df = load_csv_file('songs_normalize.csv')

Found File


Loading songs_normalize.csv: 2it [00:00, 149.77it/s]


In [5]:
df.shape

(2000, 18)

In [4]:
df.head()

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726,"rock, pop"
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859,"pop, country"
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992,"rock, metal"
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop


In [37]:
# Popularity Range

In [35]:
df['popularity'].min()

0

In [36]:
df['popularity'].max()

89

In [7]:
hot_100_df = load_csv_file("hot_100.csv")

Found File


Loading hot_100.csv: 337it [00:00, 456.65it/s]


In [10]:
hot_100_df.head(1)

Unnamed: 0,chart_position,chart_date,song,performer,song_id,instance,time_on_chart,consecutive_weeks,previous_week,peak_position,worst_position,chart_debut,chart_url
0,84,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless",1.0,1,,,84,84,1990-05-05,https://www.billboard.com/charts/hot-100/1990-...


In [11]:
# convert the 'chart_date' column to a datetime dtype (no sorting is done in this cell)
hot_100_df['chart_date']=pd.to_datetime(hot_100_df['chart_date'])

In [12]:
hot_100_df.head(1)

Unnamed: 0,chart_position,chart_date,song,performer,song_id,instance,time_on_chart,consecutive_weeks,previous_week,peak_position,worst_position,chart_debut,chart_url
0,84,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless",1.0,1,,,84,84,1990-05-05,https://www.billboard.com/charts/hot-100/1990-...


In [13]:
# Use 'chart_date' as the index and sort the rows by it.
# Guarded so the cell can be re-run: once 'chart_date' is the index it is no
# longer a column, and a second set_index("chart_date") would raise KeyError.
if hot_100_df.index.name != "chart_date":
    hot_100_df = hot_100_df.set_index("chart_date")
hot_100_df = hot_100_df.sort_index()

In [15]:
hot_100_df["performer"]

Unnamed: 0_level_0,chart_position,song,performer,song_id,instance,time_on_chart,consecutive_weeks,previous_week,peak_position,worst_position,chart_debut,chart_url
chart_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1958-08-02,69,Op,The Honeycones,OpThe Honeycones,1.0,1,,,69,69,1958-08-02,https://www.billboard.com/charts/hot-100/1958-...
1958-08-02,33,The Freeze,Tony And Joe,The FreezeTony And Joe,1.0,1,,,33,33,1958-08-02,https://www.billboard.com/charts/hot-100/1958-...
1958-08-02,21,Guess Things Happen That Way,Johnny Cash And The Tennessee Two,Guess Things Happen That WayJohnny Cash And Th...,1.0,1,,,21,21,1958-08-02,https://www.billboard.com/charts/hot-100/1958-...


In [33]:
# Rows whose performer contains "santana" (case-insensitive), then restricted
# to chart dates within the year 2000 via the index slice.
(
    hot_100_df
    .loc[lambda chart: chart['performer'].str.contains('santana', case=False)]
    .loc['2000-01-01':'2000-12-31']
)

Unnamed: 0_level_0,chart_position,song,performer,song_id,instance,time_on_chart,consecutive_weeks,previous_week,peak_position,worst_position,chart_debut,chart_url
chart_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2000-01-01,1,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1.0,23,22.0,1.0,1,78,1999-07-31,https://www.billboard.com/charts/hot-100/2000-...
2000-01-08,1,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1.0,24,23.0,1.0,1,78,1999-07-31,https://www.billboard.com/charts/hot-100/2000-...
2000-01-15,2,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1.0,25,24.0,1.0,1,78,1999-07-31,https://www.billboard.com/charts/hot-100/2000-...
2000-01-22,3,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1.0,26,25.0,2.0,1,78,1999-07-31,https://www.billboard.com/charts/hot-100/2000-...
2000-01-29,3,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1.0,27,26.0,3.0,1,78,1999-07-31,https://www.billboard.com/charts/hot-100/2000-...
...,...,...,...,...,...,...,...,...,...,...,...,...
2000-08-05,47,Maria Maria,Santana Featuring The Product G&B,Maria MariaSantana Featuring The Product G&B,1.0,26,25.0,48.0,1,48,2000-02-12,https://www.billboard.com/charts/hot-100/2000-...
2000-08-12,36,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1.0,55,54.0,39.0,1,78,1999-07-31,https://www.billboard.com/charts/hot-100/2000-...
2000-08-19,43,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1.0,56,55.0,36.0,1,78,1999-07-31,https://www.billboard.com/charts/hot-100/2000-...
2000-08-26,46,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1.0,57,56.0,43.0,1,78,1999-07-31,https://www.billboard.com/charts/hot-100/2000-...


In [22]:
hot_100_df.loc['2000-01-08':'2000-01-10'].sort_values('chart_position')

Unnamed: 0_level_0,chart_position,song,performer,song_id,instance,time_on_chart,consecutive_weeks,previous_week,peak_position,worst_position,chart_debut,chart_url
chart_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2000-01-08,1,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1.0,24,23.0,1.0,1,78,1999-07-31,https://www.billboard.com/charts/hot-100/2000-...
2000-01-08,2,Back At One,Brian McKnight,Back At OneBrian McKnight,1.0,20,19.0,2.0,2,75,1999-08-28,https://www.billboard.com/charts/hot-100/2000-...
2000-01-08,3,I Wanna Love You Forever,Jessica Simpson,I Wanna Love You ForeverJessica Simpson,1.0,13,12.0,3.0,3,69,1999-10-16,https://www.billboard.com/charts/hot-100/2000-...
2000-01-08,4,My Love Is Your Love,Whitney Houston,My Love Is Your LoveWhitney Houston,1.0,19,18.0,4.0,4,81,1999-09-04,https://www.billboard.com/charts/hot-100/2000-...
2000-01-08,5,Hot Boyz,"Missy ""Misdemeanor"" Elliott Featuring NAS, EVE...","Hot BoyzMissy ""Misdemeanor"" Elliott Featuring ...",1.0,7,6.0,7.0,5,36,1999-11-27,https://www.billboard.com/charts/hot-100/2000-...
...,...,...,...,...,...,...,...,...,...,...,...,...
2000-01-08,96,Larger Than Life,Backstreet Boys,Larger Than LifeBackstreet Boys,1.0,17,16.0,87.0,25,96,1999-09-18,https://www.billboard.com/charts/hot-100/2000-...
2000-01-08,97,The Best Man I Can Be,"Ginuwine, R.L., Tyrese, Case","The Best Man I Can BeGinuwine, R.L., Tyrese, Case",1.0,1,,,97,97,2000-01-08,https://www.billboard.com/charts/hot-100/2000-...
2000-01-08,98,L.A. Song,Beth Hart,L.A. SongBeth Hart,1.0,7,6.0,96.0,96,100,1999-11-27,https://www.billboard.com/charts/hot-100/2000-...
2000-01-08,99,Re-arranged,Limp Bizkit,Re-arrangedLimp Bizkit,1.0,6,5.0,95.0,90,99,1999-12-04,https://www.billboard.com/charts/hot-100/2000-...


# XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 

In [None]:
# Fit a 10-estimator XGBoost classifier with a binary logistic objective and
# print its accuracy on the validation split.
# NOTE(review): X_train, y_train, X_validate and y_validate are not defined in
# the visible part of this notebook — confirm they exist before running.
xgb_model = XGBClassifier(
    objective="binary:logistic",
    n_estimators=10,
    seed=123,
)
xgb_model.fit(X_train, y_train)
xgb_predict = xgb_model.predict(X_validate)
xgb_accuracy = accuracy_score(y_validate, xgb_predict)
print(f"Accuracy: {xgb_accuracy}")