In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#from fsspec.registry import known_implementations
#from fs import open_fs
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# from tensorflow.keras.utils import to_categorical

In [6]:
spotify_df = pd.read_csv('../Resources/tracks.csv')
spotify_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [7]:
#  Remove features that create noise and result in improved model
removed_df = spotify_df.drop(columns=['id', 'duration_ms', 'id_artists', 'key', 'mode', 'time_signature','valence', 'tempo','liveness'])
# Drop the null columns where all values are null
spotify_df = removed_df.dropna(axis='columns', how='all')
# Drop the null rows
cleaned_df = removed_df.dropna()
cleaned_df.describe()

Unnamed: 0,popularity,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness
count,586601.0,586601.0,586601.0,586601.0,586601.0,586601.0,586601.0,586601.0
mean,27.573212,0.044091,0.563612,0.542071,-10.205789,0.10487,0.449803,0.113425
std,18.369417,0.205298,0.166101,0.25191,5.089422,0.179902,0.348812,0.266843
min,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0
25%,13.0,0.0,0.453,0.343,-12.891,0.034,0.0969,0.0
50%,27.0,0.0,0.577,0.549,-9.242,0.0443,0.422,2.4e-05
75%,41.0,0.0,0.686,0.748,-6.481,0.0763,0.784,0.00955
max,100.0,1.0,0.991,1.0,5.376,0.971,0.996,1.0


In [8]:
#Review the correlation of popularity to the other attributes
#popularity is our dependent variable for regression (x-value)
pop_corr = cleaned_df.corr()["popularity"]

In [9]:
#This provides the r-value of correlation
pop_corr

popularity          1.000000
explicit            0.211749
danceability        0.186878
energy              0.302178
loudness            0.327001
speechiness        -0.047415
acousticness       -0.370723
instrumentalness   -0.236403
Name: popularity, dtype: float64

In [10]:
#This computes the r-squared value. 
#note: in this example, the higher r-squared value would indicate that the higher the acousticness level, the less popular the song is
pop_r2 = pop_corr*pop_corr

In [11]:
pop_r2

popularity          1.000000
explicit            0.044838
danceability        0.034923
energy              0.091312
loudness            0.106930
speechiness         0.002248
acousticness        0.137436
instrumentalness    0.055886
Name: popularity, dtype: float64

In [12]:
#Temporarily removed popularity column to visualize correlations between other variables easier.  Looking to see if any of the variables is a predictor of another variable (highly correlated).
remove_pop = cleaned_df.drop(columns=['popularity'])
remove_pop.head()

Unnamed: 0,name,explicit,artists,release_date,danceability,energy,loudness,speechiness,acousticness,instrumentalness
0,Carve,0,['Uli'],1922-02-22,0.645,0.445,-13.338,0.451,0.674,0.744
1,Capítulo 2.16 - Banquero Anarquista,0,['Fernando Pessoa'],1922-06-01,0.695,0.263,-22.136,0.957,0.797,0.0
2,Vivo para Quererte - Remasterizado,0,['Ignacio Corsini'],1922-03-21,0.434,0.177,-21.18,0.0512,0.994,0.0218
3,El Prisionero - Remasterizado,0,['Ignacio Corsini'],1922-03-21,0.321,0.0946,-27.961,0.0504,0.995,0.918
4,Lady of the Evening,0,['Dick Haymes'],1922,0.402,0.158,-16.9,0.039,0.989,0.13


In [13]:
#Testing different correlation methods to see which one offers the best analysis
var_corr1 = remove_pop.corr(method='kendall')
var_corr1

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness
explicit,1.0,0.122225,0.101236,0.122786,0.173292,-0.117415,-0.08608
danceability,0.122225,1.0,0.145264,0.130604,0.151689,-0.13783,-0.15338
energy,0.101236,0.145264,1.0,0.577148,0.107438,-0.525178,-0.08698
loudness,0.122786,0.130604,0.577148,1.0,0.012216,-0.367519,-0.183678
speechiness,0.173292,0.151689,0.107438,0.012216,1.0,-0.02057,-0.075904
acousticness,-0.117415,-0.13783,-0.525178,-0.367519,-0.02057,1.0,0.073397
instrumentalness,-0.08608,-0.15338,-0.08698,-0.183678,-0.075904,0.073397,1.0


In [14]:
#Testing different correlation methods to see which one offers the best analysis
var_corr2 = remove_pop.corr(method='pearson')
var_corr2

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness
explicit,1.0,0.150216,0.12306,0.134598,0.102251,-0.149001,-0.06751
danceability,0.150216,1.0,0.241464,0.251394,0.199291,-0.242838,-0.225831
energy,0.12306,0.241464,1.0,0.764744,-0.05356,-0.715366,-0.195727
loudness,0.134598,0.251394,0.764744,1.0,-0.16714,-0.519423,-0.329255
speechiness,0.102251,0.199291,-0.05356,-0.16714,1.0,0.069121,-0.102425
acousticness,-0.149001,-0.242838,-0.715366,-0.519423,0.069121,1.0,0.204312
instrumentalness,-0.06751,-0.225831,-0.195727,-0.329255,-0.102425,0.204312,1.0


In [15]:
#Testing different correlation methods to see which one offers the best analysis
var_corr3 = remove_pop.corr(method='spearman')
var_corr3

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness
explicit,1.0,0.149568,0.123922,0.150376,0.212058,-0.143744,-0.100946
danceability,0.149568,1.0,0.216573,0.193832,0.23417,-0.202743,-0.219107
energy,0.123922,0.216573,1.0,0.77103,0.167386,-0.718286,-0.125822
loudness,0.150376,0.193832,0.77103,1.0,0.021027,-0.5289,-0.258086
speechiness,0.212058,0.23417,0.167386,0.021027,1.0,-0.037929,-0.111726
acousticness,-0.143744,-0.202743,-0.718286,-0.5289,-0.037929,1.0,0.110663
instrumentalness,-0.100946,-0.219107,-0.125822,-0.258086,-0.111726,0.110663,1.0


In [17]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 586601 entries, 0 to 586671
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   name              586601 non-null  object 
 1   popularity        586601 non-null  int64  
 2   explicit          586601 non-null  int64  
 3   artists           586601 non-null  object 
 4   release_date      586601 non-null  object 
 5   danceability      586601 non-null  float64
 6   energy            586601 non-null  float64
 7   loudness          586601 non-null  float64
 8   speechiness       586601 non-null  float64
 9   acousticness      586601 non-null  float64
 10  instrumentalness  586601 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 53.7+ MB


In [18]:
y = cleaned_df.loc[:, 'acousticness']
X = cleaned_df.drop(['name','artists','release_date','acousticness'],axis=1)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=100)

In [20]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model

LinearRegression()

In [29]:
lin_model = LinearRegression()
lin_model.fit(X_test, y_test)
lin_acc = lin_model.score(X_test, y_test)
print(lin_acc)

0.5538679981437021


In [30]:
print(lin_model.score(X_train,y_train))

0.5539426012504156


In [31]:
y = cleaned_df.loc[:, 'popularity']
X = cleaned_df.drop(['name','artists','release_date','popularity'],axis=1)

In [32]:
lin_model = LinearRegression()
lin_model.fit(X_test, y_test)
lin_acc = lin_model.score(X_test, y_test)
print(lin_acc)

0.5538679981437021


In [33]:
print(lin_model.score(X_train,y_train))

0.5539426012504156


In [40]:
y = cleaned_df.loc[:, 'energy']
X = cleaned_df.drop(['name','artists','release_date','energy'],axis=1)

In [38]:
lin_model = LinearRegression()
lin_model.fit(X_test, y_test)
lin_acc = lin_model.score(X_test, y_test)
print(lin_acc)

0.5538679981437021


In [39]:
print(lin_model.score(X_train,y_train))

0.5539426012504156
