In [1]:
from path import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

data = Path('../Resources/data.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [2]:
# filter songs for the last 10 years
df = df[df["year"]>2010]
df.shape

(26628, 19)

In [3]:
# Seperate dataset into features (X) and target (y)
y=df["popularity"]
X=df.drop(columns=["popularity","id", "explicit", "name", "mode", "key", "release_date", "year", "artists"])

In [4]:
# confirm feature size
X.shape

(26628, 10)

In [5]:
# split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y,  random_state=1)
X_train.shape

(19971, 10)

In [6]:
# Scaling data for faster training time
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Import SVC model from Scikit-learn. Orientation of the hyperplane is linear. Increased # of Iterations because of the large size. 
# Might need to use linearSVM. Scales for larger data sets. https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
from sklearn.svm import LinearSVC
model = LinearSVC(max_iter=100000, verbose = True)

In [8]:
# Train the model
model.fit(X_train_scaled, y_train)

[LibLinear]

LinearSVC(max_iter=100000, verbose=True)

In [9]:
# Create precictions with the model
y_pred = model.predict(X_test)
results = pd.DataFrame({
   "Prediction": y_pred,
   "Actual": y_test
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,70
3,0,8
4,0,0


In [10]:
# Asses acuraccy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.3485053327324621

In [11]:
# Generate confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.35      1.00      0.52      2320
           1       0.00      0.00      0.00       245
           2       0.00      0.00      0.00       176
           3       0.00      0.00      0.00       147
           4       0.00      0.00      0.00       137
           5       0.00      0.00      0.00       101
           6       0.00      0.00      0.00        76
           7       0.00      0.00      0.00        78
           8       0.00      0.00      0.00        62
           9       0.00      0.00      0.00        56
          10       0.00      0.00      0.00        52
          11       0.00      0.00      0.00        57
          12       0.00      0.00      0.00        71
          13       0.00      0.00      0.00        48
          14       0.00      0.00      0.00        56
          15       0.00      0.00      0.00        42
          16       0.00      0.00      0.00        44
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
