# Imports and helper functions to load, split, relabel, and evaluate the data

In [60]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

def load_spotify_data(path=None):
    """Read the labelled Spotify training CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. When omitted, ``data/278_labelled_uri_train.csv``
        (relative to the current working directory) is used.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, exactly as ``pd.read_csv`` returns them.
    """
    if path is None:
        path = os.path.join('data', '278_labelled_uri_train.csv')
    return pd.read_csv(path)

def split_data(df, test_size=0.2, random_state=42, label_col='labels'):
    """Split a labelled frame into train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns plus the label column.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.
    label_col : str, default 'labels'
        Name of the column holding the target.

    Returns
    -------
    tuple
        ``(train_features, train_labels, test_features, test_labels)`` where
        the feature frames are the split parts without ``label_col`` and the
        label objects are that column of each part.
    """
    train_part, test_part = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    return (
        train_part.drop(columns=label_col),
        train_part[label_col],
        test_part.drop(columns=label_col),
        test_part[label_col],
    )

def rename_labels(df):
    """Replace the numeric label codes 0-3 with descriptive strings, in place.

    The object passed in is modified directly, so callers may ignore the
    return value; the same object is also returned for convenience.
    """
    display_names = {
        0: '0 (sad)',
        1: '1 (happy)',
        2: '2 (energetic)',
        3: '3 (calm)',
    }
    # Codes are rewritten one at a time, in ascending order.
    for code, text in display_names.items():
        df.loc[df == code] = text
    return df

def eval(model, X_train, X_val, y_train=None, y_val=None):
    """Print confusion matrix and classification report for both data sets.

    The held-out set is reported first (under the heading ``TEST``), followed
    by the training set (``TRAIN``).

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_train, X_val : array-like
        Training and held-out features passed to ``model.predict``.
    y_train, y_val : array-like, optional
        True labels. When omitted, the notebook-level variables ``y_train`` /
        ``y_val`` are looked up at call time. They used to be bound as default
        values at definition time, which fails on a fresh kernel because the
        labels are only created in a later cell.

    Raises
    ------
    NameError
        If labels are omitted and no matching notebook-level variable exists.

    Notes
    -----
    This function shadows the built-in ``eval`` within the notebook; the name
    is kept so existing calls continue to work.
    """
    # Resolve omitted labels lazily from the notebook namespace.
    notebook_scope = globals()
    if y_train is None:
        if 'y_train' not in notebook_scope:
            raise NameError("y_train was not passed and is not defined in the notebook")
        y_train = notebook_scope['y_train']
    if y_val is None:
        if 'y_val' not in notebook_scope:
            raise NameError("y_val was not passed and is not defined in the notebook")
        y_val = notebook_scope['y_val']

    print('TEST')
    y_val_pred = model.predict(X_val)
    print(confusion_matrix(y_val, y_val_pred))
    print(classification_report(y_val, y_val_pred))
    print("-------------------------------------------------------")
    print('TRAIN')
    y_train_pred = model.predict(X_train)
    print(confusion_matrix(y_train, y_train_pred))
    print(classification_report(y_train, y_train_pred))

def drop_columns(cols_to_drop, df_train, df_val):
    """Remove the same columns from the training and validation frames.

    Both frames are modified in place (training frame first) and nothing is
    returned.
    """
    for frame in (df_train, df_val):
        frame.drop(columns=cols_to_drop, inplace=True)


# Load the data and split it

In [64]:
X_train, y_train, X_val, y_val = split_data(load_spotify_data())

# rename_labels returns the Series it was given; rebinding makes the data flow
# explicit instead of relying on the in-place side effect alone.
y_train = rename_labels(y_train)
y_val = rename_labels(y_val)

# Columns excluded from the feature set for this experiment.
columns_to_drop = [
    "uri", "Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.2",
    "duration (ms)", "speechiness", "liveness", "tempo", "spec_rate",
]
# Alternative: drop only the first four columns and keep the remaining features.
# columns_to_drop = ["uri", "Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.2"]

drop_columns(columns_to_drop, X_train, X_val)



# First try with an SGDClassifier

In [65]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Standardize the features, then fit a linear classifier trained with SGD.
# A fixed random_state makes the fit, and therefore the scores printed below,
# repeatable from run to run.
clf = make_pipeline(
    StandardScaler(),
    SGDClassifier(max_iter=1000, tol=0.01, random_state=42),
)

clf.fit(X_train, y_train)

eval(clf, X_train, X_val, y_train, y_val)

TEST
[[ 9188  2353   563  1034]
 [ 1642 14390   892   196]
 [  134  1988  5450    30]
 [  109    64    15  6422]]
               precision    recall  f1-score   support

      0 (sad)       0.83      0.70      0.76     13138
    1 (happy)       0.77      0.84      0.80     17120
2 (energetic)       0.79      0.72      0.75      7602
     3 (calm)       0.84      0.97      0.90      6610

     accuracy                           0.80     44470
    macro avg       0.80      0.81      0.80     44470
 weighted avg       0.80      0.80      0.79     44470

-------------------------------------------------------
TRAIN
[[36800  9390  2260  3998]
 [ 6155 57367  3720   763]
 [  534  8068 21385    88]
 [  398   257    66 26631]]
               precision    recall  f1-score   support

      0 (sad)       0.84      0.70      0.76     52448
    1 (happy)       0.76      0.84      0.80     68005
2 (energetic)       0.78      0.71      0.74     30075
     3 (calm)       0.85      0.97      0.91     27