In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
sns.set_style("dark")

### Partition data

In [2]:
# Load dataset and preview
df = pd.read_csv("project_train.csv")

# Partition data set
train, test = train_test_split(df, test_size=0.2)

train_labels = train['Label']
test_labels = test['Label']

train = train.drop('Label', axis = 1)
test = test.drop('Label', axis = 1)


# Normalize data in sets
train = (train - train.min()) / (train.max() - train.min())
test = (test - test.min()) / (test.max() - test.min())

### k-Nearest Neighbour

In [22]:
# Apply k Nearest Neighbours to assign labels
features = train.columns
feature_combinations = sum([list(combinations(features, i)) for i in range(1,len(features))], [])

cross_val_accuracy, test_accuracy, feature_set = [], [], []

for feats in tqdm(feature_combinations):
    train_feats = train[list(feats)]
    test_feats = test[list(feats)]

    feature_set.append(feats)
    knn = KNeighborsClassifier(n_neighbors=32)
    knn.fit(train_feats, train_labels)

    # cross_validate
    cv_accuracy = cross_val_score(knn, train_feats, train_labels, cv=5)

    predicted_labels = knn.predict(test_feats)
    prediction_accuracy = np.sum(predicted_labels==test_labels) / len(test_labels)
    cross_val_accuracy.append(np.mean(cv_accuracy))
    test_accuracy.append(prediction_accuracy)

ind = np.argmax(cross_val_accuracy)

print("\nFeature set: ", feature_set[ind], "\nCV-Accuracy: ", cross_val_accuracy[ind], "\nTest Accuracy: ", test_accuracy[ind])

100%|██████████| 2046/2046 [01:33<00:00, 21.96it/s]
Feature set:  ('danceability', 'energy', 'key', 'speechiness', 'liveness', 'tempo') 
CV-Accuracy:  0.8056610056610058 
Test Accuracy:  0.7731958762886598



### Support Vector Machine

In [4]:
pipeline = []
pipeline.append(('rbf', svm.SVC()))
# pipeline.append(('linear', svm.SVC(kernel = 'linear')))
# pipeline.append(('poly', svm.SVC(kernel = 'poly')))

features = train.columns
feature_combinations = sum([list(combinations(features, i)) for i in range(1,len(features))], [])
cross_val_accuracy, test_accuracy, feature_set = [], [], []

for feats in tqdm(feature_combinations):
    train_feats = train[list(feats)]
    test_feats = test[list(feats)]

    for modelname, model in pipeline:
        feature_set.append([modelname, feats])
        cv_accuracy = cross_val_score(model, train_feats, train_labels, cv=5)
        cross_val_accuracy.append(np.mean(cv_accuracy))
        model.fit(train_feats, train_labels)
        predicted_labels = model.predict(test_feats)
        prediction_accuracy = np.sum(predicted_labels==test_labels) / len(test_labels)
        test_accuracy.append(prediction_accuracy)


ind = np.argmax(cross_val_accuracy)
print("\nFeature set: ", feature_set[ind], "\nCV-Accuracy: ", cross_val_accuracy[ind], "\nTest Accuracy: ", test_accuracy[ind])


100%|██████████| 2046/2046 [01:30<00:00, 22.51it/s]
Feature set:  ['rbf', ('danceability', 'loudness', 'speechiness', 'acousticness', 'liveness', 'valence')] 
CV-Accuracy:  0.8028971028971028 
Test Accuracy:  0.845360824742268



### Random Forest


In [27]:
forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=6)
cv_accuracy = cross_val_score(forest, train, train_labels, cv=5)
cross_val_accuracy = np.mean(cv_accuracy)

forest.fit(train, train_labels)
prediction = forest.predict(test)
prediction_accuracy = np.sum(prediction==test_labels) / len(test_labels)
print("\nCV-Accuracy: ", cross_val_accuracy, "\nTest Accuracy: ", prediction_accuracy)


CV-Accuracy:  0.8238761238761239 
Test Accuracy:  0.8551724137931035
