In [145]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import music21
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import average_precision_score, f1_score, confusion_matrix, fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import precision_recall_fscore_support, classification_report

In [146]:
# Construct the dataset using the pickle files obtained in feature_extraction.ipynb. This is done in order
# not to run the part extracting the features every time. 

X_train = pd.read_pickle(r'data/X_train.pickle')
X_validation = pd.read_pickle(r'data/X_validation.pickle')
X_test = pd.read_pickle(r'data/X_test.pickle')
y_train = pd.read_pickle(r'data/y_train.pickle')
y_validation = pd.read_pickle(r'data/y_validation.pickle')
y_test = pd.read_pickle(r'data/y_test.pickle')

In [147]:
# Distribute the validation set, that was present initially, between train and test sets

X_test = np.concatenate((X_test, X_validation[:int(len(X_validation) / 2)]))
y_test = np.concatenate((y_test, y_validation[:int(len(X_validation) / 2)]))
X_train = np.concatenate((X_train, X_validation[int((len(X_validation) / 2) + 1):]))
y_train = np.concatenate((y_train, y_validation[int((len(X_validation) / 2) + 1):]))

In [148]:
# Remove constant features

vt = VarianceThreshold(threshold=0)
vt.fit(X_train)
X_train = vt.transform(X_train)
X_test = vt.transform(X_test)

In [100]:
# Remove highly correlated features
# Based on the code at https://www.dezyre.com/recipes/drop-out-highly-correlated-features-in-python

df = pd.DataFrame(X_train)
correlation_matrix = df.corr()
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column].abs() > 0.9)]
df1 = df.drop(df.columns[to_drop], axis=1)
X_train = df1.to_numpy()

df = pd.DataFrame(X_test)
df = df.drop(df.columns[to_drop], axis=1)

X_test = df.to_numpy()
X_test.shape

(246, 122)

In [149]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1035, 125)
(1035,)
(246, 125)
(246,)


In [150]:
# Scale the dataset, since it is necessary for some of the models used

ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [151]:
# Define a function to calculate the ordinal classification index, given the true and predicted labels
# Based on this source (https://www.researchgate.net/publication/220360337_Measuring_the_Performance_of_Ordinal_Classification)

def ordinal_classification_index(y, y_pred):
    cMatrix = confusion_matrix(y, y_pred)
    k = len(cMatrix)
    n = np.sum(cMatrix)
    gamma = 1
    beta = 0.75 / (n * (k - 1)**gamma)
    helperM2 = np.zeros((k, k))
    
    for r in range(0, k):
        for c in range(0, k):
            helperM2[r, c] = cMatrix[r, c] * ((np.abs(r - c))**gamma)
            
    total_dispersion = (np.sum(helperM2)**(1 / gamma))
    helperM1 = cMatrix / (total_dispersion + n)
    err_matrix = np.zeros((k, k))
    err_matrix[0, 0] = 1 - helperM1[0, 0] + beta * helperM2[0, 0]
    for r in range(1, k):
        c = 0
        err_matrix[r, c] = err_matrix[r - 1, c] - helperM1[r, c] + beta * helperM2[r, c]

    for c in range(1, k):
        r = 0
        err_matrix[r, c] = err_matrix[r, c - 1] - helperM1[r, c] + beta * helperM2[r, c]
    
    for c in range(1, k):
        for r in range(1, k):
            cost_up = err_matrix[r - 1, c]
            cost_left = err_matrix[r, c - 1]
            left_top_cost = err_matrix[r - 1, c - 1]
            aux = np.min([cost_up, cost_left, left_top_cost])
            err_matrix[r, c] = aux - helperM1[r, c] + beta * helperM2[r, c]
       
    oc = err_matrix[-1, -1]

    return oc

In [105]:
oci = make_scorer(ordinal_classification_index, greater_is_better=False)

In [107]:
grid = {"n_neighbors": range(1, 300)}
model = KNeighborsClassifier()
gs = GridSearchCV(model, param_grid=grid, verbose=0, scoring=oci)
gs.fit(X_train, y_train)
knn_best_params = gs.best_params_
knn = gs.best_estimator_

In [108]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=39)

In [143]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

LinearDiscriminantAnalysis()

In [110]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)



QuadraticDiscriminantAnalysis()

In [139]:
nb = GaussianNB()
nb.fit(X_train, y_train)

GaussianNB()

In [67]:
grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 4, 6],
    'max_depth': [10, 30, 50],
    'max_features': [2, 3],
    'n_estimators': [100, 200, 500],
    'random_state': range(0, 10)
}

rf = RandomForestClassifier()

gs = GridSearchCV(estimator=rf, param_grid=grid, cv=3)

gs.fit(X_train, y_train)

rf = gs.best_estimator_

In [74]:
grid = {
    "C": [0.001, 0.01, 1, 10, 100, 1000], 
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001], 
    'kernel':['linear', 'rbf','poly', 'sigmoid']
}

md = SVC()
gs = GridSearchCV(md, param_grid=grid,verbose=0, scoring=oci)
gs.fit(X_train, y_train)
svc_best_params = gs.best_params_
svc = gs.best_estimator_

In [152]:
svc.fit(X_train, y_train)

SVC(C=1, gamma=0.01)

In [127]:
models = {
    "K Nearest Neighbors": knn,
    "LDA": lda,
    "QDA": qda,
    "GaussianNB": nb,
    "SVC": svc,
    "Random Forest": rf
}

In [128]:
# Get the classification reports

for name, model in models.items():
    pred = model.predict(X_test)
    print()
    print(name)
    print(classification_report(y_pred=pred, y_true=y_test))
    print()
    print("OCI:", ordinal_classification_index(y_test, pred), '\n\n')


K Nearest Neighbors
              precision    recall  f1-score   support

           0       0.77      0.67      0.71        30
           1       0.50      0.45      0.47        38
           2       0.80      0.91      0.85       164
           3       0.00      0.00      0.00        14

    accuracy                           0.76       246
   macro avg       0.52      0.51      0.51       246
weighted avg       0.71      0.76      0.73       246


OCI: 0.3230736367873233 



LDA
              precision    recall  f1-score   support

           0       0.43      0.63      0.51        30
           1       0.44      0.53      0.48        38
           2       0.79      0.65      0.71       164
           3       0.10      0.14      0.11        14

    accuracy                           0.60       246
   macro avg       0.44      0.49      0.46       246
weighted avg       0.65      0.60      0.62       246


OCI: 0.5357204635876146 



QDA
              precision    recall  f1-score

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [129]:
# Sort the models according to the OCI

table = pd.DataFrame(columns=["name", "OCI"])

for name, model in models.items():
    pred = model.predict(X_test)
    table = table.append({"name": name,
                          "OCI": ordinal_classification_index(model.predict(X_test), y_test)},
                           ignore_index=True
                        )

table.sort_values("OCI")

Unnamed: 0,name,OCI
5,Random Forest,0.247311
4,SVC,0.296869
0,K Nearest Neighbors,0.323074
2,QDA,0.420632
3,GaussianNB,0.440295
1,LDA,0.53572
