In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Fixed seed, passed as `random_state` to the decision tree, random forest,
# MLP and SVM classifiers below so that repeated runs use the same seed.
random_state = 2024

In [None]:
# Read the five train/test CSV pairs and separate the 'genre' label column
# from the remaining (feature) columns. Rows are indexed by (file, section).
X_train_splits, y_train_splits = [], []
X_test_splits, y_test_splits = [], []
for split in range(5):
    train = pd.read_csv(f'./training_data/train_{split}.csv', index_col=['file', 'section'])
    test = pd.read_csv(f'./training_data/test_{split}.csv', index_col=['file', 'section'])

    # Same treatment for both halves: everything except 'genre' goes to X, 'genre' goes to y.
    for frame, features_per_split, labels_per_split in (
        (train, X_train_splits, y_train_splits),
        (test, X_test_splits, y_test_splits),
    ):
        features_per_split.append(frame.drop(columns='genre'))
        labels_per_split.append(frame['genre'])

In [None]:
def predict_probabilities_oaa(clf, X_test):
    """Predict one-against-all labels and the probability of the positive class.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba`` and ``classes_``; its
        ``classes_`` must contain the label ``True``.
    X_test : array-like
        Samples to classify; passed unchanged to ``clf.predict_proba``.

    Returns
    -------
    predicted_classes : numpy.ndarray
        For each sample, the entry of ``clf.classes_`` with the highest
        predicted probability.
    predicted_probabilities : numpy.ndarray
        For each sample, the predicted probability of the ``True`` class.

    Raises
    ------
    ValueError
        If ``True`` is not among ``clf.classes_``.
    """
    all_probabilities = clf.predict_proba(X_test)
    # Look up the column that belongs to True instead of assuming it is
    # column 1, so the result is correct regardless of the order of classes_.
    true_index = list(clf.classes_).index(True)
    predicted_probabilities = all_probabilities[:, true_index]
    predicted_classes = clf.classes_[np.argmax(all_probabilities, axis=1)]
    return predicted_classes, predicted_probabilities

In [None]:
# One-against-all (OAA) experiment: for every split, section method and genre,
# fit each classifier on the binary "is this genre / is not this genre" task
# and write its test-set predictions to
# ./predictions_oaa/all/<model>_oaa_<split>_<section>_<genre>.csv
predictions_dir = Path('./predictions_oaa/all')
predictions_dir.mkdir(parents=True, exist_ok=True)

genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
# Values of the 'section' index level that are selected for each method.
section_labels = {
    'full': ['full'],
    'time_decomposition': ['start', 'middle', 'end'],
}
num_splits = len(X_train_splits)
num_combinations = num_splits * len(section_labels) * len(genres)


def _make_probabilistic_classifiers(n_train_samples):
    """Return fresh, unfitted classifiers keyed by their output-file prefix.

    All of them are evaluated through predict_probabilities_oaa.
    `n_train_samples` is used as the MLP batch size, i.e. one batch spans the
    whole training set.
    """
    return {
        'decision_tree': DecisionTreeClassifier(random_state=random_state),
        'random_forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
        'knn': KNeighborsClassifier(n_neighbors=3),
        'naive_bayes': GaussianNB(),
        'mlp': MLPClassifier(hidden_layer_sizes=(30,), batch_size=n_train_samples, max_iter=5000, random_state=random_state),
    }


i = 0
for split in range(num_splits):
    for section, labels in section_labels.items():
        # The row selection depends only on split and section, so it is done
        # once here instead of once per genre.
        ix = pd.IndexSlice[:, labels]
        X_train = X_train_splits[split].loc[ix, :]
        X_test = X_test_splits[split].loc[ix, :]
        y_train = y_train_splits[split].loc[ix]
        y_test = y_test_splits[split].loc[ix]

        for genre in genres:
            i += 1
            print(f'now generating predictions for combination {i} of {num_combinations} - split {split}, method {section}, genre {genre}                 ', end='\r')

            # relabel samples for oaa - belongs to genre x vs. does not belong to genre x
            y_train_oaa = (y_train == genre)
            y_test_oaa = (y_test == genre)

            for model_name, clf in _make_probabilistic_classifiers(X_train.shape[0]).items():
                clf.fit(X_train, y_train_oaa)
                result = y_test.to_frame()
                result['expected'] = y_test_oaa
                result['predicted'], result['probability_true'] = predict_probabilities_oaa(clf, X_test)
                result.to_csv(predictions_dir / f'{model_name}_oaa_{split}_{section}_{genre}.csv')

            # The SVM is handled separately: it is fitted without probability
            # estimates, so the signed decision_function value is stored as
            # 'score' instead of a 'probability_true' column.
            clf_svm = SVC(random_state=random_state).fit(X_train, y_train_oaa)
            result_svm = y_test.to_frame()
            result_svm['expected'] = y_test_oaa
            result_svm['predicted'] = clf_svm.predict(X_test)
            result_svm['score'] = clf_svm.decision_function(X_test)
            assert (result_svm['predicted'] == (result_svm['score'] >= 0)).all() # ensure that False has scores below 0
            result_svm.to_csv(predictions_dir / f'svm_oaa_{split}_{section}_{genre}.csv')
print('\ndone!')