In [None]:
from pathlib import Path
import itertools

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Seed handed to every estimator in this notebook that accepts a `random_state`
# argument (decision tree, random forest, MLP, SVC), to keep runs repeatable.
random_state = 2024

In [None]:
# Read the five pre-generated train/test CSV pairs into two parallel lists.
# Each frame is indexed by the ('file', 'section') column pair, so position k
# in either list corresponds to split k.
train_splits = []
test_splits = []
for split in range(5):
    train = pd.read_csv(
        f'./training_data/train_{split}.csv',
        index_col=['file', 'section'],
    )
    test = pd.read_csv(
        f'./training_data/test_{split}.csv',
        index_col=['file', 'section'],
    )
    train_splits += [train]
    test_splits += [test]

In [None]:
def predict_probabilities(clf, X_test):
    """Return the most probable class and its probability for every row.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba`` and ``classes_``.
    X_test : samples to score; passed to ``clf.predict_proba`` unchanged.

    Returns
    -------
    tuple
        ``(labels, probabilities)``: ``labels`` holds, per row, the entry of
        ``clf.classes_`` with the highest probability, and ``probabilities``
        holds that highest probability.
    """
    proba = clf.predict_proba(X_test)
    # Column index of the highest-probability class in each row.
    best_column = np.argmax(proba, axis=1)
    return clf.classes_[best_column], np.max(proba, axis=1)

In [None]:
def _probability_estimators(n_train_rows):
    """Build fresh, unfitted estimators whose predictions come from ``predict_proba``.

    Parameters
    ----------
    n_train_rows : int
        Number of training rows; used as the MLP ``batch_size`` so the MLP is
        trained on the whole training set in a single batch.

    Returns
    -------
    dict
        Maps the prefix used in the output file name to the estimator. The
        insertion order is the order in which the models are fitted.
    """
    return {
        'decision_tree': DecisionTreeClassifier(random_state=random_state),
        'random_forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
        'knn': KNeighborsClassifier(n_neighbors=3),
        'naive_bayes': GaussianNB(),
        'mlp': MLPClassifier(hidden_layer_sizes=(30,), batch_size=n_train_rows, max_iter=5000, random_state=random_state),
    }


output_dir = Path('./predictions_rr/all')
output_dir.mkdir(parents=True, exist_ok=True)

genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
# Method name -> labels of the 'section' index level that belong to that method.
sections = {
    'full': ['full'],
    'time_decomposition': ['start', 'middle', 'end'],
}
genre_pairs = list(itertools.combinations(genres, 2))
num_splits = len(train_splits)
# Derived from the iterables so the progress counter always matches the loops
# (5 splits x 2 methods x 45 genre pairs with the current inputs).
num_combinations = num_splits * len(sections) * len(genre_pairs)

i = 0
for split in range(num_splits):
    for section, section_labels in sections.items():
        # Everything in this stanza is independent of the genre pair, so it is
        # computed once per (split, method) instead of once per pair.
        ix = pd.IndexSlice[:, section_labels]
        train = train_splits[split].loc[ix, :]
        test = test_splits[split].loc[ix, :]

        # The test rows are NOT restricted to the genre pair: every pairwise
        # model scores every test row of the selected sections.
        # NOTE(review): presumably so the pairwise outputs can be combined
        # afterwards (the 'rr' in the paths) - confirm.
        X_test = test.drop(columns=['genre'])
        y_test = test['genre']

        for genre_pair in genre_pairs:
            i += 1
            print(f'now generating predictions for combination {i} of {num_combinations} - split {split}, method {section}, genres {genre_pair[0]} vs. {genre_pair[1]}                  ', end='\r')

            # Training data is limited to the two genres being separated.
            X_train = train[train['genre'].isin(genre_pair)]
            y_train = X_train['genre']
            X_train = X_train.drop(columns=['genre'])

            file_suffix = f'rr_{split}_{section}_{genre_pair[0]}_{genre_pair[1]}.csv'

            # Models exposing predict_proba share one fit / predict / save path.
            # Each CSV holds the true genre, the predicted genre and the
            # probability of that prediction.
            for model_name, clf in _probability_estimators(X_train.shape[0]).items():
                clf.fit(X_train, y_train)
                result = y_test.to_frame()
                result['predicted'], result['probability'] = predict_probabilities(clf, X_test)
                result.to_csv(output_dir / f'{model_name}_{file_suffix}')

            # The SVC is handled separately: it stores the signed decision
            # score instead of a probability.
            clf_svm = SVC(random_state=random_state).fit(X_train, y_train)
            result_svm = y_test.to_frame()
            result_svm['predicted'] = clf_svm.predict(X_test)
            result_svm['score'] = clf_svm.decision_function(X_test)
            # Sanity check: a row predicted as the first genre of the pair must
            # never carry a positive score.
            first_genre_with_positive_score = (result_svm['predicted'] == genre_pair[0]) & (result_svm['score'] > 0)
            assert not first_genre_with_positive_score.any(), 'SVM predicted the first genre of the pair with a positive decision score'
            result_svm.to_csv(output_dir / f'svm_{file_suffix}')
print('\ndone!')