In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Seed passed as `random_state` to the stochastic estimators in this notebook
# (decision tree, random forest, MLP).
random_state = 2024

In [None]:
# Read the five train/test CSV pairs from ./training_data. Every frame is indexed
# by (file, section); the 'genre' column is the label, all other columns are features.
index_columns = ['file', 'section']
X_train_splits, y_train_splits = [], []
X_test_splits, y_test_splits = [], []

for split in range(5):
    train = pd.read_csv(f'./training_data/train_{split}.csv', index_col=index_columns)
    test = pd.read_csv(f'./training_data/test_{split}.csv', index_col=index_columns)

    # Separate features from labels for both halves of this split.
    for frame, feature_store, label_store in (
        (train, X_train_splits, y_train_splits),
        (test, X_test_splits, y_test_splits),
    ):
        feature_store.append(frame.drop(columns='genre'))
        label_store.append(frame['genre'])

In [None]:
def predict_probabilities(clf, X_test):
    """Return the most probable class and its probability for every row of X_test.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba`` and ``classes_``.
    X_test : samples to score; passed to ``clf.predict_proba`` unchanged.

    Returns
    -------
    tuple
        ``(predicted_classes, predicted_probabilities)``: per row, the entry of
        ``clf.classes_`` with the highest probability and that probability.
    """
    class_probabilities = np.asarray(clf.predict_proba(X_test))
    # Column index of the winning class in each row (first one on ties).
    winners = class_probabilities.argmax(axis=1)
    row_ids = np.arange(class_probabilities.shape[0])
    return clf.classes_[winners], class_probabilities[row_ids, winners]

In [None]:
# Fit every baseline classifier on every (split, method) combination and write one
# CSV per classifier/split/method into ./predictions_bl. Each CSV holds the true
# genre plus the predicted class and the probability the classifier gave it.
Path('./predictions_bl').mkdir(exist_ok=True)

# 'section' index labels that make up each method.
sections_by_method = {
    'full': ['full'],
    'time_decomposition': ['start', 'middle', 'end'],
}
num_splits = len(X_train_splits)
num_combinations = num_splits * len(sections_by_method)

i = 0
for split in range(num_splits):
    for section, section_labels in sections_by_method.items():
        i += 1
        # '\r' plus trailing padding rewrites the same output line on every iteration.
        print(f'now generating predictions for combination {i} of {num_combinations} - split {split}, method {section}                   ', end='\r')

        # Keep all files, but only the rows whose 'section' level belongs to this method.
        ix = pd.IndexSlice[:, section_labels]
        X_train = X_train_splits[split].loc[ix, :]
        X_test = X_test_splits[split].loc[ix, :]
        y_train = y_train_splits[split].loc[ix]
        y_test = y_test_splits[split].loc[ix]

        # Fresh, unfitted estimators for each combination; keys are the output-file prefixes.
        baseline_models = {
            'decision_tree': DecisionTreeClassifier(random_state=random_state),
            'random_forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
            'knn': KNeighborsClassifier(n_neighbors=3),
            'naive_bayes': GaussianNB(),
            # batch_size equals the number of training rows, i.e. a single batch covers the whole training set.
            'mlp': MLPClassifier(hidden_layer_sizes=(30,), batch_size=X_train.shape[0], max_iter=5000, random_state=random_state),
        }

        for model_name, model in baseline_models.items():
            model.fit(X_train, y_train)
            # Start from the true labels so the CSV keeps the (file, section) index and 'genre'.
            result = y_test.to_frame()
            result['predicted'], result['probability'] = predict_probabilities(model, X_test)
            result.to_csv(f'./predictions_bl/{model_name}_bl_{split}_{section}.csv')
print('\ndone!')

### Playground: inspect a single MLP on split 0 (full-file features)

In [None]:
# Playground data: split 0 restricted to the rows whose 'section' level is 'full'.
X_train, X_test, y_train, y_test = (
    splits[0].xs('full', level='section')
    for splits in (X_train_splits, X_test_splits, y_train_splits, y_test_splits)
)

In [None]:
# Number of feature columns; read from the shape so it also works on an empty test frame.
X_test.shape[1]

In [None]:
# Same MLP configuration as the baseline loop: batch_size follows the training-set
# size instead of a hard-coded 800, so it stays full-batch for any split size.
clf = MLPClassifier(hidden_layer_sizes=(30,), batch_size=X_train.shape[0], max_iter=5000, random_state=random_state).fit(X_train, y_train)

In [None]:
# Class labels of the fitted model; predict_probabilities indexes into this array.
clf.classes_

In [None]:
# Predicted label for every test row.
clf.predict(X_test)

In [None]:
# Per-class probabilities for every test row.
clf.predict_proba(X_test)

In [None]:
# Highest class probability per test row.
clf.predict_proba(X_test).max(axis=1)

In [None]:
# Check the helper on the playground model: returns (predicted classes, their probabilities).
predict_probabilities(clf, X_test)