In [None]:
from collections import defaultdict

import pandas as pd

In [None]:
# Classifier names; the loops in this notebook interpolate them into the
# prediction file paths they read and write.
classifiers = [
    'decision_tree',
    'random_forest',
    'knn',
    'naive_bayes',
    'mlp',
]

# False: every section casts exactly one vote for its predicted genre.
# True: each section's vote is weighted by its probability/score column.
use_weighted_voting = False

### baseline

In [None]:
# Combine the per-section baseline predictions of every classifier/split into
# a single prediction per file and write the result next to the input CSV.
for classifier in classifiers:
    for split in range(5):
        # Per-section predictions, indexed by (file, section).
        time_decomposition_results = pd.read_csv(
            f'./predictions_bl/{classifier}_bl_{split}_time_decomposition.csv',
            index_col=['file', 'section'])

        # One output row per file that has a 'start' section. 'genre' is copied
        # from those rows; 'predicted' and 'probability_share' start out empty
        # and are filled in file by file below.
        start_rows = time_decomposition_results.xs('start', level='section')
        combined_prediction = pd.DataFrame(start_rows['genre'],
                                           index=start_rows.index,
                                           columns=['genre', 'predicted', 'probability_share'])

        for i in combined_prediction.index:
            # All section rows belonging to file `i`.
            results = time_decomposition_results.loc[pd.IndexSlice[i, :], :]

            # Accumulate one vote per section (or its probability when
            # weighted voting is enabled) for the genre that section predicted.
            weighted_votes = defaultdict(float)
            for r in results.itertuples():
                weighted_votes[r.predicted] += r.probability if use_weighted_voting else 1

            # catch edge case of three different genres, all with same probability:
            # fall back to the prediction of the 'middle' section
            if len(weighted_votes) == 3 and len(set(weighted_votes.values())) == 1:
                combined_prediction.loc[i, 'predicted'] = time_decomposition_results.loc[(i, 'middle'), 'predicted']
                combined_prediction.loc[i, 'probability_share'] = 1/3
            # otherwise choose genre with highest total weighted vote
            else:
                # The sort is stable, so on a tie the genre that was voted for
                # first stays in front.
                ranked_votes = sorted(weighted_votes.items(), key=lambda x: x[1], reverse=True)
                total_votes = sum(x[1] for x in ranked_votes)
                combined_prediction.loc[i, 'predicted'] = ranked_votes[0][0]
                # With weighted voting the total can be zero (all probabilities
                # are 0); report a share of 0 instead of dividing by zero.
                if total_votes == 0:
                    combined_prediction.loc[i, 'probability_share'] = 0
                else:
                    combined_prediction.loc[i, 'probability_share'] = ranked_votes[0][1] / total_votes

        combined_prediction.to_csv(f'./predictions_bl/{classifier}_bl_{split}_combined.csv')

### one-against-all

In [None]:
# Combine the per-section one-against-all predictions of every classifier/split
# into a single prediction per file and write the result next to the input CSV.
# NOTE: this rebinds the notebook-level `classifiers`, so any cell executed
# after this one sees the list that includes 'svm'.
classifiers = ['decision_tree', 'random_forest', 'knn', 'naive_bayes', 'mlp', 'svm']
for classifier in classifiers:
    for split in range(5):
        # Per-section predictions, indexed by (file, section).
        time_decomposition_results = pd.read_csv(
            f'./predictions_oaa/{classifier}_oaa_{split}_time_decomposition.csv',
            index_col=['file', 'section'])

        # One output row per file that has a 'start' section. 'genre' is copied
        # from those rows; 'predicted' and 'score' start out empty and are
        # filled in file by file below. The result column is declared as
        # 'score' because that is the column the loop writes to; declaring a
        # different name here would leave an always-empty column in the CSV.
        start_rows = time_decomposition_results.xs('start', level='section')
        combined_prediction = pd.DataFrame(start_rows['genre'],
                                           index=start_rows.index,
                                           columns=['genre', 'predicted', 'score'])

        for i in combined_prediction.index:
            # All section rows belonging to file `i`.
            results = time_decomposition_results.loc[pd.IndexSlice[i, :], :]

            # Accumulate one vote per section (or its score when weighted
            # voting is enabled) for the genre that section predicted.
            weighted_votes = defaultdict(float)
            for r in results.itertuples():
                weighted_votes[r.predicted] += r.score if use_weighted_voting else 1

            # catch edge case of three different genres, all with same score:
            # fall back to the prediction of the 'middle' section
            if len(weighted_votes) == 3 and len(set(weighted_votes.values())) == 1:
                combined_prediction.loc[i, 'predicted'] = time_decomposition_results.loc[(i, 'middle'), 'predicted']
                combined_prediction.loc[i, 'score'] = 1/3
            # otherwise choose genre with highest total weighted vote
            else:
                # The sort is stable, so on a tie the genre that was voted for
                # first stays in front.
                ranked_votes = sorted(weighted_votes.items(), key=lambda x: x[1], reverse=True)
                total_votes = sum(x[1] for x in ranked_votes)
                combined_prediction.loc[i, 'predicted'] = ranked_votes[0][0]
                # A zero total is possible with weighted voting; report a
                # score of 0 instead of dividing by zero.
                if total_votes == 0:
                    combined_prediction.loc[i, 'score'] = 0
                else:
                    combined_prediction.loc[i, 'score'] = ranked_votes[0][1] / total_votes

        combined_prediction.to_csv(f'./predictions_oaa/{classifier}_oaa_{split}_combined.csv')

### round-robin

In [None]:
# Combine the per-section round-robin predictions of every classifier/split
# into a single prediction per file and write the result next to the input CSV.
# NOTE: this rebinds the notebook-level `classifiers`, so any cell executed
# after this one sees the list that includes 'svm'.
classifiers = ['decision_tree', 'random_forest', 'knn', 'naive_bayes', 'mlp', 'svm']
for classifier in classifiers:
    for split in range(5):
        # Per-section predictions, indexed by (file, section).
        time_decomposition_results = pd.read_csv(
            f'./predictions_rr/{classifier}_rr_{split}_time_decomposition.csv',
            index_col=['file', 'section'])

        # One output row per file that has a 'start' section. 'genre' is copied
        # from those rows; 'predicted' and 'score' start out empty and are
        # filled in file by file below. The result column is declared as
        # 'score' because that is the column the loop writes to; declaring a
        # different name here would leave an always-empty column in the CSV.
        start_rows = time_decomposition_results.xs('start', level='section')
        combined_prediction = pd.DataFrame(start_rows['genre'],
                                           index=start_rows.index,
                                           columns=['genre', 'predicted', 'score'])

        for i in combined_prediction.index:
            # All section rows belonging to file `i`.
            results = time_decomposition_results.loc[pd.IndexSlice[i, :], :]

            # Accumulate one vote per section (or its score when weighted
            # voting is enabled) for the genre that section predicted.
            weighted_votes = defaultdict(float)
            for r in results.itertuples():
                weighted_votes[r.predicted] += r.score if use_weighted_voting else 1

            # catch edge case of three different genres, all with same score:
            # fall back to the prediction of the 'middle' section
            if len(weighted_votes) == 3 and len(set(weighted_votes.values())) == 1:
                combined_prediction.loc[i, 'predicted'] = time_decomposition_results.loc[(i, 'middle'), 'predicted']
                combined_prediction.loc[i, 'score'] = 1/3
            # otherwise choose genre with highest total weighted vote
            else:
                # The sort is stable, so on a tie the genre that was voted for
                # first stays in front.
                ranked_votes = sorted(weighted_votes.items(), key=lambda x: x[1], reverse=True)
                total_votes = sum(x[1] for x in ranked_votes)
                combined_prediction.loc[i, 'predicted'] = ranked_votes[0][0]
                # A zero total is possible with weighted voting; report a
                # score of 0 instead of dividing by zero.
                if total_votes == 0:
                    combined_prediction.loc[i, 'score'] = 0
                else:
                    combined_prediction.loc[i, 'score'] = ranked_votes[0][1] / total_votes

        combined_prediction.to_csv(f'./predictions_rr/{classifier}_rr_{split}_combined.csv')