In [1]:
%load_ext autoreload
%autoreload 2

# import data and packages

import pandas as pd
import numpy as np
from helper_functions import test_model, test_bayesian_networks_model

data = pd.read_pickle('input_data.pickle')

In [2]:
import pandas as pd
from math import ceil, floor
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier


def k_fold_cross_validation(k, data_):

    data = data_.reset_index()

    i = len(data) / k

    test_data = data[:floor(i)]
    learn_data = data[floor(i):]
    
    models = [("Naive Bayes", GaussianNB()), ("Neural network", MLPClassifier()), ("k-NN", KNeighborsClassifier(n_neighbors=5))]

    df = pd.DataFrame(columns=["index", "model", "Brier score", 
                               "precision", "recall", "F1", 
                               "confusion_matrix", "profit", "modified Brier score"])

    index = 0

    for (name, model) in models:
        brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
            test_model(learn_data, test_data, model)

        new_line = pd.DataFrame.from_dict({
            "index": [index],
            "model": [name], 
            "Brier score": [brier_score], 
            "precision": [precision], 
            "recall": [recall], 
            "F1": [F1], 
            "confusion_matrix": [confusion_matrix], 
            "profit": [profit],
            "modified Brier score": [modified_brier_score]
        })

        df = pd.concat([df, new_line])

        index += 1

    new_line = pd.DataFrame.from_dict({
                "index": [index],
                "model": ["Bayesian network"], 
                "Brier score": [brier_score], 
                "precision": [precision], 
                "recall": [recall], 
                "F1": [F1], 
                "confusion_matrix": [confusion_matrix], 
                "profit": [profit],
                "modified Brier score": [modified_brier_score]
            })

    df = pd.concat([df, new_line])

    index += 1

    for j in range(1, k - 1):
        bound1 = ceil((j * i))
        bound2 = floor((j + 1) * i)

        test_data = data[bound1:bound2]

        learn_data_1 = data[:bound1]
        learn_data_2 = data[bound2:]
        learn_data = pd.concat([learn_data_1, learn_data_2])
        
        for (name, model) in models:
            brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
                test_model(learn_data, test_data, model)

            new_line = pd.DataFrame.from_dict({
                "index": [index],
                "model": [name], 
                "Brier score": [brier_score], 
                "precision": [precision], 
                "recall": [recall], 
                "F1": [F1], 
                "confusion_matrix": [confusion_matrix], 
                "profit": [profit],
                "modified Brier score": [modified_brier_score]
            })

            df = pd.concat([df, new_line])

            index += 1

        brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
            test_bayesian_networks_model(learn_data, test_data)

        new_line = pd.DataFrame.from_dict({
                "index": [index],
                "model": ["Bayesian network"], 
                "Brier score": [brier_score], 
                "precision": [precision], 
                "recall": [recall], 
                "F1": [F1], 
                "confusion_matrix": [confusion_matrix], 
                "profit": [profit],
                "modified Brier score": [modified_brier_score]
            })

        df = pd.concat([df, new_line])

        index += 1

    test_data = data[ceil(((k - 1) * i)):]
    learn_data = data[:ceil(((k - 1) * i))]
    
    
    for (name, model) in models:
        brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
            test_model(learn_data, test_data, model)

        new_line = pd.DataFrame.from_dict({
            "index": [index],
            "model": [name], 
            "Brier score": [brier_score], 
            "precision": [precision], 
            "recall": [recall], 
            "F1": [F1], 
            "confusion_matrix": [confusion_matrix], 
            "profit": [profit],
            "modified Brier score": [modified_brier_score]
        })

        df = pd.concat([df, new_line])

        index += 1

    brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
        test_bayesian_networks_model(learn_data, test_data)

    new_line = pd.DataFrame.from_dict({
            "index": [index],
            "model": ["Bayesian network"], 
            "Brier score": [brier_score], 
            "precision": [precision], 
            "recall": [recall], 
            "F1": [F1], 
            "confusion_matrix": [confusion_matrix], 
            "profit": [profit],
            "modified Brier score": [modified_brier_score]
        })

    df = pd.concat([df, new_line])

    index += 1

    return df


df = k_fold_cross_validation(10, data)

df

Unnamed: 0,index,model,Brier score,precision,recall,F1,confusion_matrix,profit,modified Brier score
0,0,Naive Bayes,0.645537,0.610811,0.71519,0.658892,"[[113, 6, 39], [34, 12, 26], [38, 12, 51]]",-0.02287,0.093121
0,1,Neural network,0.613954,0.621429,0.550633,0.583893,"[[87, 12, 59], [26, 13, 33], [27, 12, 62]]",0.093746,0.073383
0,2,k-NN,0.729426,0.540541,0.759494,0.631579,"[[120, 15, 23], [46, 8, 18], [56, 17, 28]]",0.031208,0.153728
0,3,Bayesian network,0.729426,0.540541,0.759494,0.631579,"[[120, 15, 23], [46, 8, 18], [56, 17, 28]]",0.031208,0.153728
0,4,Naive Bayes,0.602023,0.565934,0.677632,0.616766,"[[103, 11, 38], [42, 8, 26], [37, 9, 57]]",-0.140393,0.058105
0,5,Neural network,0.624752,0.589286,0.651316,0.61875,"[[99, 51, 2], [32, 40, 4], [37, 47, 19]]",-0.252991,0.079742
0,6,k-NN,0.666344,0.522727,0.756579,0.61828,"[[115, 21, 16], [51, 14, 11], [54, 19, 30]]",-0.165196,0.151911
0,7,Bayesian network,0.616769,0.496479,0.927632,0.646789,"[[141, 1, 10], [68, 0, 8], [75, 1, 27]]",-0.076979,0.075552
0,8,Naive Bayes,0.646974,0.601093,0.714286,0.652819,"[[110, 16, 28], [41, 15, 28], [32, 13, 47]]",-0.126242,0.085236
0,9,Neural network,0.613374,0.569948,0.714286,0.634006,"[[110, 0, 44], [45, 2, 37], [38, 0, 54]]",-0.218636,0.05337


In [3]:
df.to_csv('../analysis/data.csv')
df.to_pickle('../analysis/data.pickle')