In [1]:
%load_ext autoreload
%autoreload 2

# import data and packages

import pandas as pd
import numpy as np
from helper_functions import test_model, test_bayesian_networks_model

data = pd.read_pickle('input_data.pickle')

In [2]:
import pandas as pd
from math import ceil, floor
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier


def k_fold_cross_validation(k, data_):

    data = data_.reset_index()

    i = len(data) / k

    test_data = data[:floor(i)]
    learn_data = data[floor(i):]
    
    models = [("Naive Bayes", GaussianNB()), ("Neural network", MLPClassifier()), ("k-NN", KNeighborsClassifier(n_neighbors=5))]

    df = pd.DataFrame(columns=["index", "model", "Brier score", 
                               "precision", "recall", "F1", 
                               "confusion_matrix", "profit", "modified Brier score"])

    index = 0

    for (name, model) in models:
        brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
            test_model(learn_data, test_data, model)

        new_line = pd.DataFrame.from_dict({
            "index": [index],
            "model": [name], 
            "Brier score": [brier_score], 
            "precision": [precision], 
            "recall": [recall], 
            "F1": [F1], 
            "confusion_matrix": [confusion_matrix], 
            "profit": [profit],
            "modified Brier score": [modified_brier_score]
        })

        df = pd.concat([df, new_line])

        index += 1

    new_line = pd.DataFrame.from_dict({
                "index": [index],
                "model": ["Bayesian network"], 
                "Brier score": [brier_score], 
                "precision": [precision], 
                "recall": [recall], 
                "F1": [F1], 
                "confusion_matrix": [confusion_matrix], 
                "profit": [profit],
                "modified Brier score": [modified_brier_score]
            })

    df = pd.concat([df, new_line])

    index += 1

    for j in range(1, k - 1):
        bound1 = ceil((j * i))
        bound2 = floor((j + 1) * i)

        test_data = data[bound1:bound2]

        learn_data_1 = data[:bound1]
        learn_data_2 = data[bound2:]
        learn_data = pd.concat([learn_data_1, learn_data_2])
        
        for (name, model) in models:
            brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
                test_model(learn_data, test_data, model)

            new_line = pd.DataFrame.from_dict({
                "index": [index],
                "model": [name], 
                "Brier score": [brier_score], 
                "precision": [precision], 
                "recall": [recall], 
                "F1": [F1], 
                "confusion_matrix": [confusion_matrix], 
                "profit": [profit],
                "modified Brier score": [modified_brier_score]
            })

            df = pd.concat([df, new_line])

            index += 1

        brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
            test_bayesian_networks_model(learn_data, test_data)

        new_line = pd.DataFrame.from_dict({
                "index": [index],
                "model": ["Bayesian network"], 
                "Brier score": [brier_score], 
                "precision": [precision], 
                "recall": [recall], 
                "F1": [F1], 
                "confusion_matrix": [confusion_matrix], 
                "profit": [profit],
                "modified Brier score": [modified_brier_score]
            })

        df = pd.concat([df, new_line])

        index += 1

    test_data = data[ceil(((k - 1) * i)):]
    learn_data = data[:ceil(((k - 1) * i))]
    
    
    for (name, model) in models:
        brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
            test_model(learn_data, test_data, model)

        new_line = pd.DataFrame.from_dict({
            "index": [index],
            "model": [name], 
            "Brier score": [brier_score], 
            "precision": [precision], 
            "recall": [recall], 
            "F1": [F1], 
            "confusion_matrix": [confusion_matrix], 
            "profit": [profit],
            "modified Brier score": [modified_brier_score]
        })

        df = pd.concat([df, new_line])

        index += 1

    brier_score, precision, recall, F1, confusion_matrix, profit, modified_brier_score = \
        test_bayesian_networks_model(learn_data, test_data)

    new_line = pd.DataFrame.from_dict({
            "index": [index],
            "model": ["Bayesian network"], 
            "Brier score": [brier_score], 
            "precision": [precision], 
            "recall": [recall], 
            "F1": [F1], 
            "confusion_matrix": [confusion_matrix], 
            "profit": [profit],
            "modified Brier score": [modified_brier_score]
        })

    df = pd.concat([df, new_line])

    index += 1

    return df


df = k_fold_cross_validation(10, data)

df

Unnamed: 0,index,model,Brier score,precision,recall,F1,confusion_matrix,profit,modified Brier score
0,0,Naive Bayes,213.672647,0.610811,0.71519,0.658892,"[[113, 6, 39], [34, 12, 26], [38, 12, 51]]",-0.02287,30.822969
0,1,Neural network,205.593818,0.538136,0.803797,0.64467,"[[127, 18, 13], [48, 15, 9], [61, 17, 23]]",-0.000665,18.235955
0,2,k-NN,241.44,0.540541,0.759494,0.631579,"[[120, 15, 23], [46, 8, 18], [56, 17, 28]]",0.031208,50.883994
0,3,Bayesian network,241.44,0.540541,0.759494,0.631579,"[[120, 15, 23], [46, 8, 18], [56, 17, 28]]",0.031208,50.883994
0,4,Naive Bayes,199.269697,0.565934,0.677632,0.616766,"[[103, 11, 38], [42, 8, 26], [37, 9, 57]]",-0.140393,19.232607
0,5,Neural network,204.663858,0.598425,0.5,0.544803,"[[76, 0, 76], [25, 0, 51], [26, 3, 74]]",-0.091541,28.684729
0,6,k-NN,220.56,0.522727,0.756579,0.61828,"[[115, 21, 16], [51, 14, 11], [54, 19, 30]]",-0.165196,50.282553
0,7,Bayesian network,204.150574,0.496479,0.927632,0.646789,"[[141, 1, 10], [68, 0, 8], [75, 1, 27]]",-0.076979,25.007782
0,8,Naive Bayes,213.501522,0.601093,0.714286,0.652819,"[[110, 16, 28], [41, 15, 28], [32, 13, 47]]",-0.126242,28.127805
0,9,Neural network,200.647463,0.535714,0.876623,0.665025,"[[135, 10, 9], [60, 8, 16], [57, 12, 23]]",-0.217939,15.447441


In [3]:
df.to_csv('../analysis/data.csv')
df.to_pickle('../analysis/data.pickle')