In [1]:
%load_ext autoreload
%autoreload 2

# import data and packages

import pandas as pd
import numpy as np
from helper_functions import test_model, test_bayesian_networks_model

data = pd.read_pickle('input_data.pickle')

In [2]:
import pandas as pd
from math import ceil, floor
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier


def k_fold_cross_validation(k, data_):

    data = data_.reset_index()

    i = len(data) / k

    test_data = data[:floor(i)]
    learn_data = data[floor(i):]
    
    models = [("Naive Bayes", GaussianNB()), ("Neural network", MLPClassifier()), ("k-NN", KNeighborsClassifier())]

    df = pd.DataFrame(columns=["index", "model", "Brier score", 
                               "precision", "recall", "F1", 
                               "confusion_matrix", "profit"])

    index = 0

    for (name, model) in models:
        brier_score, precision, recall, F1, confusion_matrix, profit = \
            test_model(learn_data, test_data, model)

        new_line = pd.DataFrame.from_dict({
            "index": [index],
            "model": [name], 
            "Brier score": [brier_score], 
            "precision": [precision], 
            "recall": [recall], 
            "F1": [F1], 
            "confusion_matrix": [confusion_matrix], 
            "profit": [profit]
        })

        df = pd.concat([df, new_line])

        index += 1

    new_line = pd.DataFrame.from_dict({
                "index": [index],
                "model": ["Bayesian network"], 
                "Brier score": [brier_score], 
                "precision": [precision], 
                "recall": [recall], 
                "F1": [F1], 
                "confusion_matrix": [confusion_matrix], 
                "profit": [profit]
            })

    df = pd.concat([df, new_line])

    index += 1

    for j in range(1, k - 1):
        bound1 = ceil((j * i))
        bound2 = floor((j + 1) * i)

        test_data = data[bound1:bound2]

        learn_data_1 = data[:bound1]
        learn_data_2 = data[bound2:]
        learn_data = pd.concat([learn_data_1, learn_data_2])
        
        for (name, model) in models:
            brier_score, precision, recall, F1, confusion_matrix, profit = \
                test_model(learn_data, test_data, model)

            new_line = pd.DataFrame.from_dict({
                "index": [index],
                "model": [name], 
                "Brier score": [brier_score], 
                "precision": [precision], 
                "recall": [recall], 
                "F1": [F1], 
                "confusion_matrix": [confusion_matrix], 
                "profit": [profit]
            })

            df = pd.concat([df, new_line])

            index += 1

        brier_score, precision, recall, F1, confusion_matrix, profit = \
            test_bayesian_networks_model(learn_data, test_data)

        new_line = pd.DataFrame.from_dict({
                "index": [index],
                "model": ["Bayesian network"], 
                "Brier score": [brier_score], 
                "precision": [precision], 
                "recall": [recall], 
                "F1": [F1], 
                "confusion_matrix": [confusion_matrix], 
                "profit": [profit]
            })

        df = pd.concat([df, new_line])

        index += 1

    test_data = data[ceil(((k - 1) * i)):]
    learn_data = data[:ceil(((k - 1) * i))]
    
    
    for (name, model) in models:
        brier_score, precision, recall, F1, confusion_matrix, profit = \
            test_model(learn_data, test_data, model)

        new_line = pd.DataFrame.from_dict({
            "index": [index],
            "model": [name], 
            "Brier score": [brier_score], 
            "precision": [precision], 
            "recall": [recall], 
            "F1": [F1], 
            "confusion_matrix": [confusion_matrix], 
            "profit": [profit]
        })

        df = pd.concat([df, new_line])

        index += 1

    brier_score, precision, recall, F1, confusion_matrix, profit = \
        test_bayesian_networks_model(learn_data, test_data)

    new_line = pd.DataFrame.from_dict({
            "index": [index],
            "model": ["Bayesian network"], 
            "Brier score": [brier_score], 
            "precision": [precision], 
            "recall": [recall], 
            "F1": [F1], 
            "confusion_matrix": [confusion_matrix], 
            "profit": [profit]
        })

    df = pd.concat([df, new_line])

    index += 1

    return df


df = k_fold_cross_validation(10, data)

df

Unnamed: 0,index,model,Brier score,precision,recall,F1,confusion_matrix,profit
0,0,Naive Bayes,213.672647,0.610811,0.71519,0.658892,"[[113, 6, 39], [34, 12, 26], [38, 12, 51]]",-7.57
0,1,Neural network,201.948271,0.521569,0.841772,0.644068,"[[133, 2, 23], [53, 0, 19], [69, 0, 32]]",58.38
0,2,k-NN,241.44,0.540541,0.759494,0.631579,"[[120, 15, 23], [46, 8, 18], [56, 17, 28]]",10.33
0,3,Bayesian network,241.44,0.540541,0.759494,0.631579,"[[120, 15, 23], [46, 8, 18], [56, 17, 28]]",10.33
0,4,Naive Bayes,199.269697,0.565934,0.677632,0.616766,"[[103, 11, 38], [42, 8, 26], [37, 9, 57]]",-46.47
0,5,Neural network,195.154456,0.562814,0.736842,0.638177,"[[112, 13, 27], [42, 8, 26], [45, 10, 48]]",-75.21
0,6,k-NN,220.56,0.522727,0.756579,0.61828,"[[115, 21, 16], [51, 14, 11], [54, 19, 30]]",-54.68
0,7,Bayesian network,204.150574,0.496479,0.927632,0.646789,"[[141, 1, 10], [68, 0, 8], [75, 1, 27]]",-25.48
0,8,Naive Bayes,213.501522,0.601093,0.714286,0.652819,"[[110, 16, 28], [41, 15, 28], [32, 13, 47]]",-41.66
0,9,Neural network,199.902775,0.571429,0.753247,0.64986,"[[116, 20, 18], [50, 15, 19], [37, 19, 36]]",-44.54


In [3]:
df.to_csv('../analysis/data.csv')
df.to_pickle('../analysis/data.pickle')