In [2]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB


def build_confusion_matrix(dataset, model, test_size=0.25, random_state=None):
    """Fit ``model`` on a random train/test split and tally the test-set outcomes.

    The last column of ``dataset`` is used as the target and every preceding
    column as a feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns followed by one target column. The target is expected
        to be binary; of its two labels, the one that sorts last is counted
        as the positive class.
    model : estimator
        Object whose ``fit(X, y)`` returns the fitted estimator and which
        exposes ``predict(X)``. It is re-fitted on every call.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``; pass an int for a repeatable split.

    Returns
    -------
    dict
        Counts stored under ``'true_negative'``, ``'false_negative'``,
        ``'false_positive'`` and ``'true_positive'``.

    Raises
    ------
    ValueError
        If the target column does not contain exactly two distinct labels
        (the confusion matrix then cannot be unpacked into four cells).
    """
    features = dataset.iloc[:, :-1]
    target = dataset.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    predictions = model.fit(X_train, y_train).predict(X_test)

    # Pin the label set to every class present in the full target column so
    # the matrix stays 2x2 even when a test fold (and its predictions) happens
    # to contain a single class; without this the unpacking below would fail.
    labels = np.unique(target)

    # scikit-learn's signature is (y_true, y_pred): rows are actual classes,
    # columns are predicted classes, so ravel() yields tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test.values, predictions, labels=labels).ravel()
    return {'true_negative': tn, 'false_negative': fn, 'false_positive': fp, 'true_positive': tp}

def sensitivity(cm):
    """Return the true-positive rate, TP / (TP + FN), from a confusion-count dict.

    ``cm`` must provide the keys ``'true_positive'`` and ``'false_negative'``.
    """
    hits = cm['true_positive']
    misses = cm['false_negative']
    return hits / (hits + misses)

def specificity(cm):
    """Return the true-negative rate, TN / (TN + FP), from a confusion-count dict.

    ``cm`` must provide the keys ``'true_negative'`` and ``'false_positive'``.
    """
    correct_rejections = cm['true_negative']
    false_alarms = cm['false_positive']
    return correct_rejections / (correct_rejections + false_alarms)

def false_positive_rate(cm):
    """Return the false-positive rate, computed as ``1.0 - specificity(cm)``."""
    true_negative_rate = specificity(cm)
    return 1.0 - true_negative_rate

def generate_scores(dataset, model, n_iterations=1000, random_state=None):
    """Average specificity and sensitivity of ``model`` over repeated random splits.

    Each iteration calls ``build_confusion_matrix(dataset, model)``, which
    re-splits the data and re-fits the model, then records both rates.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Passed through unchanged to ``build_confusion_matrix``.
    model : estimator
        Passed through unchanged to ``build_confusion_matrix``.
    n_iterations : int, default 1000
        Number of split/fit/evaluate rounds to average over.
    random_state : int or None, default None
        When given, NumPy's global random generator is seeded with it before
        the first round so the whole sequence of splits is repeatable. Note
        that this resets global RNG state for the rest of the session.

    Returns
    -------
    dict
        ``{'specificity': mean_specificity, 'sensitivity': mean_sensitivity}``.
        The means are taken with pandas, which skips NaN entries by default.

    Raises
    ------
    ValueError
        If ``n_iterations`` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError('n_iterations must be at least 1')

    if random_state is not None:
        # The splits are drawn without an explicit seed; per the scikit-learn
        # docs an unseeded split uses NumPy's global RandomState, so seeding
        # it here makes the run reproducible.
        np.random.seed(random_state)

    specificity_scores = []
    sensitivity_scores = []

    for _ in range(n_iterations):
        cm = build_confusion_matrix(dataset, model)
        specificity_scores.append(specificity(cm))
        sensitivity_scores.append(sensitivity(cm))

    return {
        'specificity': pd.Series(specificity_scores).mean(),
        'sensitivity': pd.Series(sensitivity_scores).mean(),
    }

def summarise_test(test):
    """Print a short report for one test record.

    ``test`` must provide the keys ``'data_name'``, ``'model'``,
    ``'specificity'`` and ``'sensitivity'``; they are substituted into the
    report in that order.
    """
    template = 'For {}, in model {}\nAverage NO SIGFALL accuracy is {}\nAverage SIGFALL accuracy is {}\n---\n'
    fields = [test[key] for key in ('data_name', 'model', 'specificity', 'sensitivity')]
    print(template.format(*fields))


In [3]:
# Read both working data sets and register each one under a display name.
inc_updrs_sigfall = pd.read_csv('./working_data/normalised_increase_updrs_sigfall.csv')
inc_updrs_sigfall_raw = pd.read_csv('./working_data/normalised_increase_updrs_sigfall_raw.csv')

datasets = [
    {'data': frame, 'name': label}
    for label, frame in (
        ('inc_updrs_sigfall', inc_updrs_sigfall),
        ('inc_updrs_sigfall_raw', inc_updrs_sigfall_raw),
    )
]

In [4]:

# Sweep five smoothing strengths, first with fit_prior=True and then with
# fit_prior=False (same order as listing the ten models out by hand).
models = [
    BernoulliNB(alpha=alpha, fit_prior=fit_prior)
    for fit_prior in (True, False)
    for alpha in (1, 0.75, 0.5, 0.25, 0.1)
]

# Pair every model with every data set, model-major.
tests = []
for m in models:
    for d in datasets:
        tests.append(dict(data_name=d['name'], dataset=d['data'], model=m))

# Score each pairing, store the averages on its record, and report.
for t in tests:
    s = generate_scores(dataset=t['dataset'], model=t['model'])
    t.update(specificity=s['specificity'], sensitivity=s['sensitivity'])
    summarise_test(t)

For inc_updrs_sigfall, in model BernoulliNB(alpha=1)
Average NO SIGFALL accuracy is 0.8825233196112854
Average SIGFALL accuracy is 0.3257411213928927
---

For inc_updrs_sigfall_raw, in model BernoulliNB(alpha=1)
Average NO SIGFALL accuracy is 0.9081430198885614
Average SIGFALL accuracy is 0.28705507967924215
---

For inc_updrs_sigfall, in model BernoulliNB(alpha=0.75)
Average NO SIGFALL accuracy is 0.8833808618947516
Average SIGFALL accuracy is 0.3235664326550509
---

For inc_updrs_sigfall_raw, in model BernoulliNB(alpha=0.75)
Average NO SIGFALL accuracy is 0.9089090629968954
Average SIGFALL accuracy is 0.2874883127546389
---

For inc_updrs_sigfall, in model BernoulliNB(alpha=0.5)
Average NO SIGFALL accuracy is 0.8822809840311528
Average SIGFALL accuracy is 0.327106590504565
---

For inc_updrs_sigfall_raw, in model BernoulliNB(alpha=0.5)
Average NO SIGFALL accuracy is 0.9092762532839377
Average SIGFALL accuracy is 0.2882028285098238
---

For inc_updrs_sigfall, in model BernoulliNB(alph

In [5]:
# The smoothing parameter (alpha) appears to have little effect
# Priors from data reduce prediction of SIGFALL due to imbalance in the data set

# Can we define false priors to increase SIGFALL accuracy? -> No: it swings to the opposite problem, where nearly everything is predicted as SIGFALL

# Hand-picked class priors, stepping from heavily skewed to balanced.
fp_models = [
    BernoulliNB(alpha=1, fit_prior=False, class_prior=prior)
    for prior in ([0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.5, 0.5])
]

# Pair every fixed-prior model with every data set, model-major.
fp_tests = []
for m in fp_models:
    for d in datasets:
        fp_tests.append(dict(data_name=d['name'], dataset=d['data'], model=m))

# Score each pairing, store the averages on its record, and report.
for t in fp_tests:
    s = generate_scores(dataset=t['dataset'], model=t['model'])
    t.update(specificity=s['specificity'], sensitivity=s['sensitivity'])
    summarise_test(t)


For inc_updrs_sigfall, in model BernoulliNB(alpha=1, class_prior=[0.1, 0.9], fit_prior=False)
Average NO SIGFALL accuracy is 0.5207054993368325
Average SIGFALL accuracy is 0.8371007135161856
---

For inc_updrs_sigfall_raw, in model BernoulliNB(alpha=1, class_prior=[0.1, 0.9], fit_prior=False)
Average NO SIGFALL accuracy is 0.4731387760242082
Average SIGFALL accuracy is 0.8547942074457625
---

For inc_updrs_sigfall, in model BernoulliNB(alpha=1, class_prior=[0.2, 0.8], fit_prior=False)
Average NO SIGFALL accuracy is 0.59977622795493
Average SIGFALL accuracy is 0.7257235125130699
---

For inc_updrs_sigfall_raw, in model BernoulliNB(alpha=1, class_prior=[0.2, 0.8], fit_prior=False)
Average NO SIGFALL accuracy is 0.5849249178412271
Average SIGFALL accuracy is 0.7834443304446795
---

For inc_updrs_sigfall, in model BernoulliNB(alpha=1, class_prior=[0.3, 0.7], fit_prior=False)
Average NO SIGFALL accuracy is 0.6597078275041577
Average SIGFALL accuracy is 0.6823776022594242
---

For inc_updrs_