In [4]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.utils.class_weight import compute_sample_weight

def build_confusion_matrix(dataset, model):
        X_train, X_test, y_train, y_test = train_test_split(dataset.iloc[:, :-1], dataset.iloc[:, -1], test_size=0.25)
        predictions = model.fit(X_train, y_train).predict(X_test)
        tn, fn, fp, tp = confusion_matrix(predictions, y_test.values).ravel()
        return {'true_negative': tn, 'false_negative': fn, 'false_positive': fp, 'true_positive': tp}

def sensitivity(cm):
    return cm['true_positive'] / (cm['true_positive'] + cm['false_negative'])

def specificity(cm):
    return cm['true_negative'] / ( cm['true_negative'] + cm['false_positive'] )

def false_positive_rate(cm):
    return (1.0 - (specificity(cm)) )

def generate_scores(dataset, model):
    specificity_scores = []
    sensitivity_scores = []

    for n in range(1000):
        cm = build_confusion_matrix(dataset, model)
        specificity_scores.append(specificity(cm))
        sensitivity_scores.append(sensitivity(cm))
    
    spec_avg = pd.Series(specificity_scores).mean()
    sens_avg = pd.Series(sensitivity_scores).mean()
    
    return {'specificity': spec_avg, 'sensitivity': sens_avg }

def summarise_test(test):
    x ='For {}, in model {}\nAverage NO SIGFALL accuracy is {}\nAverage SIGFALL accuracy is {}\n---\n'
    print(x.format(test['data_name'],test['model'],test['specificity'],test['sensitivity']))    


In [5]:
# Load data sets
inc_updrs_sigfall = pd.read_csv('./working_data/normalised_increase_updrs_sigfall.csv')
inc_updrs_sigfall_raw = pd.read_csv('./working_data/increase_updrs_sigfall_raw.csv')
delta_updrs_sigfall = pd.read_csv('./working_data/normalised_delta_updrs_sigfall.csv')
delta_updrs_sigfall_raw = pd.read_csv('./working_data/delta_updrs_sigfall_raw.csv')
updrs_future_sigfall = pd.read_csv('./working_data/normalised_updrs_future_sigfall.csv')
updrs_future_sigfall_raw = pd.read_csv('./working_data/updrs_future_sigfall_raw.csv')

datasets = [{'data':inc_updrs_sigfall, 'name':'inc_updrs_sigfall'},
            {'data':inc_updrs_sigfall_raw, 'name':'inc_updrs_sigfall_raw'},
            {'data':delta_updrs_sigfall , 'name':'delta_updrs_sigfall'},
            {'data':delta_updrs_sigfall_raw , 'name':'delta_updrs_sigfall_raw'},
            {'data':updrs_future_sigfall , 'name':'updrs_future_sigfall'},
            {'data':updrs_future_sigfall_raw , 'name':'updrs_future_sigfall_raw'}
           ]

In [30]:
# Gaussian Naive Bayes Specific:
# Does sample weighting affect the model accuracy?  -> Apparently not...

X_train, X_test, y_train, y_test = train_test_split(inc_updrs_sigfall.iloc[:, :-1], inc_updrs_sigfall.iloc[:, -1], test_size=0.25)

for n in range(1,1000,100):
    m = GaussianNB()
    m.fit(X_train, y_train, sample_weight=compute_sample_weight(class_weight={0.0:1, 1.0:n}, y=y_train))
    print(f'--- {0} ---',n)
    s = generate_scores(dataset=inc_updrs_sigfall, model=m)
    print(s['specificity'])
    print(s['sensitivity'])
    print('\n')
    

--- 0 --- 1
0.8563502958450656
0.4019338500599539


--- 0 --- 101
0.8580780199747803
0.401576093524014


--- 0 --- 201
0.8572664316406499
0.40052575404731433


--- 0 --- 301
0.857884307650238
0.40326144587668017


--- 0 --- 401
0.8575113049216329
0.4026009179263204


--- 0 --- 501
0.8582181776830138
0.40371912299222373


--- 0 --- 601
0.8564617142800797
0.4014214797876678


--- 0 --- 701
0.8576404473274515
0.40375439959641846


--- 0 --- 801
0.8564426124167857
0.40224346436078523


--- 0 --- 901
0.8576523689852742
0.4068723959017821




In [6]:
# Run tests to score each model & dataset combination

models = [
    GaussianNB(priors=[0.01,0.99]),
    GaussianNB(priors=[0.03,0.97]),
    GaussianNB(priors=[0.05,0.95]),
    GaussianNB(priors=[0.07,0.93]),
    GaussianNB(priors=[0.09,0.91]),
    GaussianNB(priors=[0.2,0.8]),
    GaussianNB(priors=[0.3,0.7]),
    GaussianNB(priors=[0.4,0.6]),
    GaussianNB(priors=[0.5,0.5]),
]

tests = []
for m in models:
    for d in datasets:
        tests.append({
            'data_name': d['name'],
            'dataset': d['data'],
            'model': m
        })

for t in tests:
    s = generate_scores(dataset=t['dataset'], model=t['model'])
    t['specificity'] = s['specificity']
    t['sensitivity'] = s['sensitivity']
    summarise_test(t)

For inc_updrs_sigfall, in model GaussianNB(priors=[0.01, 0.99])
Average NO SIGFALL accuracy is 0.6311165721731329
Average SIGFALL accuracy is 0.72417432742307
---

For inc_updrs_sigfall_raw, in model GaussianNB(priors=[0.01, 0.99])
Average NO SIGFALL accuracy is 0.5816298040406024
Average SIGFALL accuracy is 0.768520970959382
---

For delta_updrs_sigfall, in model GaussianNB(priors=[0.01, 0.99])
Average NO SIGFALL accuracy is 0.32218293541258586
Average SIGFALL accuracy is 0.9296755275598501
---

For delta_updrs_sigfall_raw, in model GaussianNB(priors=[0.01, 0.99])
Average NO SIGFALL accuracy is 0.32119357224441536
Average SIGFALL accuracy is 0.9321484364763228
---

For updrs_future_sigfall, in model GaussianNB(priors=[0.01, 0.99])
Average NO SIGFALL accuracy is 0.5220747521693178
Average SIGFALL accuracy is 0.7282656002981391
---

For updrs_future_sigfall_raw, in model GaussianNB(priors=[0.01, 0.99])
Average NO SIGFALL accuracy is 0.4820086380668731
Average SIGFALL accuracy is 0.76484

For inc_updrs_sigfall_raw, in model GaussianNB(priors=[0.5, 0.5])
Average NO SIGFALL accuracy is 0.8030149384937607
Average SIGFALL accuracy is 0.47639549685040317
---

For delta_updrs_sigfall, in model GaussianNB(priors=[0.5, 0.5])
Average NO SIGFALL accuracy is 0.6939663175686912
Average SIGFALL accuracy is 0.6171330865989899
---

For delta_updrs_sigfall_raw, in model GaussianNB(priors=[0.5, 0.5])
Average NO SIGFALL accuracy is 0.7116422744512791
Average SIGFALL accuracy is 0.5937802369244591
---

For updrs_future_sigfall, in model GaussianNB(priors=[0.5, 0.5])
Average NO SIGFALL accuracy is 0.7475680865724744
Average SIGFALL accuracy is 0.4871537227579297
---

For updrs_future_sigfall_raw, in model GaussianNB(priors=[0.5, 0.5])
Average NO SIGFALL accuracy is 0.7466007440218843
Average SIGFALL accuracy is 0.493500044269209
---

