In [210]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# Read each normalised working dataset straight into a lookup keyed by the
# label that is printed when results are reported.
data_sources = {
    'updrs_sigfall': pd.read_csv('./working_data/normalised_updrs_sigfall.csv'),
    'updrs_future_sigfall': pd.read_csv('./working_data/normalised_updrs_future_sigfall.csv'),
    'inc_updrs_sigfall': pd.read_csv('./working_data/normalised_increase_updrs_sigfall.csv'),
    'delta_updrs_sigfall': pd.read_csv('./working_data/normalised_delta_updrs_sigfall.csv'),
}

# The "_raw" counterparts of the same four datasets.
raw_data_sources = {
    'updrs_sigfall_raw': pd.read_csv('./working_data/normalised_updrs_sigfall_raw.csv'),
    'updrs_future_sigfall_raw': pd.read_csv('./working_data/normalised_updrs_future_sigfall_raw.csv'),
    'inc_updrs_sigfall_raw': pd.read_csv('./working_data/normalised_increase_updrs_sigfall_raw.csv'),
    'delta_updrs_sigfall_raw': pd.read_csv('./working_data/normalised_delta_updrs_sigfall_raw.csv'),
}

# Expose every frame under its own module-level name as well, so cells that
# refer to a single dataset directly keep working.
updrs_sigfall = data_sources['updrs_sigfall']
updrs_sigfall_raw = raw_data_sources['updrs_sigfall_raw']
updrs_future_sigfall = data_sources['updrs_future_sigfall']
updrs_future_sigfall_raw = raw_data_sources['updrs_future_sigfall_raw']
inc_updrs_sigfall = data_sources['inc_updrs_sigfall']
inc_updrs_sigfall_raw = raw_data_sources['inc_updrs_sigfall_raw']
delta_updrs_sigfall = data_sources['delta_updrs_sigfall']
delta_updrs_sigfall_raw = raw_data_sources['delta_updrs_sigfall_raw']

## Straight models
Run each working dataset directly through a Linear Discriminant Analysis (LDA) model.
Each dataset is evaluated with 10-fold cross-validation (k=10).
The mean scores are then compared like for like across datasets.

In [172]:
def lda_model_score(dataset, cv=10):
    """Return the mean cross-validated score of an LDA classifier on ``dataset``.

    The target is the ``SIGFALL`` column when it exists, otherwise
    ``SIGFALL_NEXT``. Every other column is passed to the model as a feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the feature columns plus one of the two target columns.
    cv : int, default 10
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold scores returned by ``cross_val_score``.

    Raises
    ------
    KeyError
        If ``dataset`` has neither a ``SIGFALL`` nor a ``SIGFALL_NEXT`` column.
    """
    # Look the target up explicitly rather than relying on a bare ``except``,
    # which would also hide unrelated failures (even KeyboardInterrupt).
    target = next(
        (name for name in ('SIGFALL', 'SIGFALL_NEXT') if name in dataset.columns),
        None,
    )
    if target is None:
        raise KeyError("dataset must contain a 'SIGFALL' or 'SIGFALL_NEXT' column")

    X = dataset.drop(columns=[target])
    y = dataset[target]

    model = LinearDiscriminantAnalysis()
    scores = cross_val_score(model, X, y, cv=cv)
    return scores.mean()


In [173]:
# Print the mean cross-validated LDA score for every entry in data_sources.
for name, frame in data_sources.items():
    print(f'{name} score of {lda_model_score(frame)}')

updrs_sigfall score of 0.9022084195997238
updrs_future_sigfall score of 0.8999999999999998
inc_updrs_sigfall score of 0.9101908657123381
delta_updrs_sigfall score of 0.9118516769012753


In [162]:
# Print the mean cross-validated LDA score for every entry in raw_data_sources.
for name, frame in raw_data_sources.items():
    print(f'{name} score of {lda_model_score(frame)}')

updrs_sigfall_raw score of 0.9022084195997238
updrs_future_sigfall_raw score of 0.8999999999999998
inc_updrs_sigfall_raw score of 0.9120275694917821
delta_updrs_sigfall_raw score of 0.9118516769012753


## Using confusion matrices to understand errors
Imbalances in the data lead to bias in the model,
e.g. if in doubt, call it 0.

Investigate the models again, this time analysing accuracy separately for each class (fall / no fall).

In [196]:
def lda_confusion_matrix(dataset, test_size=0.1, random_state=None):
    """Fit an LDA classifier on one random train/test split and count its errors.

    The last column of ``dataset`` is used as the target and all preceding
    columns as features (selected by position, not by name).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns followed by the target column.
        NOTE(review): assumes a binary 0/1 target stored last - confirm
        against the CSV column order.
    test_size : float, default 0.1
        Share of rows held out for testing, forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``; ``None`` keeps the split
        different on every call.

    Returns
    -------
    tuple
        ``(tn, fp, fn, tp)`` counted on the held-out rows.

    Raises
    ------
    ValueError
        If the confusion matrix does not flatten to exactly four cells.
    """
    features = dataset.iloc[:, :-1]
    target = dataset.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)

    lda_model = LinearDiscriminantAnalysis()
    predictions = lda_model.fit(X_train, y_train).predict(X_test)

    # confusion_matrix takes (y_true, y_pred): rows are the true labels and
    # columns the predictions, so a 2x2 result flattens to tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test.values, predictions).ravel()
    return (tn, fp, fn, tp)


def analyse_confusion_matrix(cm):
    """Turn confusion-matrix counts into overall and per-class accuracy.

    Parameters
    ----------
    cm : tuple
        ``(tn, fp, fn, tp)`` counts.

    Returns
    -------
    tuple
        ``(total, no_fall_accuracy, fall_accuracy)`` where ``total`` is the
        share of all cases classified correctly, ``no_fall_accuracy`` the
        share of true negatives among all actual negatives, and
        ``fall_accuracy`` the share of true positives among all actual
        positives.
    """
    tn, fp, fn, tp = cm
    total = (tn + tp) / (fp + fn + tn + tp)
    # Actual negatives are tn + fp; actual positives are tp + fn.
    no_fall_accuracy = tn / (tn + fp)
    fall_accuracy = tp / (tp + fn)
    return (total, no_fall_accuracy, fall_accuracy)


In [199]:
# For every dataset, fit 10 LDA models on fresh random train/test splits and
# report the mean overall, no-fall and fall accuracy across those runs.

for name, frame in data_sources.items():
    print(f'--- {name} ---')
    runs = [analyse_confusion_matrix(lda_confusion_matrix(frame)) for _ in range(10)]
    # Transpose the list of (total, no_fall, fall) triples into three series.
    totals, nfs, falls = zip(*runs)
    avg_total = np.array(totals).mean()
    avg_nfs = np.array(nfs).mean()
    avg_falls = np.array(falls).mean()
    print(f'Total {avg_total}\nNo fall {avg_nfs}\nFalls {avg_falls}\n')

# to do - stick this in a df

--- updrs_sigfall ---
Total 0.9055555555555556
No fall 0.9758206399176519
Falls 0.1438936063936064

--- updrs_future_sigfall ---
Total 0.908904109589041
No fall 0.9902316458809814
Falls 0.0644083694083694

--- inc_updrs_sigfall ---
Total 0.9098159509202454
No fall 0.9793244806764472
Falls 0.1136620324120324

--- delta_updrs_sigfall ---
Total 0.9178082191780822
No fall 0.9948026186739878
Falls 0.038553113553113555



In [200]:
# Same 10-split averaging as for data_sources, applied to raw_data_sources.
for name, frame in raw_data_sources.items():
    print(f'--- {name} ---')
    runs = [analyse_confusion_matrix(lda_confusion_matrix(frame)) for _ in range(10)]
    # Transpose the list of (total, no_fall, fall) triples into three series.
    totals, nfs, falls = zip(*runs)
    avg_total = np.array(totals).mean()
    avg_nfs = np.array(nfs).mean()
    avg_falls = np.array(falls).mean()
    print(f'Total {avg_total}\nNo fall {avg_nfs}\nFalls {avg_falls}\n')

# to do - stick this in a df

--- updrs_sigfall_raw ---
Total 0.912962962962963
No fall 0.9813078782642564
Falls 0.1104227145403616

--- updrs_future_sigfall_raw ---
Total 0.9006849315068493
No fall 0.9812166162556639
Falls 0.09642680848563201

--- inc_updrs_sigfall_raw ---
Total 0.9116564417177914
No fall 0.980012537204393
Falls 0.14357270180799592

--- delta_updrs_sigfall_raw ---
Total 0.9157534246575342
No fall 0.9969810040705562
Falls 0.03864468864468864



In [188]:
# TODO: run the confusion-matrix analysis in a loop so the results can be compared,
# using k-fold cross-validation instead of a single train/test split

## Balanced data set approach
What happens if we balance out the SIGFALL class ratios and recreate our models?
Can we beat random guessing?

In [217]:
# Get the row indexes where SIGFALL == 0. Despite the variable name, these are
# the rows WITHOUT a fall flag (presumably "no significant fall" - confirm).
sigfall_indexes = updrs_sigfall.index[updrs_sigfall['SIGFALL'] == 0].to_list()
# Every remaining row (SIGFALL != 0) is counted as a fall.
n_falls = len(updrs_sigfall) - len(sigfall_indexes)
# Number of SIGFALL == 0 rows to remove so both groups end up the same size.
drop = len(sigfall_indexes) - n_falls
# Shuffle in place so the first `drop` entries form a random selection.
# NOTE(review): no seed is set in the visible cells, so this differs per run.
random.shuffle(sigfall_indexes)

print(sigfall_indexes[0:drop])

# now drop those indexes to create a balanced data set

[1096, 270, 577, 212, 754, 759, 222, 506, 16, 1034, 1191, 1060, 1045, 1164, 906, 588, 1415, 26, 497, 211, 1082, 1248, 1118, 1261, 1277, 1410, 648, 843, 452, 512, 1607, 874, 1147, 1014, 548, 328, 168, 1154, 1080, 2, 985, 480, 1124, 969, 1203, 775, 1027, 1507, 1334, 1108, 745, 1579, 665, 639, 1376, 96, 1081, 122, 1391, 1533, 909, 899, 1271, 279, 1115, 1339, 1033, 1606, 1061, 1288, 1498, 479, 272, 456, 370, 1282, 1448, 520, 664, 721, 10, 1443, 981, 378, 108, 289, 1411, 133, 769, 626, 1526, 1035, 742, 1032, 1316, 508, 907, 644, 751, 1497, 1476, 57, 824, 837, 758, 362, 1169, 187, 982, 1610, 393, 1510, 900, 517, 1229, 1471, 1123, 1070, 542, 27, 183, 4, 1015, 1502, 1289, 359, 794, 1137, 1414, 1025, 854, 566, 82, 462, 79, 390, 1311, 368, 1530, 842, 376, 256, 1117, 1011, 955, 214, 166, 764, 531, 311, 165, 195, 1456, 1583, 1421, 1374, 1090, 523, 1303, 1103, 1325, 167, 1262, 307, 1010, 1438, 159, 1494, 537, 1254, 199, 1508, 45, 992, 403, 560, 1230, 849, 1023, 1249, 322, 424, 510, 1275, 1540, 92, 