In [104]:
import math
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Load the short-form CSV (short form for now); skip_header=1 drops the header row.
original_data = np.genfromtxt('../working_data/updrsii_short_form.csv', delimiter=',', skip_header=1)
n_rows, n_columns = original_data.shape

# Every column except the last is a feature; the last column holds the labels.
data = original_data[:, :-1]
labels = original_data[:, -1]

# Copy of the features scaled with statistics computed over the full dataset.
scaler = StandardScaler().fit(data)
data_standardized = scaler.transform(data)

# A fixed seed makes the shuffled folds reproducible across kernel restarts and
# identical for every call to .split(). Without it each call reshuffles, so
# model families evaluated in separate calls are scored on different folds.
RANDOM_SEED = 0
cross_fold_validations = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

overall_results = []

In [106]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

class Model:
    """Pairs a named model object with the metric scores recorded for it.

    Attributes:
        name: label used to identify this configuration.
        model: the wrapped model object.
        f1_scores, roc_auc_scores, precision_scores, recall_scores:
            lists that grow by one value per corresponding add_* call.
    """

    def __init__(self, name, ml_model):
        """Store the label and model and start with empty score histories."""
        self.name, self.model = name, ml_model
        # Each metric gets its own fresh list so instances never share state.
        self.f1_scores, self.roc_auc_scores = [], []
        self.precision_scores, self.recall_scores = [], []

    def add_f1(self, x):
        """Record one F1 score."""
        self.f1_scores += [x]

    def add_roc(self, x):
        """Record one ROC AUC score."""
        self.roc_auc_scores += [x]

    def add_precision(self, x):
        """Record one precision score."""
        self.precision_scores += [x]

    def add_recall(self, x):
        """Record one recall score."""
        self.recall_scores += [x]

def _positive_class_scores(fitted_model, x_test):
    """Return continuous scores used to rank test samples for ROC AUC.

    ROC AUC measures how well a model *ranks* samples, so it needs
    probabilities or decision values. Feeding it hard 0/1 predictions
    collapses the curve to a single operating point.

    Assumes a binary problem (the F1/precision/recall calls in this notebook
    use the binary default as well).
    """
    if hasattr(fitted_model, 'predict_proba'):
        # Column 1 corresponds to the larger class label, which is the class
        # roc_auc_score treats as positive for binary targets.
        return fitted_model.predict_proba(x_test)[:, 1]
    if hasattr(fitted_model, 'decision_function'):
        return fitted_model.decision_function(x_test)
    # Last resort for models that expose neither: fall back to hard labels.
    return fitted_model.predict(x_test)


def _run_cross_validation(model_list, standardize):
    """Fit and score every model in model_list on each cross-validation fold.

    Args:
        model_list: iterable of Model wrappers; each gets one value appended
            to every score list per fold.
        standardize: when true, features are scaled with a StandardScaler
            fitted on the training fold only, so no statistics from the
            held-out fold leak into training.
    """
    for train, test in cross_fold_validations.split(data, labels):
        x_train, x_test = data[train], data[test]
        y_train, y_test = labels[train], labels[test]

        if standardize:
            fold_scaler = StandardScaler().fit(x_train)
            x_train = fold_scaler.transform(x_train)
            x_test = fold_scaler.transform(x_test)

        for m in model_list:
            m.model.fit(x_train, y_train)
            predictions = m.model.predict(x_test)
            scores = _positive_class_scores(m.model, x_test)

            # Compute every metric before recording any of them, so a failure
            # in one metric cannot leave the score lists with unequal lengths.
            roc_auc = roc_auc_score(y_test, scores)
            f1 = f1_score(y_test, predictions)
            precision = precision_score(y_test, predictions)
            recall = recall_score(y_test, predictions)

            m.add_f1(f1)
            m.add_roc(roc_auc)
            m.add_precision(precision)
            m.add_recall(recall)


def model_runner(model_list):
    """Cross-validate the models on the raw (unscaled) features."""
    _run_cross_validation(model_list, standardize=False)


def model_runner_standardized(model_list):
    """Cross-validate the models on features standardized within each fold."""
    _run_cross_validation(model_list, standardize=True)
        

In [81]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Sweep the class priors from (0.1, 0.9) to (0.9, 0.1) in steps of 0.1;
# the models are named GNB1..GNB9 in that order.
gnb_models = [
    Model('GNB{}'.format(position), GaussianNB(priors=[first_prior, second_prior]))
    for position, (first_prior, second_prior) in enumerate(
        [
            (0.1, 0.9),
            (0.2, 0.8),
            (0.3, 0.7),
            (0.4, 0.6),
            (0.5, 0.5),
            (0.6, 0.4),
            (0.7, 0.3),
            (0.8, 0.2),
            (0.9, 0.1),
        ],
        start=1,
    )
]
model_runner(gnb_models)


In [101]:
# Recall scores recorded for GNB9 (priors=[0.9, 0.1]), the last model in gnb_models.
gnb_models[8].recall_scores
# TODO: need a model-selection method.
# TODO: try plotting F1 vs. ROC AUC, and precision vs. recall.
# TODO: average the statistics across folds?

[0.16666666666666666,
 0.6666666666666666,
 0.3333333333333333,
 0.3333333333333333,
 0.4,
 0.0,
 0.0,
 0.2,
 0.8,
 0.4]

In [108]:
# Linear Discriminant Analysis

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Grid of 3 solvers x 3 prior settings, numbered LDA1..LDA9 with the solver
# varying slowest (svd -> eigen -> lsqr) and the priors varying fastest.
lda_models = [
    Model(
        'LDA%d' % (3 * solver_index + prior_index + 1),
        LinearDiscriminantAnalysis(solver=solver_name, priors=[first_prior, second_prior]),
    )
    for solver_index, solver_name in enumerate(('svd', 'eigen', 'lsqr'))
    for prior_index, (first_prior, second_prior) in enumerate(
        ((0.25, 0.75), (0.5, 0.5), (0.75, 0.25))
    )
]

model_runner_standardized(lda_models)

In [115]:
# F1 scores recorded for LDA5 (solver='eigen', priors=[0.5, 0.5]).
lda_models[4].f1_scores

[0.0,
 0.4,
 0.16666666666666666,
 0.4615384615384615,
 0.25000000000000006,
 0.33333333333333337,
 0.3157894736842105,
 0.3076923076923077,
 0.35294117647058826,
 0.37499999999999994]

In [None]:
# Quadratic Discriminant Analysis

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# TODO: populate with Model(...) wrappers around QuadraticDiscriminantAnalysis
# configurations. Left as an empty list so this cell parses and a full
# "Restart & Run All" does not stop here with a SyntaxError.
qda_models = []

In [48]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
