In [8]:
import math
import json
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, normalize

original_data = np.genfromtxt('../working_data/updrsii_all_q_p_s.csv', delimiter=',', skip_header=True)
n_rows, n_columns = original_data.shape

data = original_data[:,0:(n_columns - 1)]
labels = original_data[:,(n_columns - 1)]

scaler = StandardScaler().fit(data)
data_standardized = scaler.transform(data)

data_normalized = normalize(data)

cross_fold_validations = StratifiedKFold(n_splits=10, shuffle=True)


In [9]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

class Model:
    def __init__(self,name,ml_model):
        self.name = name
        self.model = ml_model
        self.f1_scores = []
        self.roc_auc_scores = []
        self.precision_scores = []
        self.recall_scores = []
        
    def add_f1(self,x):
        self.f1_scores.append(x)
        
    def add_roc(self,x):
        self.roc_auc_scores.append(x)
        
    def add_precision(self,x):
        self.precision_scores.append(x)
        
    def add_recall(self,x):
        self.recall_scores.append(x)
        
    def summary(self):
        f1_mean = np.mean(self.f1_scores)
        f1_std = np.std(self.f1_scores)
        roc_mean = np.mean(self.roc_auc_scores)
        roc_std = np.std(self.roc_auc_scores)
        prec_mean = np.mean(self.precision_scores)
        prec_std = np.std(self.precision_scores)
        rec_mean = np.mean(self.recall_scores)
        rec_std = np.std(self.recall_scores)
        return {self.name : {
            'f1_mean' : f1_mean,
            'f1_std' : f1_std,
            'roc_mean' : roc_mean,
            'roc_std' : roc_std,
            'precision_mean' : prec_mean,
            'precision_std' : prec_std,
            'recall_mean' : rec_mean,
            'recall_std' : rec_std
        }}

def model_runner(model_list):
    for train, test in cross_fold_validations.split(data,labels):
        x_train = data[train]
        x_test = data[test]
        y_train = labels[train]
        y_test = labels[test]

        for m in model_list:
            m.model.fit(x_train,y_train)
            predictions = m.model.predict(x_test)
            roc_auc = roc_auc_score(y_test, predictions)
            f1 = f1_score(y_test,predictions)
            precision = precision_score(y_test, predictions, zero_division=0)
            recall = recall_score(y_test, predictions)
            m.add_f1(f1)
            m.add_roc(roc_auc)
            m.add_precision(precision)
            m.add_recall(recall)
            
def model_runner_standardized(model_list):
    for train, test in cross_fold_validations.split(data,labels):
        x_train = data_standardized[train]
        x_test = data_standardized[test]
        y_train = labels[train]
        y_test = labels[test]

        for m in model_list:
            m.model.fit(x_train,y_train)
            predictions = m.model.predict(x_test)
            roc_auc = roc_auc_score(y_test, predictions)
            f1 = f1_score(y_test,predictions)
            precision = precision_score(y_test, predictions, zero_division=0)
            recall = recall_score(y_test, predictions)
            m.add_f1(f1)
            m.add_roc(roc_auc)
            m.add_precision(precision)
            m.add_recall(recall)
 
def model_runner_normalized(model_list):
    for train, test in cross_fold_validations.split(data,labels):
        x_train = data_normalized[train]
        x_test = data_normalized[test]
        y_train = labels[train]
        y_test = labels[test]

        for m in model_list:
            m.model.fit(x_train,y_train)
            predictions = m.model.predict(x_test)
            roc_auc = roc_auc_score(y_test, predictions)
            f1 = f1_score(y_test,predictions)
            precision = precision_score(y_test, predictions, zero_division=0)
            recall = recall_score(y_test, predictions)
            m.add_f1(f1)
            m.add_roc(roc_auc)
            m.add_precision(precision)
            m.add_recall(recall)

In [10]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

gnb_models = [ 
              Model('GNB1',GaussianNB(priors=[0.1,0.9])), 
              Model('GNB2', GaussianNB(priors=[0.2,0.8])),
              Model('GNB3', GaussianNB(priors=[0.3,0.7])),
              Model('GNB4', GaussianNB(priors=[0.4,0.6])),
              Model('GNB5', GaussianNB(priors=[0.5,0.5])),
              Model('GNB6', GaussianNB(priors=[0.6,0.4])),
              Model('GNB7', GaussianNB(priors=[0.7,0.3])),
              Model('GNB8', GaussianNB(priors=[0.8,0.2])),
              Model('GNB9', GaussianNB(priors=[0.9,0.1])),
             ]
model_runner(gnb_models)


In [12]:
# Linear Discriminant Analysis

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda_models = [
    Model('LDA1', LinearDiscriminantAnalysis(solver='svd', priors=[0.25,0.75])),
    Model('LDA2', LinearDiscriminantAnalysis(solver='svd', priors=[0.5,0.5])),
    Model('LDA3', LinearDiscriminantAnalysis(solver='svd', priors=[0.75,0.25])),
    # Model('LDA4', LinearDiscriminantAnalysis(solver='eigen', priors=[0.25,0.75])),
    # Model('LDA5', LinearDiscriminantAnalysis(solver='eigen', priors=[0.5,0.5])),
    # Model('LDA6', LinearDiscriminantAnalysis(solver='eigen', priors=[0.75,0.25])),
    Model('LDA7', LinearDiscriminantAnalysis(solver='lsqr', priors=[0.25,0.75])),
    Model('LDA8', LinearDiscriminantAnalysis(solver='lsqr', priors=[0.5,0.5])),
    Model('LDA9', LinearDiscriminantAnalysis(solver='lsqr', priors=[0.75,0.25])),
]

model_runner_standardized(lda_models)

In [13]:
# Quadratic Discriminant Analysis

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda_models = [ 
              Model('QDA1',QuadraticDiscriminantAnalysis(priors=[0.1,0.9])), 
              Model('QDA2', QuadraticDiscriminantAnalysis(priors=[0.2,0.8])),
              Model('QDA3', QuadraticDiscriminantAnalysis(priors=[0.3,0.7])),
              Model('QDA4', QuadraticDiscriminantAnalysis(priors=[0.4,0.6])),
              Model('QDA5', QuadraticDiscriminantAnalysis(priors=[0.5,0.5])),
              Model('QDA6', QuadraticDiscriminantAnalysis(priors=[0.6,0.4])),
              Model('QDA7', QuadraticDiscriminantAnalysis(priors=[0.7,0.3])),
              Model('QDA8', QuadraticDiscriminantAnalysis(priors=[0.8,0.2])),
              Model('QDA9', QuadraticDiscriminantAnalysis(priors=[0.9,0.1])),
             ]

model_runner_standardized(qda_models)



In [22]:
# Support Vector Machines
from sklearn.svm import SVC

svm_models = [
    Model('SVM1', SVC(kernel='linear', class_weight=None)),
    Model('SVM2', SVC(kernel='linear', class_weight='balanced')),
    Model('SVM3', SVC(kernel='rbf', class_weight=None)),
    Model('SVM4', SVC(kernel='rbf', class_weight='balanced')),
    Model('SVM5', SVC(kernel='poly', class_weight=None, degree=3)),
    Model('SVM6', SVC(kernel='poly', class_weight=None, degree=4)),
    Model('SVM7', SVC(kernel='poly', class_weight=None, degree=5)),
    Model('SVM8', SVC(kernel='poly', class_weight=None, degree=6)),
    Model('SVM9', SVC(kernel='poly', class_weight=None, degree=7)),
    Model('SVM10', SVC(kernel='poly', class_weight='balanced', degree=3)),
    Model('SVM11', SVC(kernel='poly', class_weight='balanced', degree=4)),
    Model('SVM12', SVC(kernel='poly', class_weight='balanced', degree=5)),
    Model('SVM13', SVC(kernel='poly', class_weight='balanced', degree=6)),
    Model('SVM14', SVC(kernel='poly', class_weight='balanced', degree=7)),
    Model('SVMx', SVC(kernel='poly', class_weight=None, degree=8)),
    Model('SVMxx', SVC(kernel='poly', class_weight=None, degree=9)),
]

model_runner_normalized(svm_models)

In [25]:
svm_models[15].summary()

{'SVMxx': {'f1_mean': 0.13191919191919194,
  'f1_std': 0.11514460372666482,
  'roc_mean': 0.5299452457264957,
  'roc_std': 0.04222253659466368,
  'precision_mean': 0.33333333333333337,
  'precision_std': 0.32489314482696546,
  'recall_mean': 0.08472222222222223,
  'recall_std': 0.07465197027987047}}

In [15]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

rf_models = [
    Model('RF1',RandomForestClassifier(n_estimators=50, class_weight='balanced')),
    Model('RF2',RandomForestClassifier(n_estimators=100, class_weight='balanced')),
    Model('RF3',RandomForestClassifier(n_estimators=150, class_weight='balanced')),
    Model('RF4',RandomForestClassifier(n_estimators=200, class_weight='balanced')),
    Model('RF5',RandomForestClassifier(n_estimators=50, class_weight='balanced_subsample')),
    Model('RF6',RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample')),
    Model('RF7',RandomForestClassifier(n_estimators=150, class_weight='balanced_subsample')),
    Model('RF8',RandomForestClassifier(n_estimators=200, class_weight='balanced_subsample'))    
]

model_runner(rf_models)

In [16]:
# AdaBoost

from sklearn.ensemble import AdaBoostClassifier

ada_models = [
    Model('ADA1', AdaBoostClassifier(n_estimators=100)),
    Model('ADA2', AdaBoostClassifier(n_estimators=150)),
    Model('ADA3', AdaBoostClassifier(n_estimators=200)),
    Model('ADA4', AdaBoostClassifier(n_estimators=250)),
    Model('ADA5', AdaBoostClassifier(n_estimators=300)),
    Model('ADA6', AdaBoostClassifier(n_estimators=350)),
    Model('ADA7', AdaBoostClassifier(n_estimators=400)),
]

model_runner(ada_models)

In [17]:
# XGBoost
import xgboost as xgb

xgb_models = [
    Model('XGB1',xgb.XGBRFClassifier(booster='gbtree')),
    Model('XGB2',xgb.XGBRFClassifier(booster='gblinear'))
]

model_runner(xgb_models)

Parameters: { "colsample_bynode", "num_parallel_tree", "subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "colsample_bynode", "num_parallel_tree", "subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "colsample_bynode", "num_parallel_tree", "subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting fl

In [18]:
from sklearn.linear_model import LogisticRegression

logreg_models = [
    Model('LOGREG2', LogisticRegression(multi_class='ovr', max_iter=2000, penalty='l1', class_weight=None, solver='liblinear')),
    Model('LOGREG4', LogisticRegression(multi_class='ovr', max_iter=2000, penalty='l1', class_weight='balanced', solver='liblinear')),
    Model('LOGREG5', LogisticRegression(multi_class='ovr', max_iter=2000, penalty='l2', class_weight=None, solver='lbfgs')),
    Model('LOGREG6', LogisticRegression(multi_class='ovr', max_iter=2000, penalty='l2', class_weight=None, solver='liblinear')),
    Model('LOGREG7', LogisticRegression(multi_class='ovr', max_iter=2000, penalty='l2', class_weight='balanced', solver='lbfgs')),
    Model('LOGREG8', LogisticRegression(multi_class='ovr', max_iter=2000, penalty='l2', class_weight='balanced', solver='liblinear')),
    Model('LOGREG10', LogisticRegression(multi_class='ovr', max_iter=2000, penalty='elasticnet', class_weight=None, solver='saga', l1_ratio=0.5)),
    Model('LOGREG12', LogisticRegression(multi_class='ovr', max_iter=2000, penalty='elasticnet', class_weight='balanced', solver='saga', l1_ratio=0.5))
]

model_runner_standardized(logreg_models)

In [19]:
# Write models to JSON for analysis in results section

overall_results = '['

for m in gnb_models:
    overall_results = overall_results + json.dumps(m.summary()) + ','

for m in lda_models:
    overall_results = overall_results + json.dumps(m.summary()) + ','
    
for m in qda_models:
    overall_results = overall_results + json.dumps(m.summary()) + ','
    
for m in svm_models:
    overall_results = overall_results + json.dumps(m.summary()) + ','
    
for m in rf_models:
    overall_results = overall_results + json.dumps(m.summary()) + ','
    
for m in ada_models:
    overall_results = overall_results + json.dumps(m.summary()) + ','
    
for m in xgb_models:
    overall_results = overall_results + json.dumps(m.summary()) + ','
    
for m in logreg_models:
    overall_results = overall_results + json.dumps(m.summary()) + ','

overall_results = overall_results.rstrip(overall_results[-1])
overall_results = overall_results + ']'

with open('../results/uii_all_q_p.json', 'w') as outfile:
    outfile.write(overall_results)

In [20]:
# Ensemble learner

from sklearn.ensemble import VotingClassifier

models = list()
models.append(('LDA3', LinearDiscriminantAnalysis(solver='svd', priors=[0.75,0.25])))
models.append(('QDA9', QuadraticDiscriminantAnalysis(priors=[0.9,0.1])))
models.append(('LOGREG4', LogisticRegression(multi_class='ovr', max_iter=2000, penalty='l1', class_weight='balanced', solver='liblinear')))

# define the hard voting ensemble
ensemble = [Model('quorum',VotingClassifier(estimators=models, voting='hard'))]

model_runner_standardized(ensemble)

ensemble[0].summary()



{'quorum': {'f1_mean': 0.36150197628458497,
  'f1_std': 0.16934759032975893,
  'roc_mean': 0.6588888888888889,
  'roc_std': 0.11709994053385807,
  'precision_mean': 0.3283882783882784,
  'precision_std': 0.13709270792064385,
  'recall_mean': 0.42777777777777776,
  'recall_std': 0.2326378524023345}}

In [21]:
with open('../results/uii_all_q_p_ensemble.json', 'w') as outfile:
    outfile.write(json.dumps(ensemble[0].summary()))