In [28]:
import math
import numpy as np
import optunity
import optunity.metrics
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import normalize

# short form for now
original_data = np.genfromtxt('../../working_data/updrsii_short_form.csv', delimiter=',', skip_header=True)
n_rows, n_columns = original_data.shape

data = original_data[:,0:(n_columns - 1)]
labels = original_data[:,(n_columns - 1)]

# Common cross validator for all models
cv_decorator = optunity.cross_validated(x=data, y=labels, num_folds=10)

results = []


In [20]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

def gnb_tuned_auroc(x_train, y_train, x_test, y_test, sigfall_prior):
    no_fall_prior = 1.0 - sigfall_prior
    model = GaussianNB(priors=[no_fall_prior,sigfall_prior]).fit(x_train, y_train)
    decision_values = model.predict(x_test)
    auc = optunity.metrics.roc_auc(y_test, decision_values)
    return auc
    
    
gnb_tuned_auroc = cv_decorator(gnb_tuned_auroc)
gnb_optimal_pars, gnb_info, _ = optunity.maximize(gnb_tuned_auroc, solver_name='grid search', num_evals=100, sigfall_prior=[0.01,0.99])

print("Optimal parameters" + str(gnb_optimal_pars))
print("AUROC of tuned model: %1.3f" % gnb_info.optimum)

results.append({'model': 'Gaussian Naive Bayes',
               'Optimal parameters': gnb_optimal_pars,
               'ROC_AUC': gnb_info.optimum
               })

Optimal parameters{'sigfall_prior': 0.024700000000000024}
AUROC of tuned model: 0.633


In [45]:
clf.cv_results_

{'mean_fit_time': array([0.00174379, 0.00102305, 0.00102558, 0.00101542, 0.00106082,
        0.00085592]),
 'std_fit_time': array([4.82738008e-04, 2.21816425e-04, 1.71388144e-04, 2.38362258e-04,
        1.20339529e-04, 2.08574700e-05]),
 'mean_score_time': array([0.00120292, 0.0006485 , 0.00061941, 0.0005538 , 0.00066643,
        0.00045686]),
 'std_score_time': array([5.42386214e-04, 1.35459022e-04, 1.94462822e-04, 8.09670899e-05,
        7.78874344e-05, 6.04687802e-05]),
 'param_priors': masked_array(data=[list([0.01, 0.99]), list([0.2, 0.8]), list([0.3, 0.7]),
                    list([0.7, 0.3]), list([0.8, 0.2]), list([0.01, 0.99])],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'priors': [0.01, 0.99]},
  {'priors': [0.2, 0.8]},
  {'priors': [0.3, 0.7]},
  {'priors': [0.7, 0.3]},
  {'priors': [0.8, 0.2]},
  {'priors': [0.01, 0.99]}],
 'split0_test_score': array([0.1375, 0.175 , 0.5125, 0.8125, 0.8125

In [46]:
# GNB via in house hyperparameter optimzers

from sklearn.model_selection import GridSearchCV

x_train = data[0:400,:]
y_train = labels[0:400]

parameters = {'priors':[[0.01,0.99], [0.2,0.8], [0.3,0.7], [0.7,0.3], [0.8,0.2], [0.01,0.99]]}
gnb = GaussianNB()
clf = GridSearchCV(gnb, parameters)
clf.fit(x_train, y_train)

clf.cv_results_

{'mean_fit_time': array([0.00289783, 0.00142674, 0.00104995, 0.00087023, 0.00102296,
        0.00095797]),
 'std_fit_time': array([7.76019896e-04, 3.82517996e-04, 1.33545685e-04, 5.17318873e-05,
        1.86303555e-04, 1.56258889e-04]),
 'mean_score_time': array([0.00284405, 0.00077724, 0.00056081, 0.00048723, 0.00053992,
        0.00058551]),
 'std_score_time': array([2.04552010e-03, 3.36450230e-04, 1.14658407e-04, 6.32239321e-05,
        4.54142273e-05, 1.27651400e-04]),
 'param_priors': masked_array(data=[list([0.01, 0.99]), list([0.2, 0.8]), list([0.3, 0.7]),
                    list([0.7, 0.3]), list([0.8, 0.2]), list([0.01, 0.99])],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'priors': [0.01, 0.99]},
  {'priors': [0.2, 0.8]},
  {'priors': [0.3, 0.7]},
  {'priors': [0.7, 0.3]},
  {'priors': [0.8, 0.2]},
  {'priors': [0.01, 0.99]}],
 'split0_test_score': array([0.1375, 0.175 , 0.5125, 0.8125, 0.8125

In [29]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

forest_space = { 'criterion': {
        'gini': {'n_estimators': [50,200], 'cwn': [1,3]},
        'entropy': {'n_estimators': [50,200], 'cwn': [1,3]},
        'log_loss': {'n_estimators': [50,200], 'cwn': [1,3]}
    }
}

def forest_tuned_auroc(x_train, y_train, x_test, y_test, criterion, n_estimators, cwn):
    c = int(cwn)
    nest = math.floor(n_estimators)
    if c == 1 :
        class_weight = 'balanced'
    elif c == 2 :
        class_weight = 'balanced_subsample'
    else :
        class_weight = None
        
    model = RandomForestClassifier(criterion=criterion, n_estimators=nest, class_weight = class_weight).fit(x_train, y_train)
    decision_values = model.predict(x_test)
    auc = optunity.metrics.roc_auc(y_test, decision_values)
    return auc

forest_tuned_auroc = cv_decorator(forest_tuned_auroc)
rf_optimal_pairs, rf_info, _ = optunity.maximize_structured(forest_tuned_auroc, search_space=forest_space, num_evals=100)

print("Optimal parameters" + str(rf_optimal_pairs))
print("AUROC of tuned RF: %1.3f" % rf_info.optimum)
    

NameError: name 'rf_optimal_pars' is not defined

In [31]:
# AdaBoost

from sklearn.ensemble import AdaBoostClassifier

def ada_tuned_auroc(x_train, y_train, x_test, y_test, n_estimators, lrate):
    nest = math.floor(n_estimators)
    model = AdaBoostClassifier(n_estimators=nest, learning_rate=lrate).fit(x_train, y_train)
    decision_values = model.predict(x_test)
    auc = optunity.metrics.roc_auc(y_test, decision_values)
    return auc

ada_tuned_auroc = cv_decorator(ada_tuned_auroc)
ada_optimal_pars, ada_info, _ = optunity.maximize(ada_tuned_auroc, solver_name='grid search', num_evals=100, n_estimators=[50,150], lrate=[1, 10])

print("Optimal parameters" + str(ada_optimal_pars))
print("AUROC of tuned model: %1.3f" % ada_info.optimum)

Optimal parameters{'n_estimators': 72.5, 'lrate': 1.045}
AUROC of tuned model: 0.543
