In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Directory added to the import path before the project-local imports in the
# next cell. Set the NAFLD_SCRIPTS_DIR environment variable to run this
# notebook from another checkout; the fallback is the original hard-coded path.
SCRIPTS_DIR = os.environ.get(
    'NAFLD_SCRIPTS_DIR',
    '/Users/alexgre/workspace/py3/NAFLD/clean_project/scripts',
)
# Re-running this cell in a live kernel must not stack duplicate entries.
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [3]:
from utils import pkl_load, save_models, results2json, load_json
from nafld_config import SEED, DATASET, ROOT, RESULTS_DIR, MODEL_DIR
from experiments import auc_expr, two_results_t_test

2019-04-03 13:46:38 - INFO - sciki-learn package version: 0.20.0


In [4]:
# Load the four pickled feature matrices (X) from the DATASET directory.
# Variable names follow <encoding>_<w|wo>_homa_dx; the "wo" variants come from
# the "*_no_homa_X.pkl" files.
# NOTE(review): "ohe" presumably means one-hot encoded and "mix" a mixed
# categorical/continuous encoding — confirm against the preprocessing code.
ohe_w_homa_dx = pkl_load(f"{DATASET}/ohe_X.pkl")
ohe_wo_homa_dx = pkl_load(f"{DATASET}/ohe_no_homa_X.pkl")
mix_w_homa_dx = pkl_load(f"{DATASET}/mix_X.pkl")
mix_wo_homa_dx = pkl_load(f"{DATASET}/mix_no_homa_X.pkl")

**file name convention:**

```
{model}_{data}_{liver_disease}_{w/wo}_HOMA_(other).{suffix}
```

In [5]:
def run_experiment(clf, tuned_parameters, dx, dy, model_name, dty, disease, homa="wo", score_method='roc_auc'):
    """Run one ``auc_expr`` experiment, print its summary, and persist the outcome.

    Args:
        clf: Estimator instance forwarded to ``auc_expr``.
        tuned_parameters: Hyper-parameter search space forwarded to ``auc_expr``.
        dx: Feature matrix.
        dy: Target labels.
        model_name: Model tag; first component of the saved names.
        dty: Data-encoding tag (this notebook passes "OHE" or "MIX").
        disease: Outcome tag (this notebook passes "NAFLD", "FIB" or "NASH").
        homa: HOMA tag placed before ``_HOMA`` in the saved names
            (this notebook passes "w" or "wo").
        score_method: Scoring name forwarded to ``auc_expr``.

    Returns:
        None. The summary is printed; the model and the results dict are
        handed to ``save_models`` and ``results2json`` respectively.
    """
    tuned_model, top_score, summary = auc_expr(clf, tuned_parameters, dx, dy, score_method)

    # Specificity is reported as the complement of the false-positive rate.
    sensitivity = summary['tpr']
    specificity = 1 - summary['fpr']
    print(f"5-cv performance: {top_score:.4f}; sensitivity: {sensitivity:.4f}; specificity: {specificity:.4f}")
    print(f"20x5-fold cv mean: {summary['cv_mean']:.4f} std: {summary['cv_std']:.4f} 95% CI: {summary['95ci']}")

    # The model and the results file share one name stem so they can be paired later.
    stem = f"{model_name}_{dty}_{disease}_{homa}_HOMA"
    save_models(tuned_model, f"{stem}_model")
    results2json(summary, f"{stem}_results.json")

# Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Hyper-parameter search space for LogisticRegression (passed to run_experiment).
# random_state is a one-element list so every candidate is seeded with SEED.
tuned_parameters = {
    'max_iter': [100, 500, 1000, 2000, 5000],
    'tol': [0.0001, 0.001, 0.01, 0.1],
    'random_state': [SEED],
    'C': [0.01, 0.1, 1.0, 10.0, 50.0, 100.0, 500.0, 1000.0],
    'solver': ['lbfgs', 'liblinear', 'newton-cg'],
    'class_weight': [None, 'balanced'],
}

In [8]:
model_name = "LR"

## OHE

In [9]:
dty = "OHE"

### NAFLD

In [10]:
dy = pkl_load(f"{DATASET}/y_nafld.pkl")
disease = "NAFLD"

In [11]:
# Experiment: features WITH HOMA, scored by ROC AUC.
# The string literal below is a no-op at runtime; it records the summary this
# cell printed (it matches the output stored under the cell). Refresh it if the
# experiment is re-run with different data or settings.
'''
5-cv performance: 0.8759; sensitivity: 0.7370; specificity: 0.8699
20x5-fold cv mean: 0.8731 std: 0.0355 95% CI: (0.8660087032065235, 0.8801587773389182)
'''
# Select the already-loaded OHE matrix that includes HOMA (nothing is read from disk here).
dx = ohe_w_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, model_name, dty, disease, homa="w", score_method='roc_auc')

2019-04-03 13:47:13 - INFO - total number of parameter combinations searched: 240


Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 1061 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   12.6s finished
2019-04-03 13:47:26 - INFO - best model:
 LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=500,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=13,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
2019-04-03 13:47:26 - INFO - parameters tuning best results (averaged): 0.8797056175744881
2019-04-03 13:47:26 - INFO - 5-fold CV final results on all prediction (not averaged): 0.8759007047272153
2019-04-03 13:47:26 - INFO - seeds: [41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3]


5-cv performance: 0.8759; sensitivity: 0.7370; specificity: 0.8699
20x5-fold cv mean: 0.8731 std: 0.0355 95% CI: (0.8660087032065235, 0.8801587773389182)


In [None]:
# Experiment: features WITHOUT HOMA, scored by ROC AUC.
# The string literal below is a no-op at runtime.
# NOTE(review): presumably pasted from an earlier run — no output is stored for
# this cell, so the numbers cannot be checked here.
'''
5-cv performance: 0.8657; sensitivity: 0.7139; specificity: 0.8836
20x5-fold cv mean: 0.8632 std: 0.0361 95% CI: (0.8560302511221977, 0.8704303400108072)
'''
# Select the already-loaded OHE matrix without HOMA (nothing is read from disk here).
dx = ohe_wo_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
# Compare the with-HOMA and without-HOMA runs for the current
# model_name / dty / disease. In the file names "w" means with HOMA and "wo"
# means without HOMA. Both runs above must have finished first, because the
# result files are read back from disk here.
# NOTE(review): assumes results2json writes into RESULTS_DIR — confirm in utils.
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
# Pass the 'cv_scores' entry of each run to the project's t-test helper.
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### FIB

In [None]:
dy = pkl_load(f"{DATASET}/y_fib.pkl")
disease = "FIB"

In [None]:
#with homa auc_roc
# load dataset
dx = ohe_w_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
# no homa auc_roc
# load dataset
dx = ohe_wo_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### NASH

In [None]:
dy = pkl_load(f"{DATASET}/y_nash.pkl")
disease = "NASH"

In [None]:
#with homa auc_roc
# load dataset
dx = ohe_w_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
# no homa auc_roc
# load dataset
dx = ohe_wo_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")

two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

## MIX

In [None]:
dty = "MIX"

### NAFLD

In [None]:
dy = pkl_load(f"{DATASET}/y_nafld.pkl")
disease = "NAFLD"

In [None]:
dx = mix_w_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")

two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### FIB

In [None]:
dy = pkl_load(f"{DATASET}/y_fib.pkl")
disease = "FIB"

In [None]:
dx = mix_w_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### NASH

In [None]:
dy = pkl_load(f"{DATASET}/y_nash.pkl")
disease = "NASH"

In [None]:
dx = mix_w_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(LogisticRegression(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

# SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Hyper-parameter search space for SVC (passed to run_experiment).
# probability and random_state are one-element lists, i.e. fixed for every candidate.
tuned_parameters = {
    'C': [0.001, 0.01, 0.1, 2, 8, 32, 64, 128, 512, 1024, 2048],
    'probability': [True],
    'tol': [0.1, 0.01, 0.001, 0.0001],
    'random_state': [SEED],
    'gamma': ['scale', 'auto'],
}

In [None]:
# SVM setup: output-name tag "SVM" and the OHE data tag. Every SVM cell below
# takes dx from the ohe_*_dx matrices and there is no MIX subsection for SVM.
# The string literal that follows is the author's note; it is a no-op at runtime.
'''
for svm, only categorical features can be used due to model
'''
model_name = "SVM"
dty = "OHE"

## NAFLD

In [None]:
dy = pkl_load(f"{DATASET}/y_nafld.pkl")
disease = "NAFLD"

In [None]:
dx = ohe_w_homa_dx
run_experiment(SVC(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(SVC(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

## FIB

In [None]:
dy = pkl_load(f"{DATASET}/y_fib.pkl")
disease = "FIB"

In [None]:
dx = ohe_w_homa_dx
run_experiment(SVC(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(SVC(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

## NASH

In [None]:
dy = pkl_load(f"{DATASET}/y_nash.pkl")
disease = "NASH"

In [None]:
dx = ohe_w_homa_dx
run_experiment(SVC(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(SVC(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Hyper-parameter search space for DecisionTreeClassifier (passed to run_experiment).
# max_features uses 'sqrt' instead of the legacy alias 'auto': for classifiers
# both mean sqrt(n_features), but 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it fails parameter validation. 'sqrt' is accepted by
# old and new releases alike, so the candidates themselves are unchanged.
tuned_parameters = {
    'criterion': ['gini', 'entropy'],
    'random_state': [SEED],
    'max_depth': [3, 5, 10, 25, 50, None],
    'max_features': ['log2', 'sqrt', None],
    'min_samples_split': [2, 4],
    'splitter': ['best', 'random'],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced'],
    'min_impurity_decrease': [0.1, 0.00001, 0.0001, 0.001, 0.01],
}
# Tag used as the first component of saved model/result names.
model_name = "DT"

## OHE

In [None]:
dty = "OHE"

### NAFLD

In [None]:
dy = pkl_load(f"{DATASET}/y_nafld.pkl")
disease = "NAFLD"

In [None]:
dx = ohe_w_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### FIB

In [None]:
dy = pkl_load(f"{DATASET}/y_fib.pkl")
disease = "FIB"

In [None]:
dx = ohe_w_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### NASH

In [None]:
dy = pkl_load(f"{DATASET}/y_nash.pkl")
disease = "NASH"

In [None]:
dx = ohe_w_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

## MIX

In [None]:
dty = "MIX"

### NAFLD

In [None]:
dy = pkl_load(f"{DATASET}/y_nafld.pkl")
disease = "NAFLD"

In [None]:
dx = mix_w_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### FIB

In [None]:
dy = pkl_load(f"{DATASET}/y_fib.pkl")
disease = "FIB"

In [None]:
dx = mix_w_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### NASH

In [None]:
dy = pkl_load(f"{DATASET}/y_nash.pkl")
disease = "NASH"

In [None]:
dx = mix_w_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(DecisionTreeClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Tag used as the first component of saved model/result names.
model_name = "RF"

# Hyper-parameter search space for RandomForestClassifier (passed to run_experiment).
# max_features uses 'sqrt' instead of the legacy alias 'auto': for classifiers
# both mean sqrt(n_features), but 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it fails parameter validation. 'sqrt' is accepted by
# old and new releases alike, so the candidates themselves are unchanged.
tuned_parameters = {
    'n_estimators': [10, 50, 100, 500, 1000, 2000, 3000],
    'criterion': ['gini', 'entropy'],
    'random_state': [SEED],
    'max_features': ['log2', 'sqrt', None],
    'min_samples_split': [2, 4],
    'max_depth': [3, 5, 10, 25, 50, None],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
}

## OHE

In [None]:
dty = "OHE"

### NAFLD

In [None]:
dy = pkl_load(f"{DATASET}/y_nafld.pkl")
disease = "NAFLD"

In [None]:
dx = ohe_w_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### FIB

In [None]:
dy = pkl_load(f"{DATASET}/y_fib.pkl")
disease = "FIB"

In [None]:
dx = ohe_w_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### NASH

In [None]:
dy = pkl_load(f"{DATASET}/y_nash.pkl")
disease = "NASH"

In [None]:
dx = ohe_w_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

## MIX

In [None]:
dty = "MIX"

### NAFLD

In [None]:
dy = pkl_load(f"{DATASET}/y_nafld.pkl")
disease = "NAFLD"

In [None]:
dx = mix_w_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### FIB

In [None]:
dy = pkl_load(f"{DATASET}/y_fib.pkl")
disease = "FIB"

In [None]:
dx = mix_w_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### NASH

In [None]:
dy = pkl_load(f"{DATASET}/y_nash.pkl")
disease = "NASH"

In [None]:
dx = mix_w_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(RandomForestClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

# XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Tag used as the first component of saved model/result names.
model_name = "XGB"

# Hyper-parameter search space for XGBClassifier (passed to run_experiment).
# random_state and objective are one-element lists, i.e. fixed for every candidate.
tuned_parameters = {
    'random_state': [SEED],
    'max_depth': [3, 6, 9, 12, 15],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'gamma': [0, 0.5, 1],
    'subsample': [0.5, 0.75, 1],
    'colsample_bytree': [0.5, 0.75, 1],
    'reg_alpha': [0.5, 1, 2],
    'reg_lambda': [0.5, 1, 2],
    'objective': ['binary:logistic'],
    'tree_method': ['exact', 'approx', 'hist'],
}

## OHE

In [None]:
dty = "OHE"

### NAFLD

In [None]:
dy = pkl_load(f"{DATASET}/y_nafld.pkl")
disease = "NAFLD"

In [None]:
# Experiment: OHE features WITH HOMA, scored by ROC AUC.
# The string literal below is a no-op at runtime; it holds pasted log lines
# (selected XGBClassifier parameters and summary scores).
# NOTE(review): presumably from an earlier run of this cell — no output is
# stored for it, so the numbers cannot be checked here.
'''
2019-04-03 10:09:14 - INFO - XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=13, reg_alpha=2, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1, tree_method='hist',
       verbosity=0)
2019-04-03 10:09:14 - INFO - 0.8768044839368934
5-cv performance: 0.8750; sensitivity: 0.7572; specificity: 0.8425
20x5-fold cv mean: 0.8729 std: 0.0365 95% CI: (0.8656027431841128, 0.8801666506903788)
'''
# Select the already-loaded OHE matrix that includes HOMA (nothing is read from disk here).
dx = ohe_w_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### FIB

In [None]:
dy = pkl_load(f"{DATASET}/y_fib.pkl")
disease = "FIB"

In [None]:
dx = ohe_w_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### NASH

In [None]:
dy = pkl_load(f"{DATASET}/y_nash.pkl")
disease = "NASH"

In [None]:
dx = ohe_w_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = ohe_wo_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

## MIX

In [None]:
dty = "MIX"

### NAFLD

In [None]:
dy = pkl_load(f"{DATASET}/y_nafld.pkl")
disease = "NAFLD"

In [None]:
dx = mix_w_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### FIB

In [None]:
dy = pkl_load(f"{DATASET}/y_fib.pkl")
disease = "FIB"

In [None]:
dx = mix_w_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])

### NASH

In [None]:
dy = pkl_load(f"{DATASET}/y_nash.pkl")
disease = "NASH"

In [None]:
dx = mix_w_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="w", score_method='roc_auc')

In [None]:
dx = mix_wo_homa_dx
run_experiment(XGBClassifier(), tuned_parameters, dx, dy, 
               model_name, dty, disease, homa="wo", score_method='roc_auc')

In [None]:
d_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_w_HOMA_results.json")
d_no_homa = load_json(f"{RESULTS_DIR}/{model_name}_{dty}_{disease}_wo_HOMA_results.json")
two_results_t_test(d_homa['cv_scores'], d_no_homa['cv_scores'])