## Import modules

In [None]:
import pdb
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import nhanes as nhanes

%matplotlib notebook

## Settings

In [None]:
# Which NHANES task to run; one of:
#   'diabetes', 'hypertension', 'heart', 'arthritis'
DATASET = 'diabetes'
# Root directory handed to the nhanes loaders.
DATA_PATH = '/home/mohammad/Database/CDC/NHANES/'

### Note: 
The code below loads the selected dataset into `dataset_features`, `dataset_targets`, and `dataset_costs`.

Here, all datasets are defined explicitly (see nhanes.py) except heart, which uses the automated variable selection method.

Running the heart experiment for the first time takes longer; however, it produces a cache file that speeds up subsequent runs. The cache file can also be used to extract the merged variable data instead of using the raw files.

In [None]:
# Load the dataset selected by DATASET and expose it through the names
# used by the rest of the notebook:
#   dataset_features, dataset_targets, dataset_costs, n_fe, n_classes
if DATASET in ('diabetes', 'hypertension', 'arthritis'):
    # Explicitly defined datasets all go through nhanes.Dataset and only
    # differ in the loader method and the number of target classes.
    ds = nhanes.Dataset(DATA_PATH)
    if DATASET == 'diabetes':
        ds.load_diabetes()
        n_classes = 3
    elif DATASET == 'hypertension':
        ds.load_hypertension()
        n_classes = 2
    else:
        ds.load_arthritis()
        n_classes = 2
    dataset_features = ds.features
    dataset_targets = ds.targets
elif DATASET == 'heart':
    # 'heart' uses the automated variable selection path (nhanes.NHANES).
    ds = nhanes.NHANES(DATA_PATH, None)
    ds.index(False)
    # NOTE(review): presumably the maximum tolerated missing-value ratio and
    # the minimum mutual information for keeping a variable -- confirm
    # against nhanes.py.
    MISSING_THRESHOLD = 0.05
    MUINFO_THRESHOLD = 0.0
    target_col = ['MCQ160B', 'MCQ160C', 'MCQ160E', 'MCQ160F']
    # The targets themselves plus the MCQ180* columns are passed as the
    # exclusion list so they are not used as input features.
    exclude_cols = target_col + ['MCQ180B', 'MCQ180C', 'MCQ180E', 'MCQ180F']
    include_cols = None

    def fn_any(df_cols, threshold):
        """Return 1 for each row with any column below ``threshold``, else 0."""
        # `np.int` was removed in NumPy 1.24; the builtin `int` is the
        # dtype it used to alias.
        return (df_cols < threshold).any(axis=1).astype(int)

    ds.process_supervised(target_col, exclude_cols, include_cols,
                          fn_any, 1.5,
                          MISSING_THRESHOLD, MUINFO_THRESHOLD)
    n_classes = 2
    dataset_features = ds.df_features.values
    dataset_targets = ds.df_targets.values
else:
    # Fail here instead of with a NameError in a later cell.
    raise ValueError(
        "Unknown DATASET {!r}; expected one of 'diabetes', 'hypertension', "
        "'heart', 'arthritis'".format(DATASET))

# Number of feature columns, and the costs reshaped into a single row.
n_fe = dataset_features.shape[1]
dataset_costs = ds.costs.reshape(1, -1)

## Train/Test Separation

In [None]:
# Shuffle features and targets with one shared random permutation so the
# positional train/validation/test split is not tied to the stored row order.
perm = np.random.permutation(dataset_targets.shape[0])
dataset_features, dataset_targets = dataset_features[perm], dataset_targets[perm]

def get_batch(n_size, phase):
    """Draw a random batch with an equal per-class quota from one data split.

    Reads the module-level ``dataset_features`` and ``dataset_targets``
    arrays, which are split by row position: the first 15% of rows form the
    test split, the next 15% the validation split and the remaining rows the
    training split.

    Targets are expected to be class labels ``0 .. K-1``; the number of
    classes is taken as ``dataset_targets.max() + 1``.

    Args:
        n_size: Requested batch size. Every class contributes at most
            ``n_size // n_classes`` samples, so the batch is smaller than
            ``n_size`` when a class has fewer samples in the split (and can
            be empty when ``n_size < n_classes``).
        phase: One of ``'train'``, ``'validation'`` or ``'test'``.

    Returns:
        Tuple ``(features, targets)`` holding the selected rows in random
        order.

    Raises:
        NotImplementedError: If ``phase`` is not a known split name.
    """
    n_samples = dataset_features.shape[0]
    n_classes = int(dataset_targets.max() + 1)

    # Positional split boundaries (15% test / 15% validation / 70% train).
    test_end = int(n_samples*0.15)
    validation_end = int(n_samples*0.30)
    if phase == 'test':
        inds_sel = np.arange(0, test_end, 1)
    elif phase == 'validation':
        inds_sel = np.arange(test_end, validation_end, 1)
    elif phase == 'train':
        inds_sel = np.arange(validation_end, n_samples, 1)
    else:
        raise NotImplementedError(
            "unknown phase {!r}; expected 'train', 'validation' or "
            "'test'".format(phase))

    # Randomize the order so the per-class picks below are a random subset.
    inds_sel = np.random.permutation(inds_sel)
    per_class = n_size // n_classes
    batch_inds = []
    for cl in range(n_classes):
        inds_cl = inds_sel[dataset_targets[inds_sel] == cl]
        batch_inds.extend(inds_cl[:per_class])

    # Force an integer dtype: an empty list would otherwise turn into a
    # float64 array, which NumPy refuses to use as an index.
    batch_inds = np.random.permutation(np.asarray(batch_inds, dtype=np.intp))

    return dataset_features[batch_inds], dataset_targets[batch_inds]
    
# Draw one training batch and one test batch; get_batch caps every class at
# n_size // n_classes samples, so the batches may be smaller than requested.
features_trn, targets_trn = get_batch(phase='train', n_size=5000)
features_tst, targets_tst = get_batch(phase='test', n_size=1000)

## Classification

In [None]:
# Fit each baseline classifier on the training batch and report its accuracy
# on the test batch. After the loop, clf / preds_tst / accu hold the results
# of the last model (logistic regression).
for label, clf in (
        ('accu_tst_RFC', RandomForestClassifier(n_estimators=100)),
        ('accu_tst_SVC', SVC()),
        ('accu_tst_LR', LogisticRegression()),
):
    clf.fit(features_trn, targets_trn)
    preds_tst = clf.predict(features_tst)
    # Fraction of test samples whose predicted label matches the target.
    accu = np.mean(preds_tst == targets_tst)
    print(label, accu)
