In [1]:
'''
This script reads a .csv file, trains a model, and computes the Area Under the
Receiver Operating Characteristic Curve (ROC AUC) for the train and test sets.
'''

from classfunc import *

# Path of the input CSV file (despite the name, this is a file, not a directory).
data_dir = 'sample_diabetes_mellitus_data.csv'

# Read the CSV into a dataframe.
df = read_data(data_dir)

# Clean the dataframe: handle missing values in the demographic columns first,
# then in the body-measurement columns (exact strategy lives in classfunc).
print('CLEANING...')
df_cleaned = drop_nan(df, ['age', 'gender', 'ethnicity'])
df_cleaned = fill_nan(df_cleaned, ['height', 'weight'])

# Create one-hot encoding for ethnicity and a binary dummy for gender.
print('CREATING DUMMIES...')
df_cleaned_ohe = gen_ohe(df_cleaned, ['ethnicity'])
df_cleaned_ohe_bin = gen_binary(df_cleaned_ohe, ['gender'])

# Train the model (a RandomForestClassifier, per the original comment).
# NOTE(review): none of the columns listed below refer to the ethnicity or
# gender encodings created above, so those appear to be unused by the model
# -- confirm whether they should be added to feature_cols.
print('TRAINING...')
feature_cols = [
    'age',
    'height',
    'weight',
    'aids',
    'cirrhosis',
    'hepatic_failure',
    'immunosuppression',
    'leukemia',
    'lymphoma',
    'solid_tumor_with_metastasis',
]
target_cols = ['diabetes_mellitus']
results = train_model(df_cleaned_ohe_bin, feature_cols, target_cols)

# Get the ROC AUC scores; index 0 is reported as the train score and
# index 1 as the test score.
# NOTE(review): a recorded run printed a train score of ~0.08, far below 0.5
# and below the test score, which is unusual -- verify get_roc_auc_score.
print('SCORING...')
scores = get_roc_auc_score(results)

# Display results. Labels previously read "Train test scores" /
# "Test test scores", which mislabelled the two values.
print('Train set score : {}'.format(scores[0]))
print('Test set score : {} \nMODEL COMPLETE'.format(scores[1]))


CLEANING...
CREATING DUMMIES...
TRAINING...
SCORING...
Train test scores : 0.08160434621341568
Test test scores : 0.40688859963548424 
MODEL COMPLETE
