In [1]:
# Import functions from diabetes_predictor library
from diabetes_predictor.data_preprocessing import load_data, clean_data, encode_ethnicity, binary_variables_gender
from diabetes_predictor.model_and_prediction import split_data, train_model, predict, calculate_accuracy, predict_proba, calculate_roc_auc

In [3]:

# ---------------------------------------------------------------------------
# End-to-end run: preprocess -> split -> train -> evaluate.
# Every helper called here is imported from the diabetes_predictor package;
# what each one does internally is not visible in this notebook.
# ---------------------------------------------------------------------------

# 1. Preprocessing, applied innermost-first:
#    load_data -> clean_data -> encode_ethnicity -> binary_variables_gender.
#    NOTE(review): clean_data presumably drops NaNs / mean-imputes missing
#    values and encode_ethnicity presumably one-hot encodes 'ethnicity' --
#    confirm against the library.
df = binary_variables_gender(
    encode_ethnicity(
        clean_data(
            load_data()
        )
    )
)

# 2. Train / test partition of the prepared dataframe.
X_train, X_test, y_train, y_test = split_data(df)

# 3. Columns fed to the model; both partitions are restricted to this subset.
features = [
    'age',
    'height',
    'weight',
    'aids',
    'cirrhosis',
    'hepatic_failure',
    'immunosuppression',
    'leukemia',
    'lymphoma',
    'solid_tumor_with_metastasis',
]
X_train_selected, X_test_selected = X_train[features], X_test[features]

# 4. Fit on the selected training columns.
#    NOTE(review): presumably a Random Forest Classifier -- confirm in
#    diabetes_predictor.model_and_prediction.
model = train_model(X_train_selected, y_train)

# 5. Predictions on the held-out rows and the resulting accuracy.
y_pred = predict(model, X_test_selected)
accuracy = calculate_accuracy(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# 6. Predicted probabilities, stored as an extra column on each partition.
#    This writes into X_train / X_test themselves; the *_selected frames
#    were taken before this point.
train_proba = predict_proba(model, X_train_selected)
X_train['predictions_train'] = train_proba

test_proba = predict_proba(model, X_test_selected)
X_test['predictions_test'] = test_proba

# 7. Preview the first rows of each probability column
#    (header line, then the frame, each on its own line).
print("Train set predictions:", X_train[['predictions_train']].head(), sep="\n")
print("Test set predictions:", X_test[['predictions_test']].head(), sep="\n")

# 8. ROC AUC on both partitions, train first and then test.
#    NOTE(review): the output recorded with this notebook shows a large
#    train/test gap (0.9983 vs 0.6025), which suggests overfitting -- worth
#    investigating before relying on the model.
train_roc_auc, test_roc_auc = (
    calculate_roc_auc(model, X_train_selected, y_train),
    calculate_roc_auc(model, X_test_selected, y_test),
)

print(f'Train ROC AUC: {train_roc_auc:.4f}')
print(f'Test ROC AUC: {test_roc_auc:.4f}')


Accuracy: 0.7264
Train set predictions:
      predictions_train
952            0.748524
4283           0.190000
5521           0.070000
9655           0.110000
3549           0.110000
Test set predictions:
      predictions_test
136              0.430
4882             0.260
2313             0.050
1886             0.066
3000             0.520
Train ROC AUC: 0.9983
Test ROC AUC: 0.6025
