In [4]:
from predictor.preprocessing import DataLoader, DataPreprocessor
from predictor.features_creation import EthnicityEncoder, GenderBinaryEncoder, CategoricalEncoder
from predictor.prediction import DiabetesModel
from sklearn.model_selection import train_test_split

# End-to-end run: load the data, clean it, encode features, split, train the
# model and report its score on the held-out test set.

# Name of the label column. Defined once so the train/test split and the model
# cannot silently disagree about which column is the target.
target = 'diabetes_mellitus'

# Load the data
loader = DataLoader()
df = loader.load_data()

# Preprocessing: drop rows with missing values, then fill the remaining ones.
# (Which columns each step touches is decided inside DataPreprocessor.)
preprocessor = DataPreprocessor()
df_clean = preprocessor.remove_nan_rows(df)
df_clean = preprocessor.fill_nan_values(df_clean)

# Features creation: each encoder consumes the previous encoder's output.
ethnicity_encoder = EthnicityEncoder()
df_encoded = ethnicity_encoder.transform(df_clean)

gender_encoder = GenderBinaryEncoder()
df_encoded = gender_encoder.transform(df_encoded)

categorical_encoder = CategoricalEncoder()
df_final = categorical_encoder.transform(df_encoded)

# Divide between train and test sets: 30% held out for testing, with a fixed
# random_state so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_final.drop(columns=[target]),
    df_final[target],
    test_size=0.3,
    random_state=42,
)

# Train the model (described as a Random Forest Classifier in the original
# notes) on every column that remains after dropping the target.
features = list(X_train.columns)
model = DiabetesModel(features=features, target=target)
model.train(X_train, y_train)

# Print the predicted probabilities of having diabetes mellitus
# and the score returned by model.evaluate (reported as ROC-AUC).
# NOTE(review): y_pred is not used in this cell; it is kept in case later
# cells read it -- confirm before removing.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
roc_auc = model.evaluate(X_test, y_test)
print(f"Test ROC-AUC Score: {roc_auc}")
print("Test set predictions:")
print(y_pred_proba)

Test ROC-AUC Score: 0.8291771566526304
Test set predictions:
[0.42 0.64 0.35 ... 0.14 0.02 0.1 ]
