In [1]:
from preprocessing.preprocessing_label_encoding import  get_preprocessed_brfss_dataset_label_encoded_train_test_split, get_preprocessed_brfss_dataset_label_encoded_train_test_split_undersampled, get_preprocessed_brfss_dataset_label_encoded_train_test_split_oversampled

from sklearn.ensemble import RandomForestClassifier
from IPython.core.display_functions import display
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, fbeta_score, recall_score, precision_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
data_train, data_validation, data_test, target_train, target_validation, target_test = get_preprocessed_brfss_dataset_label_encoded_train_test_split(include_test_data=True)
cross_validation = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
data_train.head(10)

Unnamed: 0,GenHealth,PhysHealth,MentHealth,Healthcare,MedCost,Checkup,HighBP,HighChol,HeartAttack,AngiCoro,...,Height,Weight,BMI,Education,Alcohol,Smoking,FruitCons,VegetCons,PhysActivity,Muscles
241445,3.0,0.0,0.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,...,0.855895,0.469484,4.0,3.0,1.0,3.0,2.0,1.0,3.0,2.0
107433,3.0,0.0,0.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,...,0.764192,0.392789,4.0,3.0,1.0,4.0,2.0,1.0,4.0,2.0
255533,2.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,...,0.855895,0.445989,4.0,3.0,1.0,4.0,1.0,2.0,3.0,1.0
31855,1.0,0.0,0.0,1.0,2.0,1.0,3.0,2.0,2.0,2.0,...,0.68559,0.20345,2.0,4.0,1.0,4.0,1.0,1.0,3.0,1.0
76133,2.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,...,0.69869,0.219079,2.0,4.0,1.0,4.0,1.0,1.0,1.0,1.0
93828,2.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,...,0.799127,0.289495,3.0,3.0,1.0,2.0,2.0,1.0,1.0,2.0
71236,2.0,0.0,0.0,1.0,2.0,2.0,3.0,2.0,2.0,2.0,...,0.742358,0.258203,3.0,2.0,1.0,4.0,2.0,1.0,1.0,2.0
212026,3.0,0.1,0.0,1.0,2.0,1.0,3.0,2.0,2.0,2.0,...,0.69869,0.234742,3.0,3.0,1.0,1.0,2.0,1.0,2.0,1.0
169431,3.0,1.0,0.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,...,0.777293,0.33645,4.0,4.0,1.0,3.0,2.0,2.0,3.0,2.0
60906,1.0,0.0,0.0,1.0,2.0,2.0,3.0,2.0,2.0,2.0,...,0.71179,0.211247,2.0,3.0,1.0,3.0,1.0,1.0,1.0,2.0


This function will be used several times later to evaluate the model with the different parameters

In [4]:
def validate_random_forest_classifier(estimator: RandomForestClassifier):
    estimator.fit(data_train, target_train)
    predictions = estimator.predict(data_validation)
    return accuracy_score(target_validation, predictions), fbeta_score(target_validation, predictions, beta=2), precision_score(target_validation, predictions), recall_score(target_validation, predictions)


### Parameter search
First we search for the best max depth

In [25]:
max_depth_parameters = range(2, 28)
for param in max_depth_parameters:
    estimator = RandomForestClassifier(n_jobs=-1, class_weight="balanced", max_depth=param)
    accuracy, f_score, precision, recall = validate_random_forest_classifier(estimator)
    print(f"Max depth {param} --> Accuracy: {accuracy}, F-Score: {f_score}, Precision: {precision}, Recall: {recall}")

Max depth 2 --> Accuracy: 0.72906526450421, F-Score: 0.5817803227485685, Precision: 0.29060273545166154, Recall: 0.7762189192943464
Max depth 3 --> Accuracy: 0.7332397768856361, F-Score: 0.5872826813774656, Precision: 0.2949108079748164, Recall: 0.7808028892901792
Max depth 4 --> Accuracy: 0.7330443741784204, F-Score: 0.5895424291109396, Precision: 0.29538445455020645, Recall: 0.7849701347409362
Max depth 5 --> Accuracy: 0.7261875155433971, F-Score: 0.597120822622108, Precision: 0.2928539008522871, Recall: 0.8066398110848729
Max depth 6 --> Accuracy: 0.7336305823000675, F-Score: 0.5969419596421884, Precision: 0.2977586385804711, Recall: 0.7971940547298236
Max depth 7 --> Accuracy: 0.7340036238320248, F-Score: 0.5999709054635384, Precision: 0.29881488381721266, Recall: 0.8020558410890402
Max depth 8 --> Accuracy: 0.7345543041887235, F-Score: 0.6049290515309934, Precision: 0.3004946413849959, Recall: 0.8101125156271705
Max depth 9 --> Accuracy: 0.7407006075247806, F-Score: 0.602880744716

In [26]:
criterion_parameter = ["gini", "entropy", "log_loss"]
for param in criterion_parameter:
    estimator = RandomForestClassifier(n_jobs=-1, class_weight="balanced", max_depth=8, criterion=param)
    accuracy, f_score, precision, recall = validate_random_forest_classifier(estimator)
    print(f"Criterion {param} --> Accuracy: {accuracy}, F-Score: {f_score}, Precision: {precision}, Recall: {recall}")

Criterion gini --> Accuracy: 0.7364905673784062, F-Score: 0.6047237540318385, Precision: 0.30178098551326654, Recall: 0.8073343519933324
Criterion entropy --> Accuracy: 0.7365438590258286, F-Score: 0.6012842965557502, Precision: 0.30091819699499167, Recall: 0.8012223919988888
Criterion log_loss --> Accuracy: 0.7345720680711977, F-Score: 0.6026586353723127, Precision: 0.2999121401622823, Recall: 0.8060841783581053


In [28]:
number_of_estimators_parameter = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
for param in number_of_estimators_parameter:
    estimator = RandomForestClassifier(n_jobs=-1, class_weight="balanced", max_depth=8, criterion="gini" ,n_estimators=param)
    accuracy, f_score, precision, recall = validate_random_forest_classifier(estimator)
    print(f"Number of estimators {param} --> Accuracy: {accuracy}, F-Score: {f_score}, Precision: {precision}, Recall: {recall}")

Number of estimators 50 --> Accuracy: 0.7385156499804597, F-Score: 0.6025204288490879, Precision: 0.3026295071642261, Recall: 0.8009445756355049
Number of estimators 100 --> Accuracy: 0.737964969623761, F-Score: 0.6016545500125345, Precision: 0.30201342281879195, Recall: 0.8001111265453535
Number of estimators 150 --> Accuracy: 0.7365083312608803, F-Score: 0.6032364211929356, Precision: 0.3014047866805411, Recall: 0.8046950965411863
Number of estimators 200 --> Accuracy: 0.7361885813763456, F-Score: 0.6034841610123632, Precision: 0.30124675324675326, Recall: 0.8055285456313377
Number of estimators 250 --> Accuracy: 0.7354424983124311, F-Score: 0.6035880592050558, Precision: 0.30075616324839444, Recall: 0.8066398110848729
Number of estimators 300 --> Accuracy: 0.737964969623761, F-Score: 0.6029227557411273, Precision: 0.30234505862646566, Recall: 0.8023336574524239
Number of estimators 350 --> Accuracy: 0.7359043592567591, F-Score: 0.6033621837549935, Precision: 0.3010172306414781, Reca

In [31]:
min_samples_split_parameter = range(5, 500)
best_param = [0,0]
for param in min_samples_split_parameter:
    estimator = RandomForestClassifier(n_jobs=-1, class_weight="balanced", max_depth=8, criterion="gini", n_estimators=400 ,min_samples_split=param)
    accuracy, f_score, precision, recall = validate_random_forest_classifier(estimator)
    if f_score > best_param[0]:
        best_param[0] = f_score
        best_param[1] = param
    print(f"Minimum samples for split {param} --> Accuracy: {accuracy}, F-Score: {f_score}, Precision: {precision}, Recall: {recall}")

Minimum samples for split 5 --> Accuracy: 0.7364195118485096, F-Score: 0.6027785299202266, Precision: 0.3012230028623471, Recall: 0.8040005556327268
Minimum samples for split 6 --> Accuracy: 0.7371655949124241, F-Score: 0.6025943150299266, Precision: 0.3016966849386583, Recall: 0.8027503819974997
Minimum samples for split 7 --> Accuracy: 0.7366859700856219, F-Score: 0.6029669139094924, Precision: 0.30145833333333333, Recall: 0.8040005556327268
Minimum samples for split 8 --> Accuracy: 0.7372011226773724, F-Score: 0.6034896083050176, Precision: 0.30195567144719687, Recall: 0.8042783719961105
Minimum samples for split 9 --> Accuracy: 0.7376629836217004, F-Score: 0.603263023930233, Precision: 0.30222106088319833, Recall: 0.8033060147242672
Minimum samples for split 10 --> Accuracy: 0.7378583863289161, F-Score: 0.603322342334822, Precision: 0.30237422863717184, Recall: 0.8031671065425754
Minimum samples for split 11 --> Accuracy: 0.7376629836217004, F-Score: 0.6038959684651803, Precision: 

In [32]:
print(f"Best min_samples_split: {best_param[1]} with f2-score {best_param[0]}")

Best min_samples_split: 204 with f2-score 0.6062428704759928


In [34]:
min_samples_leaf_parameter = range(5, 500)
best_param = [0,0]
for param in min_samples_leaf_parameter:
    estimator = RandomForestClassifier(n_jobs=-1, class_weight="balanced", max_depth=8, criterion="gini", n_estimators=400, min_samples_split= 204, min_samples_leaf=param)
    accuracy, f_score, precision, recall = validate_random_forest_classifier(estimator)
    if f_score > best_param[0]:
        best_param[0] = f_score
        best_param[1] = param
    print(f"Minimum samples for leaf {param} --> Accuracy: {accuracy}, F-Score: {f_score}, Precision: {precision}, Recall: {recall}")

Minimum samples for leaf 5 --> Accuracy: 0.73693466444026, F-Score: 0.6044070479443496, Precision: 0.30200853366635444, Recall: 0.8062230865397972
Minimum samples for leaf 6 --> Accuracy: 0.7335062351227484, F-Score: 0.6033249725337368, Precision: 0.29935716122396505, Recall: 0.8085845256285595
Minimum samples for leaf 7 --> Accuracy: 0.735655664902121, F-Score: 0.6022391476255878, Precision: 0.30055042060442416, Recall: 0.8040005556327268
Minimum samples for leaf 8 --> Accuracy: 0.7357977759619142, F-Score: 0.6049426338543399, Precision: 0.30135638848622903, Recall: 0.8085845256285595
Minimum samples for leaf 9 --> Accuracy: 0.7354424983124311, F-Score: 0.6045337433510638, Precision: 0.3010035174839644, Recall: 0.8083067092651757
Minimum samples for leaf 10 --> Accuracy: 0.7343233737165595, F-Score: 0.6030348950657007, Precision: 0.2998400165144243, Recall: 0.8070565356299486
Minimum samples for leaf 11 --> Accuracy: 0.7357444843144918, F-Score: 0.6030124617721098, Precision: 0.300814

In [36]:
print(f"Best min_samples_leaf: {best_param[1]} with f2-score {best_param[0]}")

Best min_samples_leaf: 8 with f2-score 0.6049426338543399


Validate best model against undersampling/oversampling

In [41]:
data_train, data_validation, data_test, target_train, target_validation, target_test = get_preprocessed_brfss_dataset_label_encoded_train_test_split_undersampled(include_test_data=True)
estimator = RandomForestClassifier(n_jobs=-1, max_depth=8, criterion="gini", n_estimators=400, min_samples_split= 204, min_samples_leaf=8)
accuracy, f_score, precision, recall = validate_random_forest_classifier(estimator)
print(f"Best model with undersampling --> Accuracy: {accuracy}, F-Score: {f_score}, Precision: {precision}, Recall: {recall}")

Best model with undersampling --> Accuracy: 0.7276796816712261, F-Score: 0.601439588688946, Precision: 0.29497201069141155, Recall: 0.8124739547159328


In [42]:
data_train, data_validation, data_test, target_train, target_validation, target_test = get_preprocessed_brfss_dataset_label_encoded_train_test_split_oversampled(include_test_data=True)
estimator = RandomForestClassifier(n_jobs=-1, max_depth=8, criterion="gini", n_estimators=400, min_samples_split= 204, min_samples_leaf=8)
accuracy, f_score, precision, recall = validate_random_forest_classifier(estimator)
print(f"Best model with oversampling --> Accuracy: {accuracy}, F-Score: {f_score}, Precision: {precision}, Recall: {recall}")

Best model with oversampling --> Accuracy: 0.7325647493516183, F-Score: 0.6046251861658116, Precision: 0.2990587272355228, Recall: 0.8120572301708571


Validate model on test data

In [43]:
data_train, data_validation, data_test, target_train, target_validation, target_test = get_preprocessed_brfss_dataset_label_encoded_train_test_split(include_test_data=True)

estimator = RandomForestClassifier(n_jobs=-1, class_weight="balanced", max_depth=8, criterion="gini", n_estimators=400, min_samples_split= 204, min_samples_leaf=8)
estimator.fit(data_train, target_train)
predictions = estimator.predict(data_test)

accuracy, f_score, precision, recall = accuracy_score(target_test, predictions), fbeta_score(target_test, predictions, beta=2), precision_score(target_test, predictions), recall_score(target_test, predictions)
print(f"Model performance on test data --> Accuracy: {accuracy}, F-Score: {f_score}, Precision: {precision}, Recall: {recall}")

Model performance on test data --> Accuracy: 0.7345412558841815, F-Score: 0.6011349227795215, Precision: 0.2994873919121835, Recall: 0.8034449229059591
