In [1]:
## Reading the preprocessed dataset 
import pandas as pd
df = pd.read_csv(r'/Users/noortje/Documents/880502-M-18 | Master Thesis/Python/Dataset RQ4.csv')

## Reviewing the loaded dataset
df

Unnamed: 0,male,education,distance_store,health_status
0,805.0,0.093,0.269,0
1,1213.0,0.281,0.601,0
2,930.0,0.299,0.310,0
3,913.0,0.190,0.396,0
4,788.0,0.296,0.222,0
...,...,...,...,...
4831,989.0,0.138,0.418,1
4832,856.0,0.093,0.299,1
4833,729.0,0.351,0.775,1
4834,863.0,0.257,0.256,1


In [2]:
## RANDOM FOREST: IMPLEMENTATION FOR RQ4
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, balanced_accuracy_score
from sklearn.preprocessing import RobustScaler
import numpy as np

## Assign data to X and Y
X = df.drop('health_status', axis=1).values
Y = df['health_status'].values

## Splitting the data into a training and testing set
X_main, X_test, y_main, y_test = train_test_split(X, Y, 
test_size = 0.37, random_state = 101)

## Splitting the main set into a training and validation set
X_train, X_val, y_train, y_val = train_test_split(X_main, y_main, 
test_size = 0.23, random_state = 101)

## Feature scaling using RobustScaler to put all features to the same scale
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Assigning sample weights
weights = np.zeros(len(y_train))
weights[y_train == 1] = 0.5
weights[y_train == 0] = 0.5

## Defining and fitting the model
model = RandomForestClassifier(random_state=101, n_estimators = 60,
min_samples_split = 70, min_samples_leaf = 40, max_samples = 100,
max_features = 'sqrt', max_depth = 10)

model.fit(X_train_scaled, y_train, sample_weight = weights)

## Making predictions for every partition of the dataset 
pred_val = model.predict(X_val_scaled)
pred_train = model.predict(X_train_scaled)

## Evaluating the model performance
print('Balanced accuracy on validation data =', balanced_accuracy_score(y_val, pred_val))
print('Recall-score (positive class) on validation data =', recall_score(y_val, pred_val, pos_label = 1))
print('----------------------------------------------------------------------------------------------')
print('Checking for model overfitting by computing the evaluation metric on training data:')
print('-----------------')
print('Balanced accuracy on training data =', balanced_accuracy_score(y_train, pred_train))
print('Recall-score (positive class) on training data =', recall_score(y_train, pred_train, pos_label = 1))


Balanced accuracy on validation data = 0.5573513610315186
Recall-score (positive class) on validation data = 0.65625
----------------------------------------------------------------------------------------------
Checking for model overfitting by computing the evaluation metric on training data:
-----------------
Balanced accuracy on training data = 0.5666938782637546
Recall-score (positive class) on training data = 0.6775818639798489


In [3]:
## FINAL TRAINING ITERATION FOR RF - RQ4
## Feature scaling using RobustScaler to put all features to the same scale
scaler = RobustScaler()
X_main_scaled = scaler.fit_transform(X_main)
X_test_scaled = scaler.transform(X_test)

## Assigning sample weights
weights = np.zeros(len(y_train))
weights[y_train == 1] = 0.5
weights[y_train == 0] = 0.5

## Defining and fitting the model
model = RandomForestClassifier(random_state=101, n_estimators = 60,
min_samples_split = 70, min_samples_leaf = 40, max_samples = 100,
max_features = 'sqrt', max_depth = 10)

model.fit(X_train_scaled, y_train, sample_weight = weights)

## Making predictions for the testing partition of the dataset
pred_test = model.predict(X_test_scaled)
pred_train = model.predict(X_main_scaled)

## Unbiased estimate on unseen data
print('Checking the final model performance by computing the evaluation metric on unseen testing data:')
print('-----------------')
print('Balanced accuracy on testing data =', balanced_accuracy_score(y_test, pred_test))
print('Recall-score (positive class) on testing data =', recall_score(y_test, pred_test, pos_label = 1))
print('-----------------')
print('Balanced accuracy on training data =', balanced_accuracy_score(y_main, pred_train))
print('Recall-score (positive class) on training data =', recall_score(y_main, pred_train, pos_label = 1))

Checking the final model performance by computing the evaluation metric on unseen testing data:
-----------------
Balanced accuracy on testing data = 0.5305074160811866
Recall-score (positive class) on testing data = 0.6457142857142857
-----------------
Balanced accuracy on training data = 0.5635872346902652
Recall-score (positive class) on training data = 0.6714193130265717


In [55]:
## HYPERPARAMETER TUNING PROCESS FOR RQ4 - RANDOM FOREST
## Source of code: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74 
## Random search iteration 1: 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import numpy as np

## Creating the random grid
random_grid = {'n_estimators': [int(x) for x in np.linspace(start=50, stop = 500, num = 10)],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [int(x) for x in np.linspace(start=3, stop = 15, num = 5)],
            'min_samples_split': [int(x) for x in np.linspace(start=1, stop = 50, num = 5)],
            'min_samples_leaf': [int(x) for x in np.linspace(start=1, stop = 50, num = 10)],
            'max_samples': [int(x) for x in np.linspace(start=100, stop = 1000, num = 10)]}

## Creating the model to tune
rf = RandomForestClassifier(random_state=101)

## Random search procedure of parameters, using 3 fold cross validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=101, n_jobs = -1)

## Fitting the random search model to the previously defined resampled dataset using SMOTE
rf_random.fit(X_train_scaled, y_train, sample_weight = weights)

## Retrieving the best model parameters according to this iteration
rf_random.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=15, max_features=log2, max_samples=200, min_samples_leaf=44, min_samples_split=13, n_estimators=500; total time=   2.5s
[CV] END max_depth=15, max_features=log2, max_samples=200, min_samples_leaf=44, min_samples_split=13, n_estimators=500; total time=   2.5s
[CV] END max_depth=15, max_features=log2, max_samples=200, min_samples_leaf=44, min_samples_split=13, n_estimators=500; total time=   2.4s
[CV] END max_depth=15, max_features=log2, max_samples=200, min_samples_leaf=44, min_samples_split=13, n_estimators=500; total time=   2.5s
[CV] END max_depth=6, max_features=log2, max_samples=600, min_samples_leaf=44, min_samples_split=37, n_estimators=100; total time=   0.6s
[CV] END max_depth=6, max_features=log2, max_samples=600, min_samples_leaf=44, min_samples_split=37, n_estimators=100; total time=   0.6s
[CV] END max_depth=6, max_features=log2, max_samples=600, min_samples_leaf=44, min_samples_split=37, n_esti

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/noortje/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/noortje/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/Users/noortje/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/noortje/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch

{'n_estimators': 300,
 'min_samples_split': 50,
 'min_samples_leaf': 17,
 'max_samples': 200,
 'max_features': 'log2',
 'max_depth': 3}

In [20]:
## HYPERPARAMETER TUNING PROCESS FOR RQ4 - RANDOM FOREST
## Source of code: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74 
## Random search iteration 2: 
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import numpy as np

## Creating the random grid
random_grid = {'n_estimators': [int(x) for x in np.linspace(start=200, stop = 800, num = 10)],
            'max_features': ['log2'],
            'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11],
            'min_samples_split': [int(x) for x in np.linspace(start=20, stop = 100, num = 5)],
            'min_samples_leaf': [int(x) for x in np.linspace(start=10, stop = 50, num = 5)],
            'max_samples': [int(x) for x in np.linspace(start=100, stop = 600, num = 10)]}

## Creating the model to tune
rf = RandomForestClassifier(random_state=101)

## Random search procedure of parameters, using 3 fold cross validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=101, n_jobs = -1)

## Fitting the random search model to the previously defined resampled dataset using SMOTE
rf_random.fit(X_train_scaled, y_train)

## Retrieving the best model parameters according to this iteration
rf_random.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=8, max_features=log2, max_samples=211, min_samples_leaf=40, min_samples_split=20, n_estimators=266; total time=   1.8s
[CV] END max_depth=8, max_features=log2, max_samples=211, min_samples_leaf=40, min_samples_split=20, n_estimators=266; total time=   1.9s
[CV] END max_depth=8, max_features=log2, max_samples=211, min_samples_leaf=40, min_samples_split=20, n_estimators=266; total time=   1.9s
[CV] END max_depth=8, max_features=log2, max_samples=211, min_samples_leaf=40, min_samples_split=20, n_estimators=266; total time=   1.9s
[CV] END max_depth=10, max_features=log2, max_samples=155, min_samples_leaf=40, min_samples_split=80, n_estimators=266; total time=   2.1s
[CV] END max_depth=8, max_features=log2, max_samples=211, min_samples_leaf=40, min_samples_split=20, n_estimators=266; total time=   2.1s
[CV] END max_depth=10, max_features=log2, max_samples=155, min_samples_leaf=40, min_samples_split=80, n_estima

{'n_estimators': 266,
 'min_samples_split': 20,
 'min_samples_leaf': 40,
 'max_samples': 211,
 'max_features': 'log2',
 'max_depth': 8}

In [31]:
## GRADIENT BOOSTED DECISION TREE: IMPLEMENTATION FOR RQ4
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
import numpy as np

## Assign data to X and Y
X = df.drop('health_status', axis=1).values
Y = df['health_status'].values

## Splitting the data into a training and testing set
X_main, X_test, y_main, y_test = train_test_split(X, Y, 
test_size = 0.37, random_state = 101)

## Splitting the main set into a training and validation set
X_train, X_val, y_train, y_val = train_test_split(X_main, y_main, 
test_size = 0.23, random_state = 101)

## Feature scaling using RobustScaler to put all features to the same scale
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Assigning sample weights
weights = np.zeros(len(y_train))
weights[y_train == 1] = 0.5
weights[y_train == 0] = 0.5

## Defining and fitting the model
model = GradientBoostingClassifier(random_state=101, subsample = 0.1,
n_estimators = 50, min_samples_split = 40, min_samples_leaf = 50, max_depth = 3,
learning_rate = 0.8)

model.fit(X_train_scaled, y_train, sample_weight = weights)

## Making predictions for every partition of the dataset 
pred_val = model.predict(X_val)
pred_train = model.predict(X_train)

## Evaluating the model performance
print('Balanced accuracy on validation data =', balanced_accuracy_score(y_val, pred_val))
print('Recall-score (positive class) on validation data =', recall_score(y_val, pred_val, pos_label = 1))
print('----------------------------------------------------------------------------------------------')
print('Checking for model overfitting by computing the evaluation metric on training data:')
print('-----------------')
print('Balanced accuracy on training data =', balanced_accuracy_score(y_train, pred_train))
print('Recall-score (positive class) on training data =', recall_score(y_train, pred_train, pos_label = 1))


Balanced accuracy on validation data = 0.5284742120343839
Recall-score (positive class) on validation data = 0.375
----------------------------------------------------------------------------------------------
Checking for model overfitting by computing the evaluation metric on training data:
-----------------
Balanced accuracy on training data = 0.504271638676556
Recall-score (positive class) on training data = 0.35516372795969775


In [32]:
## FINAL TRAINING ITERATION FOR GBDT - RQ4
## Feature scaling using RobustScaler to put all features to the same scale
scaler = RobustScaler()
X_main_scaled = scaler.fit_transform(X_main)
X_test_scaled = scaler.transform(X_test)

## Assigning sample weights
weights = np.zeros(len(y_main))
weights[y_main == 1] = 0.5
weights[y_main == 0] = 0.5

## Defining and fitting the model
model = GradientBoostingClassifier(random_state=101, subsample = 0.1,
n_estimators = 50, min_samples_split = 40, min_samples_leaf = 50, max_depth = 3,
learning_rate = 0.8)

model.fit(X_main_scaled, y_main, sample_weight = weights)

## Making predictions for the testing partition of the dataset
pred_test = model.predict(X_test_scaled)
pred_train = model.predict(X_main_scaled)

## Unbiased estimate on unseen data
print('Checking the final model performance by computing the evaluation metric on unseen testing data:')
print('-----------------')
print('Balanced accuracy on testing data =', balanced_accuracy_score(y_test, pred_test))
print('Recall-score (positive class) on testing data =', recall_score(y_test, pred_test, pos_label = 1))
print('-----------------')
print('Balanced accuracy on training data =', balanced_accuracy_score(y_main, pred_train))
print('Recall-score (positive class) on training data =', recall_score(y_main, pred_train, pos_label = 1))

Checking the final model performance by computing the evaluation metric on unseen testing data:
-----------------
Balanced accuracy on testing data = 0.5120936768149883
Recall-score (positive class) on testing data = 0.5782857142857143
-----------------
Balanced accuracy on training data = 0.5614519502796093
Recall-score (positive class) on training data = 0.6318859364873622


In [None]:
## HYPERPARAMETER TUNING PROCESS FOR RQ4 - GRADIENT BOOSTED DECISION TREE
## Source of code: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74 
## Random search iteration 1: 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import numpy as np

## Creating the random grid
random_grid = {'n_estimators': [int(x) for x in np.linspace(start=50, stop = 700, num = 10)],
            'learning_rate': [0.2, 0.5, 0.7, 0.9],
            'max_depth': [int(x) for x in np.linspace(start=3, stop = 15, num = 5)],
            'min_samples_split': [int(x) for x in np.linspace(start=5, stop = 100, num = 5)],
            'min_samples_leaf': [int(x) for x in np.linspace(start=5, stop = 100, num = 10)],
            'subsample': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.3, 1.5]}

## Creating the model to tune
gbdt = GradientBoostingClassifier(random_state=101)

## Random search procedure of parameters, using 3 fold cross validation
gbdt_random = RandomizedSearchCV(estimator = gbdt, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=101, n_jobs = -1)

## Fitting the random search model to the previously defined resampled dataset using SMOTE
gbdt_random.fit(X_train_scaled, y_train, sample_weight = weights)

## Retrieving the best model parameters according to this iteration
gbdt_random.best_params_