In [1]:
# Decision tree model to predict the patient's 'Frailty_Score'
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import tree

In [2]:
# Import data
# The path is a raw string so the Windows backslashes are taken literally.
# In a normal string, sequences such as "\D", "\E" and "\g" are invalid
# escape sequences: they only work by accident and trigger a warning on
# recent Python versions. The resulting path value is unchanged.
file_path = r'P:\DATA_OCT_22\Expert_Eye\Dataset\Data\global_dataset.xlsx'
df = pd.read_excel(file_path)

# Work on a copy so the frame exactly as loaded stays available in `df`.
data = df.copy()

In [3]:
# Drop the 'Foldername' column and the rows with a missing 'Frailty_Score'.
# The result is derived from `df` (the untouched frame that `data` was copied
# from) instead of re-assigning `data` from itself, so the cell can be re-run
# without raising a KeyError on the already-removed 'Foldername' column.
labelled = df.drop(columns=['Foldername']).dropna(subset=['Frailty_Score'])

# Drop the columns with more than 50% missing values.
# `thresh` is the minimum number of non-missing values a column needs in order
# to be kept; (n + 1) // 2 is ceil(0.5 * n) expressed as the integer that
# `dropna` documents for this argument, so the same columns survive.
min_non_missing = (len(labelled) + 1) // 2
data = labelled.dropna(axis=1, thresh=min_non_missing)

In [None]:
# Use regression imputation to fill the missing values
# IterativeImputer is experimental: the enabling import must run before it.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# Create a copy of the data so `data` itself is left untouched
data_copy = data.copy()

# Create a list of columns that have missing values
cols_with_missing = [col for col in data_copy.columns
                        if data_copy[col].isnull().any()]

# Add one indicator column per affected column, recording which rows were
# missing before imputation
for col in cols_with_missing:
    data_copy[col + '_was_missing'] = data_copy[col].isnull()

# Imputation
# Only the predictors go through the imputer. Leaving 'Frailty_Score' in would
# let the target act as a predictor for the filled-in values and leak label
# information into the features.
# NOTE(review): the imputer is still fitted on all rows, i.e. before the
# train/test split — consider fitting it on the training rows only.
target = data_copy['Frailty_Score']
features = data_copy.drop(columns=['Frailty_Score'])

# random_state follows the value used for the other estimators in this notebook.
my_imputer = IterativeImputer(random_state=42)

# fit_transform returns a bare array; passing index and columns back keeps the
# row labels of `data` (which has gaps after the row drop) so the imputed
# frame stays aligned with it.
features_imputed = pd.DataFrame(
    my_imputer.fit_transform(features),
    index=features.index,
    columns=features.columns,
)

# Re-attach the target and restore the original column order.
data_imputed = features_imputed.assign(Frailty_Score=target)[data_copy.columns]

# Check if there are any missing values
# NOTE(review): the split cell that follows still builds X and y from `data`,
# not from `data_imputed` — confirm which frame is meant to feed the model.
data_imputed.isnull().sum()

In [4]:
# Separate the target column from the predictors, then hold out 20% of the
# rows as a test set (fixed seed so the split is reproducible).
target_col = 'Frailty_Score'
y = data[target_col]
X = data.drop(columns=[target_col])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Baseline model: a decision tree regressor with default hyperparameters
# (seeded), fitted on the training split. `fit` returns the estimator itself,
# so construction and fitting are chained.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
dt

In [6]:
# Predicting the test set results
# `y_pred` holds the fitted tree's predictions for the held-out rows in
# `X_test`; the evaluation cell compares them against `y_test`.
y_pred = dt.predict(X_test)

In [7]:
# Model evaluation on the hold-out set: R2 plus the squared-error metrics.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target

# Report the metrics one per line. print() inserts its default separator
# between the label and the value.
for label, value in (
    ('Mean Squared Error: ', mse),
    ('Root Mean Squared Error: ', rmse),
    ('R2 Score: ', r2),
):
    print(label, value)



Mean Squared Error:  1.4326934459578824
Root Mean Squared Error:  1.1969517308387512
R2 Score:  0.40552139171872115


In [8]:
# Hyperparameter Tuning
# Grid Search
# The fractional candidates are written out explicitly instead of generated
# with np.arange(0.1, 0.5, 0.1). With a float step, arange produces
# 0.30000000000000004 rather than 0.3, and scikit-learn converts a fractional
# min_samples_leaf / min_samples_split into a sample count with
# ceil(fraction * n_samples), so that drift can raise the threshold by one
# sample. Plain Python numbers also keep the printed best_params_ free of
# NumPy scalar wrappers.
fraction_grid = [0.1, 0.2, 0.3, 0.4]
param_grid = {'max_depth': list(range(3, 10)),
              'min_samples_leaf': fraction_grid,
              'min_samples_split': fraction_grid,
              'max_features': fraction_grid}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split only; the test split is not touched here.
dt_cv = GridSearchCV(dt, param_grid, cv=5)
dt_cv.fit(X_train, y_train)

print("Tuned Decision Tree Parameters: {}".format(dt_cv.best_params_))
print("Best score is {}".format(dt_cv.best_score_))



Tuned Decision Tree Parameters: {'max_depth': 5, 'max_features': 0.4, 'min_samples_leaf': 0.2, 'min_samples_split': 0.2}
Best score is 0.18818693628743713


In [10]:
# Cross Validation
# 5-fold cross-validation of `dt` over the full feature matrix and target.
# NOTE(review): this scores the baseline `dt`, not the tuned estimator found
# by the grid search (`dt_cv`) — confirm that is the intended comparison.
cv_scores = cross_val_score(dt, X, y, cv=5)

# Per-fold scores first, then their mean.
print(cv_scores)
print(f"Average 5-Fold CV Score: {cv_scores.mean()}")



[ 0.17425836 -0.31540945  0.09421382  0.32836556  0.5761697 ]
Average 5-Fold CV Score: 0.1715195978489306
