In [6]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
from custom_functions import processing
from custom_functions import processing
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import statistics
import numpy as np
import statistics
import pandas as pd

In [9]:
# Location of the Excel workbook. A raw string keeps every backslash literal:
# in a normal string, sequences such as "\D", "\E" and "\d" are invalid escapes
# that newer Python versions warn about (and are slated to reject).
file_path = r'P:\DATA_OCT_22\Expert_Eye\Dataset\Data\data_v10.xlsx'

# Load the sheet and discard the 'Foldername' column in a single step, so that
# re-running the cell always starts again from the file on disk.
dataset = pd.read_excel(file_path).drop(columns=['Foldername'])
dataset.head()

Unnamed: 0,Gender,VINCQ32DDN,VINICODEX003,FROPCOM0001,FROPCOM0005,FROPCOM0006_S1_,FROPCOM0006_S2_,FROPCOM0006_S3_,FROPCOM0006_S4_,FROPCOM0006_S5_,...,HADS_D_Score,walk_time_4m,Item_1,Item_2,Item_3,Item_4,Item_5,Fried_Score,Fried_State,grip
0,0,76.0,0.0,0.0,3.0,,,,,1.0,...,9,8.45,0.0,0,1.0,1.0,,2,0,
1,0,75.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,...,9,6.11,0.0,0,1.0,1.0,1.0,3,1,21.5
2,0,67.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,10,20.0,0.0,0,0.0,1.0,1.0,2,0,23.2
3,0,72.0,1.0,0.0,1.0,,,,,,...,15,4.87,0.0,1,0.0,0.0,1.0,2,0,17.7
4,1,69.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10,3.48,0.0,0,0.0,0.0,1.0,1,0,40.0


In [8]:

# Seed handed to both estimators so that repeated runs of the notebook fit the
# same models (the outer KFold split is seeded with the same value).
RANDOM_STATE = 42

# Models: candidate regressors, keyed by the name that is also used to look up
# the matching search grid in `params`.
models = {
    #'DecisionTree': DecisionTreeRegressor(),
    'XGBoost': XGBRegressor(random_state=RANDOM_STATE),
    'RandomForest': RandomForestRegressor(random_state=RANDOM_STATE),
}

# Hyperparameters: one grid per entry of `models`; every combination of the
# listed values is tried by the grid search.
params = {
    #'DecisionTree': {
    #    'max_depth': [5, 7, 10],
    #    'min_samples_leaf': [2, 5, 10],
    #    'max_features': ['sqrt', 'log2'],
    #    'criterion': ['squared_error', 'friedman_mse']
    #},

    # 3 * 3 * 3 * 3 = 81 candidate settings
    'XGBoost': {
        'n_estimators': [30, 50, 100],
        'max_depth': [5, 7, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'min_child_weight': [1, 2, 4],
    },

    # 3 * 3 * 3 * 2 * 2 = 108 candidate settings
    'RandomForest': {
        'n_estimators': [30, 50, 100],
        'max_depth': [5, 8, 10],
        'max_features': ['sqrt', 'log2', None],
        'criterion': ['squared_error', 'friedman_mse'],
        'bootstrap': [True, False],
    },
}

In [4]:
# Nested cross-validation: an outer KFold produces train/test folds, and inside
# each outer training fold a GridSearchCV picks hyperparameters for every model.
# Per-fold MAE / MSE / R2 are collected, summarised, printed and plotted.

# Split data
n_splits = 5
best_model_params = {}    # initialised here; this cell does not fill it
model_metrics = {}        # model name -> metric name -> per-fold values, later summary
feature_importances = {}  # initialised here; this cell does not fill it

pred_vs_true = {}  # store true and predicted values for scatter plot

# Columns that are removed from the frame before building the feature matrix
target_columns = ['Fried_State', 'Fried_Score', 'Frailty_State', 'Frailty_Score']
features = dataset.drop(target_columns, axis=1)

# Store the column names in a variable
feature_names = features.columns

X = features.values

y = dataset['Fried_Score'].values

# For regression, KFold instead of StratifiedKFold
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Loop through each Fold
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Impute missing values on train set
    X_train_imputed, best_k = processing(X_train, n_splits=n_splits, k_values=[1, 3, 5, 7, 9], verbose=False)

    # NOTE(review): the test fold goes through its own, separate `processing`
    # call, so whatever that helper estimates is presumably re-derived from the
    # test rows instead of being reused from the training fold -- TODO confirm
    # against `custom_functions.processing`. `best_k` is overwritten here.
    X_test_imputed, best_k = processing(X_test, n_splits=n_splits, k_values=[1, 3, 5, 7, 9], verbose=False)

    # Loop through each model and perform grid search
    for model_name, model in models.items():
        print(f'Performing Grid Search for {model_name}...')

        grid_search = GridSearchCV(
            estimator=model,
            param_grid=params[model_name],
            cv=5,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1,
        )

        # Create the per-model containers the first time a model is seen
        fold_metrics = model_metrics.setdefault(model_name, {'mae': [], 'mse': [], 'r2': []})

        grid_search.fit(X_train_imputed, y_train)

        best_model = grid_search.best_estimator_

        # Predictions
        y_pred = best_model.predict(X_test_imputed)

        # Metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        fold_metrics['mae'].append(mae)
        fold_metrics['mse'].append(mse)
        fold_metrics['r2'].append(r2)

        fold_points = pred_vs_true.setdefault(model_name, {'true': [], 'pred': []})
        fold_points['true'].extend(y_test)
        fold_points['pred'].extend(y_pred)

# Calculate average and std metrics over all folds
for model_name in models.keys():
    for metric in model_metrics[model_name]:
        fold_values = model_metrics[model_name][metric]
        # Divide by the number of values actually recorded rather than by the
        # configured fold count, so the mean stays correct if they ever differ.
        average_metric = sum(fold_values) / len(fold_values)
        std_metric = statistics.stdev(fold_values) if len(fold_values) > 1 else 0.0
        # The per-fold list is replaced by its summary
        model_metrics[model_name][metric] = {'average': average_metric, 'std': std_metric}


# Display metrics
for model_name, metrics in model_metrics.items():
    print(f"Model: {model_name}")
    for metric, values in metrics.items():
        print(f"Average {metric}: {values['average']}, {values['std']}")

# Scatter Plot with Line of Best Fit
for model_name, data in pred_vs_true.items():
    fig, ax = plt.subplots(figsize=(6, 6))

    # Dedicated names: reusing `y` here would overwrite the target vector above
    true_values = np.array(data['true'])
    predicted_values = np.array(data['pred'])
    lowest, highest = true_values.min(), true_values.max()

    # Scatter plot
    ax.scatter(true_values, predicted_values, alpha=0.5)

    # Line of best fit (degree-1 polynomial through the pooled fold predictions)
    coeffs = np.polyfit(true_values, predicted_values, 1)
    line_function = np.poly1d(coeffs)
    x_line = np.linspace(lowest, highest, 100)
    ax.plot(x_line, line_function(x_line), color='red', linestyle='--', linewidth=2, label="Line of best fit")

    # Identity line
    ax.plot([lowest, highest], [lowest, highest], color='green', linestyle='-.', linewidth=2, label="Identity line")

    ax.set_title(f"{model_name} - True vs Predicted")
    ax.set_xlabel("True Values")
    ax.set_ylabel("Predicted Values")
    # Without a legend the line labels are never rendered
    ax.legend()
    plt.show()

Performing Grid Search for XGBoost...
Fitting 5 folds for each of 81 candidates, totalling 405 fits


KeyboardInterrupt: 