In [None]:
#Import general libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Import libraries useful for building the pipeline and join their branches
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin


#import modules created for data preparation phase
import my_utils
import missing_val_imput
import feature_select
import preprocessing
import adhoc_transf

#import libraries for data preparation phase
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder


#import libraries from modelling phase
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

#import survival ensemble estimators (random forest and gradient boosting)
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest

#to save model fit with GridSearchCV and avoid longer waits
import joblib


In [None]:
# 15.6.23 The target combines the death event with the follow-up time
# Load the dataset
path_data = r'heart_failure_clinical_records_dataset.csv'

df = pd.read_csv(path_data)
df.head()

#%% Characterize the data set: target columns and the feature groups used by the pipelines
target_features = ['DEATH_EVENT', 'time']
numerical_feats = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
]
nominal_feats = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]
ordinal_feats = []

# Group sizes; they bound the k_out_features ranges of the grid searches further down
len_numerical_feats, len_nominal_feats, len_ordinal_feats = map(
    len, (numerical_feats, nominal_feats, ordinal_feats)
)

In [None]:
#%%
###################################################################################################################
#Step 1 Solving wrong characters of dataset
###################################################################################################################
#Set column id as index


# CKD case does only have misspellingCorrector
# df_content_solver=Pipeline([('fx1', misspellingCorrector()),
#                             ('fx2',function2()),
#                             ('fx3',function3())
# ])

#%%
df = adhoc_transf.ageRounder().fit_transform(df)
#my_utils.df_values(df)

# The casts below assign whole columns (df[cols] = ...) rather than df.loc[:, cols] = ...:
# the .loc form writes the new values into the existing columns and can keep their previous
# dtype, which would silently undo the cast.

#%%Performing numeric cast for numerical features
df[numerical_feats] = adhoc_transf.Numeric_Cast_Column().fit_transform(df[numerical_feats])
df[numerical_feats].dtypes


#%%Performing category cast for nominal features
df[nominal_feats] = adhoc_transf.Category_Cast_Column().fit_transform(df[nominal_feats])
df[nominal_feats].dtypes

#%%Performing category cast for ordinal features
# ordinal_feats is empty for this data set; skip the transformer instead of feeding it a
# frame without columns.
if ordinal_feats:
    df[ordinal_feats] = adhoc_transf.Category_Cast_Column().fit_transform(df[ordinal_feats])
df[ordinal_feats].dtypes

#%%
###################################################################################################################
##Step 1.1 Winsorization: clip outliers to chosen lower/upper percentiles (the call below uses the 0th and 90th, so only the upper tail is clipped)
####################################################################


def winsorize_percentiles(df, columns, lower_percentile, upper_percentile):
    """Clip the given columns to their own lower/upper percentile values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is left untouched; a winsorized copy is returned.
    columns : iterable of str
        Names of the columns to clip.
    lower_percentile, upper_percentile : float
        Percentiles on a 0-100 scale, with lower_percentile <= upper_percentile.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every listed column is limited to the range
        [lower percentile value, upper percentile value] computed on that column.

    Raises
    ------
    ValueError
        If the percentiles are outside 0-100 or the lower one exceeds the upper one.
    """
    if not 0 <= lower_percentile <= upper_percentile <= 100:
        raise ValueError(
            "percentiles must satisfy 0 <= lower_percentile <= upper_percentile <= 100, "
            f"got lower_percentile={lower_percentile}, upper_percentile={upper_percentile}"
        )

    # Work on a copy so the frame passed in by the caller is not modified as a side effect.
    winsorized = df.copy()
    for column in columns:
        lower_limit = winsorized[column].quantile(lower_percentile / 100)
        upper_limit = winsorized[column].quantile(upper_percentile / 100)
        winsorized[column] = winsorized[column].clip(lower_limit, upper_limit)
    return winsorized

# Winsorize the continuous laboratory measurements of the data set

columns_to_winsorize = [
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
]
lower_percentile, upper_percentile = 0, 90

df = winsorize_percentiles(
    df,
    columns=columns_to_winsorize,
    lower_percentile=lower_percentile,
    upper_percentile=upper_percentile,
)


In [None]:
#%%
#Import the estimators for survival analysis
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxPHSurvivalAnalysis
#RF survival estimator
#GB survival estimator
from sksurv.svm import FastSurvivalSVM


#Import metrics
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)

from sksurv.metrics import (
    as_concordance_index_ipcw_scorer,
    as_cumulative_dynamic_auc_scorer,
    as_integrated_brier_score_scorer,
)

In [None]:
#%%
#15.6.2023
###################################################################################################################
##Step 2 Train-Test splitting
###################################################################################################################

# Split the dataset into train and test.
# The stratification is made on the death event so both parts keep the same event ratio.
test_ratio_split = 0.3
train_set, test_set = train_test_split(
    df,
    test_size=test_ratio_split,
    random_state=42,
    stratify=df['DEATH_EVENT'],
)

# Features: everything except the target columns. Targets: an independent copy of those columns.
X_train, X_test = (subset.drop(target_features, axis=1) for subset in (train_set, test_set))
y_train, y_test = (subset[target_features].copy() for subset in (train_set, test_set))

In [None]:
# The event indicator is stored as a boolean column in both target frames
y_train = y_train.astype({'DEATH_EVENT': bool})
y_test = y_test.astype({'DEATH_EVENT': bool})

In [None]:
# Convert both target frames to record arrays (one field per target column, index dropped)
y_train, y_test = (
    target_frame.to_records(index=False) for target_frame in (y_train, y_test)
)

In [None]:
#%%
###################################################################################################################
##Step 4 Building pipelines for data preparation
###################################################################################################################

# Three pipeline modes are considered:
# a) parallel approach: feature selection runs separately for numerical, nominal and ordinal features
# b) general approach: feature selection runs once over all features together
# c) no feature selection at all

# Per-feature-type preprocessing, applied before any feature selection
pipeline_numeric_feat = Pipeline(steps=[
    ('data_missing', missing_val_imput.Numeric_Imputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

pipeline_numeric_feat_mean = Pipeline(steps=[
    ('data_missing', missing_val_imput.Numeric_Imputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# OneHotEncoder is not used because it enlarges the number of nominal features
pipeline_nominal_feat = Pipeline(steps=[
    ('data_missing', missing_val_imput.Category_Imputer()),
    ('encoding', OrdinalEncoder()),
])

pipeline_ordinal_feat = Pipeline(steps=[
    ('data_missing', missing_val_imput.Category_Imputer(strategy='most_frequent')),
    ('encoding', OrdinalEncoder()),
])


# option a): preprocessing followed by a per-type feature selector
pipe_numeric_featsel = Pipeline(steps=[
    ('data_prep', pipeline_numeric_feat),
    ('feat_sel', feature_select.Feature_Selector(strategy='wrapper_RFECV')),
])
pipe_nominal_featsel = Pipeline(steps=[
    ('data_prep', pipeline_nominal_feat),
    ('feat_sel', feature_select.Feature_Selector(strategy='wrapper_RFECV')),
])
pipe_ordinal_featsel = Pipeline(steps=[
    ('data_prep', pipeline_ordinal_feat),
    ('feat_sel', feature_select.Feature_Selector(strategy='wrapper_RFECV')),
])

dataprep_pipe_opta = ColumnTransformer(transformers=[
    ('numeric_pipe', pipe_numeric_featsel, numerical_feats),
    ('nominal_pipe', pipe_nominal_featsel, nominal_feats),
    ('ordinal_pipe', pipe_ordinal_featsel, ordinal_feats),
])

# option c): preprocessing only, no feature selection
dataprep_merge_feat = ColumnTransformer(transformers=[
    ('numeric_pipe', pipeline_numeric_feat, numerical_feats),
    ('nominal_pipe', pipeline_nominal_feat, nominal_feats),
    ('ordinal_pipe', pipeline_ordinal_feat, ordinal_feats),
])

In [None]:
# Init the survival estimators here.
# Every name is bound to an estimator *instance*; a fixed random_state (the same seed as the
# train/test split) keeps the estimators reproducible between runs.
cox = CoxPHSurvivalAnalysis()
survRF = RandomSurvivalForest(random_state=42)
survGB = GradientBoostingSurvivalAnalysis(random_state=42)
survSVM = FastSurvivalSVM(random_state=42)

In [None]:
###################################################################################################################
##Step 7 Training the data set with GridSearchCV
###################################################################################################################


##7.a.1 Parallel approach
###################################################################################################################
# Data preparation (option a) chained with the Cox estimator; the parameter names listed below
# are the keys a GridSearchCV parameter grid can address.
full_parallel_pipe_opta = Pipeline(steps=[
    ('data_prep', dataprep_pipe_opta),
    ('est', cox),
])
full_parallel_pipe_opta.get_params().keys()

In [None]:
# Survival metrics do not follow the make_scorer(y_true, y_pred) protocol: they need the event
# indicator and the observed time separately (and some of them the training targets or a grid
# of time points), so wrapping them with make_scorer is not valid.
# Harrell's concordance index can be written directly as a scorer callable. The IPCW
# concordance index, the cumulative/dynamic AUC and the integrated Brier score are instead
# obtained by wrapping the estimator with the sksurv as_*_scorer classes, one GridSearchCV per
# metric.
def _cindex_censored_scorer(estimator, X, y):
    """Scorer callable: Harrell's concordance index of ``estimator`` on ``(X, y)``.

    ``y`` is a structured array whose first field is the boolean event indicator and whose
    second field is the observed time. Only the concordance value (first item of the tuple
    returned by ``concordance_index_censored``) is reported.
    """
    event_field, time_field = y.dtype.names
    risk_scores = estimator.predict(X)
    return concordance_index_censored(y[event_field], y[time_field], risk_scores)[0]


scoring = {
    'c_index_censored': _cindex_censored_scorer,
    }

In [None]:
# Example of a single, individual fit
# NOTE(review): the pipeline is rebuilt here, but the object that gets fitted is the bare Cox
# model on the unprocessed X_train -- confirm this is the intended example.
cox = CoxPHSurvivalAnalysis()
full_parallel_pipe_opta = Pipeline(steps=[
    ('data_prep', dataprep_pipe_opta),
    ('clf', cox),
])
cox.fit(X_train, y_train)

In [None]:
# A separate GridSearchCV has to be created for each of the metrics considered.
# NOTE(review): y_train_time is only assigned in a later cell, and cv_param_grid, cv, gbsg_X and
# gbsg_y are not defined anywhere in the visible part of this notebook -- presumably this cell is
# a template that still has to be adapted to X_train / y_train; confirm before running.



gcv_cindex = GridSearchCV(
    as_concordance_index_ipcw_scorer(cox, tau=y_train_time[-1]),
    param_grid=cv_param_grid,
    cv=cv,
    n_jobs=4,
).fit(gbsg_X, gbsg_y)

In [None]:
###################################################################################################################
#Step 8: Application of SCI-XAI per each type of survival analysis estimator
###################################################################################################################

#vcox_Cindex: Cox's proportional hazards model and concordance index ipcw as scorer
cox = CoxPHSurvivalAnalysis()
y_train_time = y_train['time']
# Truncation time for the IPCW concordance index. The last element of y_train_time is only the
# follow-up time of whichever patient ended up last after shuffling, not an upper bound.
# scikit-survival asks for a tau at which censoring is still possible, so an upper percentile
# of the training follow-up times is used instead of an arbitrary (or the largest) time.
TAU_PERCENTILE = 80
tau_cindex = np.percentile(y_train_time, TAU_PERCENTILE)
full_parallel_pipe_opta = Pipeline([('data_prep', dataprep_pipe_opta), ('clf', cox)])
##########################################################################################################################################
#%%
# The object tuned by GridSearchCV is the as_concordance_index_ipcw_scorer wrapper, which holds
# the pipeline as its `estimator` parameter: every grid key therefore starts with `estimator__`.
param_grid_vcox_Cindex_exp = {
    'estimator__data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    'estimator__data_prep__numeric_pipe__feat_sel__k_out_features': [*range(1, len_numerical_feats + 1)],
    'estimator__data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    'estimator__data_prep__nominal_pipe__feat_sel__k_out_features': [*range(1, len_nominal_feats + 1)],
    'estimator__data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

clf_vcox_exp_Cindex = GridSearchCV(
    as_concordance_index_ipcw_scorer(full_parallel_pipe_opta, tau=tau_cindex),
    param_grid_vcox_Cindex_exp,
    cv=5,
    n_jobs=None,
)
clf_vcox_exp_Cindex.fit(X_train, y_train)
#%%
print('Score of best estimator of clf_vcox_exp_Cindex:', clf_vcox_exp_Cindex.best_score_)

#%%
#Saving the results in an excel
df_results_vcox_exp = pd.DataFrame(clf_vcox_exp_Cindex.cv_results_)
df_results_vcox_exp.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization_SurvAnalysis/df_results_vcox_exp.xlsx', index=False)
#Saving the model
joblib.dump(clf_vcox_exp_Cindex, r'GridSearchCV_results/HF_case_fullpaper_winsorization_SurvAnalysis/clf_vcox_exp.pkl', compress=1)



In [None]:
#%%
#Obtaining results with the test set
# predict() on the fitted search delegates to the estimator refit with the best parameters and
# returns one risk score per test sample.
y_pred_vcox_exp = clf_vcox_exp_Cindex.predict(X_test)

# concordance_index_censored expects (event_indicator, event_time, estimate) and returns a
# tuple whose first item is the concordance index.
cindex_censored_test = concordance_index_censored(
    y_test['DEATH_EVENT'], y_test['time'], y_pred_vcox_exp
)[0]

# Risk scores are continuous, so classification metrics (accuracy, f1, ...) do not apply here;
# the test set is reported with concordance indices only. score() evaluates the same IPCW
# concordance index that drove the grid search.
test_results_cox = {'clf': ['clf_vcox_exp_Cindex'],
                    'params': [clf_vcox_exp_Cindex.best_params_],
                    'cindex_censored_test': [cindex_censored_test],
                    'cindex_ipcw_test': [clf_vcox_exp_Cindex.score(X_test, y_test)],
    }
#%%
# Stored next to the Cox grid-search results so the decision-tree result files are not overwritten
test_results_cox_paper = pd.DataFrame(data=test_results_cox)
test_results_cox_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization_SurvAnalysis/test_results_cox_paper.xlsx', index=False)

In [None]:
###################################################################################################################
#Step 8_notime: Application of SCI-XAI per each type of classifier
###################################################################################################################

#vDT:DecisionTree_notime
##########################################################################################################################################
# NOTE(review): dectree_clf, full_parallel_pipe_opta_notime, X_train_notime, X_test_notime and a
# `scoring` dict with a 'balanced_accuracy' entry are not defined in the visible part of this
# notebook -- confirm they exist before running this cell.
#%%
# Classifiers and classification metrics need a 1-D label vector. y_train / y_test hold both
# DEATH_EVENT and time, so only the event field (as 0/1) is used in this cell.
y_train_event = y_train['DEATH_EVENT'].astype(int)
y_test_event = y_test['DEATH_EVENT'].astype(int)

param_grid_vDT_exp = {
    'clf': [dectree_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    'data_prep__numeric_pipe__feat_sel__k_out_features': [*range(1, len_numerical_feats + 1)],
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    'data_prep__nominal_pipe__feat_sel__k_out_features': [*range(1, len_nominal_feats + 1)],
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

clf_vDT_exp = GridSearchCV(full_parallel_pipe_opta_notime, param_grid_vDT_exp, scoring=scoring,
                           refit='balanced_accuracy', cv=5, n_jobs=None)
clf_vDT_exp.fit(X_train_notime, y_train_event)
#%%
print('Score of best estimator of clf_vDT_exp:', clf_vDT_exp.best_score_) #Score of best estimator of clf_vDT:0.8063281546040166

#%%
#Saving the results in an excel
df_results_vDT_exp = pd.DataFrame(clf_vDT_exp.cv_results_)
df_results_vDT_exp.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vDT_exp_notime.xlsx', index=False)
#Saving the model
joblib.dump(clf_vDT_exp, r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vDT_exp_notime.pkl', compress=1)

#%%
#Obtaining classification with test set
# predict() uses the estimator that GridSearchCV refit with the best parameters.
y_pred_vDT_exp = clf_vDT_exp.predict(X_test_notime)

test_results_DT = {'clf': ['clf_vDT_exp'],
                   'params': [clf_vDT_exp.best_params_],
                   'accuracy_test': [accuracy_score(y_test_event, y_pred_vDT_exp)],
                   'balanced_accuracy_test': [balanced_accuracy_score(y_test_event, y_pred_vDT_exp)],
                   'f1_test': [f1_score(y_test_event, y_pred_vDT_exp)],
                   'precision_test': [precision_score(y_test_event, y_pred_vDT_exp)],
                   'recall_test': [recall_score(y_test_event, y_pred_vDT_exp)],
                   'specificity_test': [recall_score(y_test_event, y_pred_vDT_exp, pos_label=0)],
                   'roc_auc_test': [roc_auc_score(y_test_event, y_pred_vDT_exp)]
    }
#%%
test_results_DT_paper = pd.DataFrame(data=test_results_DT)
test_results_DT_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_DT_paper_notime.xlsx', index=False)


In [None]:
#vRF:Random Forest
###################################################################################################################
# NOTE(review): rndforest_clf and a `scoring` dict with a 'balanced_accuracy' entry are not
# defined in the visible part of this notebook -- confirm they exist before running this cell.
#%%
# Classifiers and classification metrics need a 1-D label vector. y_train / y_test hold both
# DEATH_EVENT and time, so only the event field (as 0/1) is used in this cell.
y_train_event = y_train['DEATH_EVENT'].astype(int)
y_test_event = y_test['DEATH_EVENT'].astype(int)

param_grid_vRF_exp = {
    'clf': [rndforest_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['median'],
    'data_prep__numeric_pipe__feat_sel__k_out_features': [*range(1, len_numerical_feats + 1)],
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    'data_prep__nominal_pipe__feat_sel__k_out_features': [*range(1, len_nominal_feats + 1)],
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

clf_vRF_exp = GridSearchCV(full_parallel_pipe_opta, param_grid_vRF_exp, scoring=scoring,
                           refit='balanced_accuracy', cv=5, n_jobs=None)
clf_vRF_exp.fit(X_train, y_train_event)
#%%
print('Score of best estimator of clf_vRF_exp:', clf_vRF_exp.best_score_) #Score of best estimator of clf_vRF: 1

#Saving the results in an excel
df_results_vRF_exp = pd.DataFrame(clf_vRF_exp.cv_results_)
df_results_vRF_exp.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vRF_exp.xlsx', index=False)
#Saving the model
joblib.dump(clf_vRF_exp, r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vRF_exp.pkl', compress=1)

#%%
#Obtaining classification with test set
# predict() uses the estimator that GridSearchCV refit with the best parameters.
y_pred_vRF_exp = clf_vRF_exp.predict(X_test)

test_results_RF = {'clf': ['clf_vRF_exp'],
                   'params': [clf_vRF_exp.best_params_],
                   'accuracy_test': [accuracy_score(y_test_event, y_pred_vRF_exp)],
                   'balanced_accuracy_test': [balanced_accuracy_score(y_test_event, y_pred_vRF_exp)],
                   'f1_test': [f1_score(y_test_event, y_pred_vRF_exp)],
                   'precision_test': [precision_score(y_test_event, y_pred_vRF_exp)],
                   'recall_test': [recall_score(y_test_event, y_pred_vRF_exp)],
                   'specificity_test': [recall_score(y_test_event, y_pred_vRF_exp, pos_label=0)],
                   'roc_auc_test': [roc_auc_score(y_test_event, y_pred_vRF_exp)]
    }
#%%
test_results_RF_paper = pd.DataFrame(data=test_results_RF)
test_results_RF_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_RF_paper.xlsx', index=False)


In [None]:
#vRF:Random Forest_notime
###################################################################################################################
# NOTE(review): rndforest_clf, full_parallel_pipe_opta_notime, X_train_notime, X_test_notime and
# a `scoring` dict with a 'balanced_accuracy' entry are not defined in the visible part of this
# notebook -- confirm they exist before running this cell.
#%%
# Classifiers and classification metrics need a 1-D label vector. y_train / y_test hold both
# DEATH_EVENT and time, so only the event field (as 0/1) is used in this cell.
y_train_event = y_train['DEATH_EVENT'].astype(int)
y_test_event = y_test['DEATH_EVENT'].astype(int)

param_grid_vRF_exp = {
    'clf': [rndforest_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['median'],
    'data_prep__numeric_pipe__feat_sel__k_out_features': [*range(1, len_numerical_feats + 1)],
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    'data_prep__nominal_pipe__feat_sel__k_out_features': [*range(1, len_nominal_feats + 1)],
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

clf_vRF_exp = GridSearchCV(full_parallel_pipe_opta_notime, param_grid_vRF_exp, scoring=scoring,
                           refit='balanced_accuracy', cv=5, n_jobs=None)
clf_vRF_exp.fit(X_train_notime, y_train_event)
#%%
print('Score of best estimator of clf_vRF_exp:', clf_vRF_exp.best_score_) #Score of best estimator of clf_vRF: 1

#Saving the results in an excel
df_results_vRF_exp = pd.DataFrame(clf_vRF_exp.cv_results_)
df_results_vRF_exp.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vRF_exp_notime.xlsx', index=False)
#Saving the model
joblib.dump(clf_vRF_exp, r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vRF_exp_notime.pkl', compress=1)

#%%
#Obtaining classification with test set
# predict() uses the estimator that GridSearchCV refit with the best parameters.
y_pred_vRF_exp = clf_vRF_exp.predict(X_test_notime)

test_results_RF = {'clf': ['clf_vRF_exp'],
                   'params': [clf_vRF_exp.best_params_],
                   'accuracy_test': [accuracy_score(y_test_event, y_pred_vRF_exp)],
                   'balanced_accuracy_test': [balanced_accuracy_score(y_test_event, y_pred_vRF_exp)],
                   'f1_test': [f1_score(y_test_event, y_pred_vRF_exp)],
                   'precision_test': [precision_score(y_test_event, y_pred_vRF_exp)],
                   'recall_test': [recall_score(y_test_event, y_pred_vRF_exp)],
                   'specificity_test': [recall_score(y_test_event, y_pred_vRF_exp, pos_label=0)],
                   'roc_auc_test': [roc_auc_score(y_test_event, y_pred_vRF_exp)]
    }
#%%
test_results_RF_paper = pd.DataFrame(data=test_results_RF)
test_results_RF_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_RF_paper_notime.xlsx', index=False)

In [None]:
#%%
#vET:Extra Trees
###################################################################################################################
# NOTE(review): extratree_clf and a `scoring` dict with a 'balanced_accuracy' entry are not
# defined in the visible part of this notebook -- confirm they exist before running this cell.
#%%
# Classifiers and classification metrics need a 1-D label vector. y_train / y_test hold both
# DEATH_EVENT and time, so only the event field (as 0/1) is used in this cell.
y_train_event = y_train['DEATH_EVENT'].astype(int)
y_test_event = y_test['DEATH_EVENT'].astype(int)

param_grid_vET_exp = {
    'clf': [extratree_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    'data_prep__numeric_pipe__feat_sel__k_out_features': [*range(1, len_numerical_feats + 1)],
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    'data_prep__nominal_pipe__feat_sel__k_out_features': [*range(1, len_nominal_feats + 1)],
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

clf_vET_exp = GridSearchCV(full_parallel_pipe_opta, param_grid_vET_exp, scoring=scoring,
                           refit='balanced_accuracy', cv=5, n_jobs=None)
clf_vET_exp.fit(X_train, y_train_event)
#%%
print('Score of best estimator of clf_vET_exp:', clf_vET_exp.best_score_) #Score of best estimator of clf_vET:0.7623721106479727

#%%
#Saving the results in an excel
df_results_vET_exp = pd.DataFrame(clf_vET_exp.cv_results_)
df_results_vET_exp.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vET_exp.xlsx', index=False)
#Saving the model
joblib.dump(clf_vET_exp, r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vET_exp.pkl', compress=1)

#%%
#Obtaining classification with test set
# predict() uses the estimator that GridSearchCV refit with the best parameters.
y_pred_vET_exp = clf_vET_exp.predict(X_test)

test_results_ET = {'clf': ['clf_vET_exp'],
                   'params': [clf_vET_exp.best_params_],
                   'accuracy_test': [accuracy_score(y_test_event, y_pred_vET_exp)],
                   'balanced_accuracy_test': [balanced_accuracy_score(y_test_event, y_pred_vET_exp)],
                   'f1_test': [f1_score(y_test_event, y_pred_vET_exp)],
                   'precision_test': [precision_score(y_test_event, y_pred_vET_exp)],
                   'recall_test': [recall_score(y_test_event, y_pred_vET_exp)],
                   'specificity_test': [recall_score(y_test_event, y_pred_vET_exp, pos_label=0)],
                   'roc_auc_test': [roc_auc_score(y_test_event, y_pred_vET_exp)]
    }
#%%
test_results_ET_paper = pd.DataFrame(data=test_results_ET)
test_results_ET_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_ET_paper.xlsx', index=False)



In [None]:
#%%
#vET:Extra Trees_notime
###################################################################################################################
# NOTE(review): extratree_clf, full_parallel_pipe_opta_notime, X_train_notime, X_test_notime and
# a `scoring` dict with a 'balanced_accuracy' entry are not defined in the visible part of this
# notebook -- confirm they exist before running this cell.
#%%
# Classifiers and classification metrics need a 1-D label vector. y_train / y_test hold both
# DEATH_EVENT and time, so only the event field (as 0/1) is used in this cell.
y_train_event = y_train['DEATH_EVENT'].astype(int)
y_test_event = y_test['DEATH_EVENT'].astype(int)

param_grid_vET_exp = {
    'clf': [extratree_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    'data_prep__numeric_pipe__feat_sel__k_out_features': [*range(1, len_numerical_feats + 1)],
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    'data_prep__nominal_pipe__feat_sel__k_out_features': [*range(1, len_nominal_feats + 1)],
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

clf_vET_exp = GridSearchCV(full_parallel_pipe_opta_notime, param_grid_vET_exp, scoring=scoring,
                           refit='balanced_accuracy', cv=5, n_jobs=None)
clf_vET_exp.fit(X_train_notime, y_train_event)
#%%
print('Score of best estimator of clf_vET_exp:', clf_vET_exp.best_score_) #Score of best estimator of clf_vET:0.7623721106479727

#%%
#Saving the results in an excel
df_results_vET_exp = pd.DataFrame(clf_vET_exp.cv_results_)
df_results_vET_exp.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vET_exp_notime.xlsx', index=False)
#Saving the model
joblib.dump(clf_vET_exp, r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vET_exp_notime.pkl', compress=1)

#%%
#Obtaining classification with test set
# predict() uses the estimator that GridSearchCV refit with the best parameters.
y_pred_vET_exp = clf_vET_exp.predict(X_test_notime)

test_results_ET = {'clf': ['clf_vET_exp'],
                   'params': [clf_vET_exp.best_params_],
                   'accuracy_test': [accuracy_score(y_test_event, y_pred_vET_exp)],
                   'balanced_accuracy_test': [balanced_accuracy_score(y_test_event, y_pred_vET_exp)],
                   'f1_test': [f1_score(y_test_event, y_pred_vET_exp)],
                   'precision_test': [precision_score(y_test_event, y_pred_vET_exp)],
                   'recall_test': [recall_score(y_test_event, y_pred_vET_exp)],
                   'specificity_test': [recall_score(y_test_event, y_pred_vET_exp, pos_label=0)],
                   'roc_auc_test': [roc_auc_score(y_test_event, y_pred_vET_exp)]
    }
#%%
test_results_ET_paper = pd.DataFrame(data=test_results_ET)
test_results_ET_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_ET_paper_notime.xlsx', index=False)


In [None]:
#%%
#vAB:AdaBoost
###################################################################################################################
# NOTE(review): ada_clf and a `scoring` dict with a 'balanced_accuracy' entry are not defined in
# the visible part of this notebook -- confirm they exist before running this cell.
#%%
# Classifiers and classification metrics need a 1-D label vector. y_train / y_test hold both
# DEATH_EVENT and time, so only the event field (as 0/1) is used in this cell.
y_train_event = y_train['DEATH_EVENT'].astype(int)
y_test_event = y_test['DEATH_EVENT'].astype(int)

param_grid_vAB_exp = {
    'clf': [ada_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    'data_prep__numeric_pipe__feat_sel__k_out_features': [*range(1, len_numerical_feats + 1)],
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    'data_prep__nominal_pipe__feat_sel__k_out_features': [*range(1, len_nominal_feats + 1)],
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

clf_vAB_exp = GridSearchCV(full_parallel_pipe_opta, param_grid_vAB_exp, scoring=scoring,
                           refit='balanced_accuracy', cv=5, n_jobs=None)
clf_vAB_exp.fit(X_train, y_train_event)
#%%
print('Score of best estimator of clf_vAB_exp:', clf_vAB_exp.best_score_) #Score of best estimator of clf_vAB:0.7623721106479727

#%%
#Saving the results in an excel
df_results_vAB_exp = pd.DataFrame(clf_vAB_exp.cv_results_)
df_results_vAB_exp.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vAB_exp.xlsx', index=False)
#Saving the model
joblib.dump(clf_vAB_exp, r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vAB_exp.pkl', compress=1)

#%%
#Obtaining classification with test set
# predict() uses the estimator that GridSearchCV refit with the best parameters.
y_pred_vAB_exp = clf_vAB_exp.predict(X_test)

test_results_AB = {'clf': ['clf_vAB_exp'],
                   'params': [clf_vAB_exp.best_params_],
                   'accuracy_test': [accuracy_score(y_test_event, y_pred_vAB_exp)],
                   'balanced_accuracy_test': [balanced_accuracy_score(y_test_event, y_pred_vAB_exp)],
                   'f1_test': [f1_score(y_test_event, y_pred_vAB_exp)],
                   'precision_test': [precision_score(y_test_event, y_pred_vAB_exp)],
                   'recall_test': [recall_score(y_test_event, y_pred_vAB_exp)],
                   'specificity_test': [recall_score(y_test_event, y_pred_vAB_exp, pos_label=0)],
                   'roc_auc_test': [roc_auc_score(y_test_event, y_pred_vAB_exp)]
    }
#%%
test_results_AB_paper = pd.DataFrame(data=test_results_AB)
test_results_AB_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_AB_paper.xlsx', index=False)


In [None]:
#%%
#vAB:AdaBoost_notime
###################################################################################################################
# NOTE(review): ada_clf, full_parallel_pipe_opta_notime, X_train_notime, X_test_notime and a
# `scoring` dict with a 'balanced_accuracy' entry are not defined in the visible part of this
# notebook -- confirm they exist before running this cell.
#%%
# Classifiers and classification metrics need a 1-D label vector. y_train / y_test hold both
# DEATH_EVENT and time, so only the event field (as 0/1) is used in this cell.
y_train_event = y_train['DEATH_EVENT'].astype(int)
y_test_event = y_test['DEATH_EVENT'].astype(int)

param_grid_vAB_exp = {
    'clf': [ada_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    'data_prep__numeric_pipe__feat_sel__k_out_features': [*range(1, len_numerical_feats + 1)],
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    'data_prep__nominal_pipe__feat_sel__k_out_features': [*range(1, len_nominal_feats + 1)],
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

clf_vAB_exp = GridSearchCV(full_parallel_pipe_opta_notime, param_grid_vAB_exp, scoring=scoring,
                           refit='balanced_accuracy', cv=5, n_jobs=None)
clf_vAB_exp.fit(X_train_notime, y_train_event)
#%%
print('Score of best estimator of clf_vAB_exp:', clf_vAB_exp.best_score_) #Score of best estimator of clf_vAB:0.7623721106479727

#%%
#Saving the results in an excel
df_results_vAB_exp = pd.DataFrame(clf_vAB_exp.cv_results_)
df_results_vAB_exp.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vAB_exp_notime.xlsx', index=False)
#Saving the model
joblib.dump(clf_vAB_exp, r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vAB_exp_notime.pkl', compress=1)

#%%
#Obtaining classification with test set
# predict() uses the estimator that GridSearchCV refit with the best parameters.
y_pred_vAB_exp = clf_vAB_exp.predict(X_test_notime)

test_results_AB = {'clf': ['clf_vAB_exp'],
                   'params': [clf_vAB_exp.best_params_],
                   'accuracy_test': [accuracy_score(y_test_event, y_pred_vAB_exp)],
                   'balanced_accuracy_test': [balanced_accuracy_score(y_test_event, y_pred_vAB_exp)],
                   'f1_test': [f1_score(y_test_event, y_pred_vAB_exp)],
                   'precision_test': [precision_score(y_test_event, y_pred_vAB_exp)],
                   'recall_test': [recall_score(y_test_event, y_pred_vAB_exp)],
                   'specificity_test': [recall_score(y_test_event, y_pred_vAB_exp, pos_label=0)],
                   'roc_auc_test': [roc_auc_score(y_test_event, y_pred_vAB_exp)]
    }
#%%
test_results_AB_paper = pd.DataFrame(data=test_results_AB)
test_results_AB_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_AB_paper_notime.xlsx', index=False)

In [None]:
#%%
#vGB:Gradient Boosting
###################################################################################################################
# NOTE(review): gradboost_clf and a `scoring` dict with a 'balanced_accuracy' entry are not
# defined in the visible part of this notebook -- confirm they exist before running this cell.
#%%
# Classifiers and classification metrics need a 1-D label vector. y_train / y_test hold both
# DEATH_EVENT and time, so only the event field (as 0/1) is used in this cell.
y_train_event = y_train['DEATH_EVENT'].astype(int)
y_test_event = y_test['DEATH_EVENT'].astype(int)

param_grid_vGB_exp = {
    'clf': [gradboost_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    'data_prep__numeric_pipe__feat_sel__k_out_features': [*range(1, len_numerical_feats + 1)],
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    'data_prep__nominal_pipe__feat_sel__k_out_features': [*range(1, len_nominal_feats + 1)],
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

clf_vGB_exp = GridSearchCV(full_parallel_pipe_opta, param_grid_vGB_exp, scoring=scoring,
                           refit='balanced_accuracy', cv=5, n_jobs=None)
clf_vGB_exp.fit(X_train, y_train_event)
#%%
print('Score of best estimator of clf_vGB_exp:', clf_vGB_exp.best_score_) #Score of best estimator of clf_vGB:0.7623721106479727

#%%
#Saving the results in an excel
df_results_vGB_exp = pd.DataFrame(clf_vGB_exp.cv_results_)
df_results_vGB_exp.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vGB_exp.xlsx', index=False)
#Saving the model
joblib.dump(clf_vGB_exp, r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vGB_exp.pkl', compress=1)

#%%
#Obtaining classification with test set
# predict() uses the estimator that GridSearchCV refit with the best parameters.
y_pred_vGB_exp = clf_vGB_exp.predict(X_test)

test_results_GB = {'clf': ['clf_vGB_exp'],
                   'params': [clf_vGB_exp.best_params_],
                   'accuracy_test': [accuracy_score(y_test_event, y_pred_vGB_exp)],
                   'balanced_accuracy_test': [balanced_accuracy_score(y_test_event, y_pred_vGB_exp)],
                   'f1_test': [f1_score(y_test_event, y_pred_vGB_exp)],
                   'precision_test': [precision_score(y_test_event, y_pred_vGB_exp)],
                   'recall_test': [recall_score(y_test_event, y_pred_vGB_exp)],
                   'specificity_test': [recall_score(y_test_event, y_pred_vGB_exp, pos_label=0)],
                   'roc_auc_test': [roc_auc_score(y_test_event, y_pred_vGB_exp)]
    }
#%%
test_results_GB_paper = pd.DataFrame(data=test_results_GB)
test_results_GB_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_GB_paper.xlsx', index=False)



In [None]:
# ---------------------------------------------------------------------------
# vGB: Gradient Boosting, "notime" variant
# ---------------------------------------------------------------------------
# Search space. Keys are routed to the pipeline steps named in their prefixes;
# the 'clf' entry plugs gradboost_clf into the pipeline's 'clf' step.
param_grid_vGB_exp = {
    'clf': [gradboost_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    # try every output feature count from 1 up to the number of numerical features
    'data_prep__numeric_pipe__feat_sel__k_out_features': list(range(1, len_numerical_feats + 1)),
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    # same idea for the nominal branch
    'data_prep__nominal_pipe__feat_sel__k_out_features': list(range(1, len_nominal_feats + 1)),
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

# 5-fold grid search; the best configuration by balanced accuracy is refitted.
clf_vGB_exp = GridSearchCV(
    estimator=full_parallel_pipe_opta_notime,
    param_grid=param_grid_vGB_exp,
    scoring=scoring,
    refit='balanced_accuracy',
    cv=5,
    n_jobs=None,
)
clf_vGB_exp.fit(X_train_notime, y_train)

# Report the mean cross-validated score of the selected configuration.
# NOTE(review): the score formerly recorded in a comment here (0.7623721106479727)
# appears verbatim in the sibling GB/XGB cells, so it is probably a copy-paste
# leftover; re-record it after running this cell.
print('Score of best estimator of clf_vGB_exp:', clf_vGB_exp.best_score_)

# Export the full cross-validation table to Excel ...
df_results_vGB_exp = pd.DataFrame(data=clf_vGB_exp.cv_results_)
df_results_vGB_exp.to_excel(
    r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vGB_exp_notime.xlsx',
    index=False,
)
# ... and persist the fitted search object so the search need not be repeated.
joblib.dump(
    clf_vGB_exp,
    r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vGB_exp_notime.pkl',
    compress=1,
)

#%%
#Obtaining classification with test set
# The former bare `clf_vGB_exp.refit` statement was removed: it only read an
# attribute and discarded the value. GridSearchCV was built with
# refit='balanced_accuracy', so the best estimator is already refitted by .fit().
y_pred_vGB_exp = clf_vGB_exp.predict(X_test_notime)
# ROC AUC needs continuous scores. With hard 0/1 predictions the curve has a single
# operating point and the value merely duplicates balanced accuracy.
# Column 1 is the probability of label 1 (classes are sorted; labels assumed to be {0, 1},
# consistent with pos_label=0 being used for specificity below).
y_score_vGB_exp = clf_vGB_exp.predict_proba(X_test_notime)[:, 1]

# One-row summary of the held-out metrics; each value is wrapped in a list so the
# dict maps directly onto a single-row DataFrame.
test_results_GB={'clf':['clf_vGB_exp'],
                 'params':[clf_vGB_exp.best_params_],
                 'accuracy_test':[accuracy_score(y_test, y_pred_vGB_exp)],
                 'balanced_accuracy_test':[balanced_accuracy_score(y_test, y_pred_vGB_exp)],
                 'f1_test':[f1_score(y_test, y_pred_vGB_exp)],
                 'precision_test':[precision_score(y_test, y_pred_vGB_exp)],
                 'recall_test':[recall_score(y_test, y_pred_vGB_exp)],
                 # specificity = recall of the negative class
                 'specificity_test':[recall_score(y_test, y_pred_vGB_exp,pos_label=0)],
                 'roc_auc_test':[roc_auc_score(y_test, y_score_vGB_exp)]
    }
#%%
# Persist the test metrics as a one-row spreadsheet.
test_results_GB_paper=pd.DataFrame(data=test_results_GB)
test_results_GB_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_GB_paper_notime.xlsx',index=False)


In [None]:
# ---------------------------------------------------------------------------
# vXGB: eXtreme Gradient Boosting
# ---------------------------------------------------------------------------
# Search space. Keys are routed to the pipeline steps named in their prefixes;
# the 'clf' entry plugs xgboost_clf into the pipeline's 'clf' step.
param_grid_vXGB_exp = {
    'clf': [xgboost_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    # try every output feature count from 1 up to the number of numerical features
    'data_prep__numeric_pipe__feat_sel__k_out_features': list(range(1, len_numerical_feats + 1)),
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    # same idea for the nominal branch
    'data_prep__nominal_pipe__feat_sel__k_out_features': list(range(1, len_nominal_feats + 1)),
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

# 5-fold grid search; the best configuration by balanced accuracy is refitted.
clf_vXGB_exp = GridSearchCV(
    estimator=full_parallel_pipe_opta,
    param_grid=param_grid_vXGB_exp,
    scoring=scoring,
    refit='balanced_accuracy',
    cv=5,
    n_jobs=None,
)
clf_vXGB_exp.fit(X_train, y_train)

# Report the mean cross-validated score of the selected configuration.
# NOTE(review): the score formerly recorded in a comment here (0.7623721106479727)
# appears verbatim in the sibling GB/XGB cells, so it is probably a copy-paste
# leftover; re-record it after running this cell.
print('Score of best estimator of clf_vXGB_exp:', clf_vXGB_exp.best_score_)

# Export the full cross-validation table to Excel ...
df_results_vXGB_exp = pd.DataFrame(data=clf_vXGB_exp.cv_results_)
df_results_vXGB_exp.to_excel(
    r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vXGB_exp.xlsx',
    index=False,
)
# ... and persist the fitted search object so the search need not be repeated.
joblib.dump(
    clf_vXGB_exp,
    r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vXGB_exp.pkl',
    compress=1,
)

#%%
#Obtaining classification with test set
# The former bare `clf_vXGB_exp.refit` statement was removed: it only read an
# attribute and discarded the value. GridSearchCV was built with
# refit='balanced_accuracy', so the best estimator is already refitted by .fit().
y_pred_vXGB_exp = clf_vXGB_exp.predict(X_test)
# ROC AUC needs continuous scores. With hard 0/1 predictions the curve has a single
# operating point and the value merely duplicates balanced accuracy.
# Column 1 is the probability of label 1 (classes are sorted; labels assumed to be {0, 1},
# consistent with pos_label=0 being used for specificity below).
y_score_vXGB_exp = clf_vXGB_exp.predict_proba(X_test)[:, 1]

# One-row summary of the held-out metrics; each value is wrapped in a list so the
# dict maps directly onto a single-row DataFrame.
test_results_XGB={'clf':['clf_vXGB_exp'],
                 'params':[clf_vXGB_exp.best_params_],
                 'accuracy_test':[accuracy_score(y_test, y_pred_vXGB_exp)],
                 'balanced_accuracy_test':[balanced_accuracy_score(y_test, y_pred_vXGB_exp)],
                 'f1_test':[f1_score(y_test, y_pred_vXGB_exp)],
                 'precision_test':[precision_score(y_test, y_pred_vXGB_exp)],
                 'recall_test':[recall_score(y_test, y_pred_vXGB_exp)],
                 # specificity = recall of the negative class
                 'specificity_test':[recall_score(y_test, y_pred_vXGB_exp,pos_label=0)],
                 'roc_auc_test':[roc_auc_score(y_test, y_score_vXGB_exp)]
    }
#%%
# Persist the test metrics as a one-row spreadsheet.
# File name corrected from 'est_results_...' to 'test_results_...' to match the
# naming used for the Gradient Boosting results.
test_results_XGB_paper=pd.DataFrame(data=test_results_XGB)
test_results_XGB_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_XGB_paper.xlsx',index=False)


In [None]:
# ---------------------------------------------------------------------------
# vXGB: eXtreme Gradient Boosting, "notime" variant
# ---------------------------------------------------------------------------
# Search space. Keys are routed to the pipeline steps named in their prefixes;
# the 'clf' entry plugs xgboost_clf into the pipeline's 'clf' step.
param_grid_vXGB_exp = {
    'clf': [xgboost_clf],
    'data_prep__numeric_pipe__data_prep__data_missing__strategy': ['mean', 'median'],
    # try every output feature count from 1 up to the number of numerical features
    'data_prep__numeric_pipe__feat_sel__k_out_features': list(range(1, len_numerical_feats + 1)),
    'data_prep__numeric_pipe__feat_sel__strategy': ['filter_num', 'filter_mutinf', 'wrapper_RFE'],
    # same idea for the nominal branch
    'data_prep__nominal_pipe__feat_sel__k_out_features': list(range(1, len_nominal_feats + 1)),
    'data_prep__nominal_pipe__feat_sel__strategy': ['filter_cat', 'filter_mutinf', 'wrapper_RFE'],
}

# 5-fold grid search; the best configuration by balanced accuracy is refitted.
clf_vXGB_exp = GridSearchCV(
    estimator=full_parallel_pipe_opta_notime,
    param_grid=param_grid_vXGB_exp,
    scoring=scoring,
    refit='balanced_accuracy',
    cv=5,
    n_jobs=None,
)
clf_vXGB_exp.fit(X_train_notime, y_train)

# Report the mean cross-validated score of the selected configuration.
# NOTE(review): the score formerly recorded in a comment here (0.7623721106479727)
# appears verbatim in the sibling GB/XGB cells, so it is probably a copy-paste
# leftover; re-record it after running this cell.
print('Score of best estimator of clf_vXGB_exp:', clf_vXGB_exp.best_score_)

# Export the full cross-validation table to Excel ...
df_results_vXGB_exp = pd.DataFrame(data=clf_vXGB_exp.cv_results_)
df_results_vXGB_exp.to_excel(
    r'GridSearchCV_results/HF_case_fullpaper_winsorization/df_results_vXGB_exp_notime.xlsx',
    index=False,
)
# ... and persist the fitted search object so the search need not be repeated.
joblib.dump(
    clf_vXGB_exp,
    r'GridSearchCV_results/HF_case_fullpaper_winsorization/clf_vXGB_exp_notime.pkl',
    compress=1,
)

#%%
#Obtaining classification with test set
# The former bare `clf_vXGB_exp.refit` statement was removed: it only read an
# attribute and discarded the value. GridSearchCV was built with
# refit='balanced_accuracy', so the best estimator is already refitted by .fit().
y_pred_vXGB_exp = clf_vXGB_exp.predict(X_test_notime)
# ROC AUC needs continuous scores. With hard 0/1 predictions the curve has a single
# operating point and the value merely duplicates balanced accuracy.
# Column 1 is the probability of label 1 (classes are sorted; labels assumed to be {0, 1},
# consistent with pos_label=0 being used for specificity below).
y_score_vXGB_exp = clf_vXGB_exp.predict_proba(X_test_notime)[:, 1]

# One-row summary of the held-out metrics; each value is wrapped in a list so the
# dict maps directly onto a single-row DataFrame.
test_results_XGB={'clf':['clf_vXGB_exp'],
                 'params':[clf_vXGB_exp.best_params_],
                 'accuracy_test':[accuracy_score(y_test, y_pred_vXGB_exp)],
                 'balanced_accuracy_test':[balanced_accuracy_score(y_test, y_pred_vXGB_exp)],
                 'f1_test':[f1_score(y_test, y_pred_vXGB_exp)],
                 'precision_test':[precision_score(y_test, y_pred_vXGB_exp)],
                 'recall_test':[recall_score(y_test, y_pred_vXGB_exp)],
                 # specificity = recall of the negative class
                 'specificity_test':[recall_score(y_test, y_pred_vXGB_exp,pos_label=0)],
                 'roc_auc_test':[roc_auc_score(y_test, y_score_vXGB_exp)]
    }
#%%
# Persist the test metrics as a one-row spreadsheet.
# File name corrected from 'est_results_...' to 'test_results_...' to match the
# naming used for the Gradient Boosting results.
test_results_XGB_paper=pd.DataFrame(data=test_results_XGB)
test_results_XGB_paper.to_excel(r'GridSearchCV_results/HF_case_fullpaper_winsorization/test_results_XGB_paper_notime.xlsx',index=False)