In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, mutual_info_classif, VarianceThreshold, SelectFromModel
from skfeature.function.similarity_based import fisher_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

from summarytools import dfSummary
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.pipeline import Pipeline as Pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

### Notebook parameters

In [None]:
# Notebook parameters: every value a reader may want to tune lives in this cell.

# --- Input data --------------------------------------------------------------
# CSV file to load, its field delimiter, and the name of the label column.
input_dataset_path = "data/heart_disease_health_indicators_BRFSS2015.csv"
separator = ','
target_col = "HeartDiseaseorAttack"

# --- Cross-validation / feature-selection knobs ------------------------------
# NOTE(review): none of these three is read by the cells visible in this
# notebook - confirm whether they are still needed.
generate_new_folds, n_splits, k_best_features = False, 5, 5

# --- Class-imbalance handling ------------------------------------------------
# fix_imbalanced_dataset switches resampling on; the two use_* flags pick the
# sampler (oversampling is checked first by the cell that builds the sampler).
fix_imbalanced_dataset = True
use_oversampling, use_undersampling = False, True

### 1. Load dataset

In [None]:
# Read the raw CSV using the path and delimiter from the parameters cell.
heart_df = pd.read_csv(input_dataset_path, sep=separator)

# Remove any column literally named "id" (case-insensitive).
id_columns = [name for name in heart_df.columns if name.lower() == 'id']
heart_df = heart_df.drop(columns=id_columns)

# When every distinct label value is numeric, store the label column as int.
numeric_types = (int, float, np.int32, np.int64)
distinct_targets = heart_df[target_col].unique()
if all(isinstance(value, numeric_types) for value in distinct_targets):
    heart_df[target_col] = heart_df[target_col].astype(int)

heart_df.head()

#### 1.1 Check if there are NaN values present in the dataset

In [None]:
# Stop the notebook early if any cell of the frame is missing.
assert not heart_df.isna().to_numpy().any(), 'Dataset contains NaN values!'

#### 1.2 Get information about dataset shape and target values

In [None]:
# Basic facts about the loaded frame: size, label values and duplicate rows.
n_rows, n_cols = heart_df.shape
feature_count = heart_df.drop(columns=[target_col]).shape[1]
print(f"Input dataset has {n_rows} rows and {n_cols} colums")
print(f"Input dataset consists of {feature_count} features and 1 target column")

distinct_targets = heart_df[target_col].unique()
print(f"Target values are: {distinct_targets}")
print(f"Number of classes in target: {len(distinct_targets)}")

# duplicated() marks every repeat of an earlier row; the rest count as unique.
is_duplicate = heart_df.duplicated()
print(f"Input dataset contains {is_duplicate.sum()} duplicated rows and {(~is_duplicate).sum()} unique rows")

#### 1.3 Check if the target is balanced - the target value distribution

In [None]:
# Take wedge sizes and wedge labels from the same value_counts() Series.
# value_counts() is ordered by frequency while unique() is ordered by first
# appearance, so mixing the two can attach a label to the wrong wedge.
target_counts = heart_df[target_col].value_counts()
plt.pie(x=target_counts.values, labels=target_counts.index.tolist(), autopct='%1.2f%%')
plt.title('Target Distribution')
plt.show()

#### 1.4 Print features correlation matrix

In [None]:
# Pairwise correlation of every column (target included), drawn as an
# annotated heatmap on an explicitly created axes.
corr_matrix = heart_df.corr()
fig, ax = plt.subplots(figsize=(10, 7))
heatmap_options = {'annot': True, 'fmt': ".2f", 'linewidths': 0.5}
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)

In [None]:
# Print pandas' built-in summary of the frame.
heart_df.info()

In [None]:
# Display the summarytools overview of the frame (the cell's output is the
# object returned by dfSummary).
dfSummary(heart_df)

In [None]:
# Countplot on each column, laid out five plots per row.
# The grid has only as many rows as are needed (the previous layout reserved
# one row per column and left most of the figure blank), and the figure-level
# title and layout pass are applied once instead of on every iteration.
n_plot_cols = 5
n_plot_rows = max(1, int(np.ceil(len(heart_df.columns) / n_plot_cols)))
fig = plt.figure(figsize=(20, 4 * n_plot_rows))
for i, column in enumerate(heart_df.columns):
    ax = fig.add_subplot(n_plot_rows, n_plot_cols, i + 1)
    sns.countplot(data=heart_df, x=column, ax=ax)
    ax.set_title(f"{column}")
fig.suptitle("Plot Value Count", fontsize=20, x=0.5, y=1)
fig.tight_layout()

In [None]:
# Column names grouped by kind.
# NOTE(review): the grouping is presumably binary / ordinal-categorical /
# numeric - confirm against the data. None of the three lists is read by the
# cells visible in this notebook.
bin_features = [
    'HighBP', 'HighChol', 'CholCheck',
    'Smoker', 'Stroke',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump',
    'AnyHealthcare', 'NoDocbcCost',
    'DiffWalk', 'Sex',
]
cat_features = [
    'Diabetes', 'GenHlth',
    'Education', 'Income',
]
num_features = [
    'BMI',
    'MentHlth', 'PhysHlth',
    'Age',
]

In [None]:
# Box plot of every column on one axes.
print(heart_df.shape)
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=heart_df, ax=ax)
ax.set_title("data distribution")
# seaborn already labels one tick per plotted column; only rotate those labels
# instead of re-creating them from a hard-coded count of 22 columns, which
# breaks as soon as the frame has a different number of columns.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

### 2. Preprocess dataset

#### 2.1 Drop duplicated rows

In [None]:
# Keep only the first occurrence of each fully identical row.
# The original index labels are kept (no reset_index), so later cells must
# index positionally.
heart_df = heart_df.drop_duplicates()

#### 2.2 Encode columns that have categorical data with LabelEncoder

In [None]:
# Fitted encoder per encoded column, kept so the mapping can be inverted later.
label_encoders = {}

for column in heart_df.columns:
    # Columns that are not stored with the object dtype are left as they are.
    if heart_df[column].dtype != 'object':
        continue
    encoder = LabelEncoder()
    heart_df[column] = encoder.fit_transform(heart_df[column])
    label_encoders[column] = encoder
    print(f'{column} was encoded')

In [None]:
# # Decode the encoded data
# decoded_heart_df = pd.DataFrame()
# for column in heart_df.columns:
#     if column in label_encoders:
#         decoded_values = label_encoders[column].inverse_transform(heart_df[column])
#         decoded_heart_df[column] = decoded_values
#     else:
#         decoded_heart_df[column] = heart_df[column]

#### 2.3 Separate features from target

In [None]:
# Split the frame column-wise: y is the label column, X is every other column.
y = heart_df[target_col]
X = heart_df.loc[:, heart_df.columns != target_col]

### 3. Divide dataset into stratified training and testing set (20%)

In [None]:
# Cast all features to float32 before splitting.
X = X.astype(np.float32)

# 80/20 split with a fixed seed; stratify=y keeps the class proportions of y
# in both parts.
split_options = dict(test_size=0.2, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### 4. Detect important features for feature selection

#### 4.1 Removal of all features with low variance (threshold = 0.005)

In [None]:
# Only the fitted selector is needed here (the transformed matrix is not
# used), so fit it and read back the names of the columns it keeps.
variance_cutoff = 0.005
vthresh = VarianceThreshold(threshold=variance_cutoff).fit(X_train)
selected_features_vth = vthresh.get_feature_names_out()
print(f'Variance Threshold (0.005) selected features are: {selected_features_vth}')

#### 4.2 Chi2 test to define important features with alpha = 0.05

In [None]:
# Chi-squared test of each training feature against the label; a feature is
# kept when its p-value is below alpha.
alpha = 0.05
f_score, raw_p_values = chi2(X_train, y_train)
p_values = pd.Series(raw_p_values, index=X_train.columns).sort_values(ascending=False)
selected_features_chi2 = p_values.loc[p_values < alpha].index.tolist()
print(f'Chi2 test selected features are: {selected_features_chi2}')

#### 4.3 Information gain - estimate mutual information with threshold 0.01

In [None]:
# Estimate the mutual information between each training feature and the label.
# The estimator is randomized, so a fixed random_state is passed; without it
# the scores - and therefore the features passing the threshold in the next
# cell - change from run to run.
importances = mutual_info_classif(X_train, y_train, random_state=1)
feat_importances = pd.Series(importances, index=X_train.columns).sort_values()

ax = feat_importances.plot(kind='barh', color='teal')
ax.set(title='Mutual information with target', xlabel='Estimated mutual information')
plt.show()

In [None]:
# Keep the features whose estimated mutual information exceeds the threshold.
threshold = 0.01
passes_threshold = feat_importances.gt(threshold).to_numpy()
selected_features_mi = feat_importances.index[passes_threshold].tolist()
print(f'Mutual infromation estimation selected features are: {selected_features_mi}')

#### 4.4 Correlation based feature selection with threshold 0.8

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns strongly correlated with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise correlation matrix is computed with ``.corr()``.
    threshold : float
        A column is reported when the absolute correlation with at least one
        column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the reported columns. In each correlated pair only the later
        column is reported, so the first column is never included.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    return {
        names[row]
        for row in range(len(names))
        # only the entries left of the diagonal, i.e. earlier columns
        if any(abs(corr_matrix.iloc[row, col]) > threshold for col in range(row))
    }

# Columns flagged as redundant at |r| > 0.8; every other training column is
# kept, in X_train's column order.
high_corr_features = correlation(X_train, 0.8)
selected_features_corr = []
for name in X_train.columns:
    if name not in high_corr_features:
        selected_features_corr.append(name)
print(f'Pearson correlation selected features are: {selected_features_corr}')

#### 4.5 Mean Absolute Difference (MAD) feature selection - works better for continuous / numerical data 

In [None]:
# mean_abs_diff = np.sum(np.abs(X_train - np.mean(X_train, axis=0)), axis=0) / X_train.shape[0]
# mad_feat_importances = pd.Series(mean_abs_diff, X_train.columns).sort_values()
# mad_feat_importances.plot(kind='barh', color='teal')
# plt.show()

In [None]:
# threshold = 0.5
# selected_features_mad = list(mad_feat_importances[mad_feat_importances > threshold].index)
# print(f'Mean Absolute Difference selected features are: {selected_features_mad}')

#### 4.6 LASSO Regularization (L1) feature selection embedded method

In [None]:
# Fit an L1-penalised linear SVM, then let SelectFromModel pick features from
# the already fitted model (prefit=True).
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)
lsvc.fit(X_train, y_train)
sfm = SelectFromModel(lsvc, prefit=True)

# Boolean mask over X_train's columns marking the selected features.
support_mask = sfm.get_support()
X_train_new = X_train.loc[:, support_mask]
selected_features_l1 = X_train_new.columns
print(f'Lasso L1 Regularization selected features are: {selected_features_l1}')

#### 4.7 Important features that were selected across all methods

In [None]:
# Features kept by every selection method above.
vrh_sf = set(selected_features_vth)
chi2_sf = set(selected_features_chi2)
mi_sf = set(selected_features_mi)
corr_sf = set(selected_features_corr)
l1_sf = set(selected_features_l1)

common_in_all = vrh_sf & chi2_sf & mi_sf & corr_sf & l1_sf

# Order the result by X_train's columns. Converting a set of strings straight
# to a list gives an order that can change between kernel restarts (string
# hashing is randomized), which would change the column order fed to the
# models and make runs harder to reproduce.
common_features = [name for name in X_train.columns if name in common_in_all]
print(f'Across all methods there were {len(common_features)} selected. These are: {common_features}')

### 5. Modeling

#### 5.1 Initialize base classifiers

In [None]:
# Base estimators; the numeric suffix matches the pipeN / param_gridN names
# used in the following cells.

# Linear and neighbour/centroid based models
clf1 = LogisticRegression(multi_class='multinomial', solver='newton-cg', random_state=1)
clf2 = KNeighborsClassifier(algorithm='ball_tree', leaf_size=50)
clf4 = SVC(kernel="linear", C=0.3, random_state=1)
clf8 = NearestCentroid(metric='euclidean', shrink_threshold=0.5)

# Tree based models
clf3 = DecisionTreeClassifier(random_state=1)
clf5 = RandomForestClassifier(random_state=1)
clf6 = XGBClassifier(objective='binary:logistic', nthread=4, seed=42)

# Naive Bayes
clf7 = GaussianNB()

In [None]:
# Choose the resampler from the notebook parameters.
# Oversampling takes precedence when both use_* flags are set. `sampler` is
# always bound after this cell: None when balancing is disabled, and an
# explicit error when balancing is requested without choosing a strategy
# (previously this surfaced later as a NameError while building pipelines).
if not fix_imbalanced_dataset:
    sampler = None
elif use_oversampling:
    sampler = SMOTE(random_state=42)
elif use_undersampling:
    sampler = RandomUnderSampler(random_state=42)
else:
    raise ValueError(
        "fix_imbalanced_dataset is True but neither use_oversampling nor "
        "use_undersampling is enabled"
    )

#### 5.2 Build pipelines with respective models

In [None]:
def _build_pipeline(step_name, classifier, scale):
    """Assemble one modelling pipeline.

    Parameters
    ----------
    step_name : str
        Name of the final step; the parameter grids address the estimator
        through it (e.g. ``'LR'`` -> ``'LR__C'``).
    classifier : estimator
        The estimator placed in the final step.
    scale : bool
        Whether a ``StandardScaler`` step precedes the estimator.

    Returns
    -------
    An imbalanced-learn pipeline starting with the shared ``sampler`` when
    ``fix_imbalanced_dataset`` is True, otherwise a plain scikit-learn
    pipeline without a sampler step.
    """
    steps = []
    if fix_imbalanced_dataset:
        steps.append(('sampler', sampler))
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append((step_name, classifier))
    pipeline_cls = Pipeline_imb if fix_imbalanced_dataset else Pipeline
    return pipeline_cls(steps)


# All eight pipelines are defined in both modes. Previously pipe3, pipe5 and
# pipe6 only existed when resampling was enabled, so the grid-search cell
# raised NameError with fix_imbalanced_dataset = False.
# DT, RF and XGB get no scaling step, as before.
pipe1 = _build_pipeline('LR', clf1, scale=True)
pipe2 = _build_pipeline('KNN', clf2, scale=True)
pipe3 = _build_pipeline('DT', clf3, scale=False)
pipe4 = _build_pipeline('SVM', clf4, scale=True)
pipe5 = _build_pipeline('RF', clf5, scale=False)
pipe6 = _build_pipeline('XGB', clf6, scale=False)
pipe7 = _build_pipeline('GNB', clf7, scale=True)
pipe8 = _build_pipeline('NC', clf8, scale=True)

#### 5.3 Set up parameter grids for GridSearchCV hyperparameter tuning

In [None]:
# Hyper-parameter grids, one per pipeline. Keys use the "<step name>__<param>"
# form so GridSearchCV can route each value to the final pipeline step.

# Logistic Regression parameters
# Only 'l2' is searched: clf1 is configured with solver='newton-cg', which
# does not support the 'l1' penalty, so every 'l1' candidate would fail to fit
# and merely add failed fits to the search.
param_grid1 = [{'LR__penalty': ['l2'],
                'LR__C': np.power(10., np.arange(-4, 4))}]

# KNN parameters
param_grid2 = [{'KNN__n_neighbors': list(range(1, 10)),
                'KNN__p': [1, 2]}]

# Decision Trees parameters
param_grid3 = [{'DT__max_depth': list(range(1, 10)) + [None],
                'DT__criterion': ['gini', 'entropy']}]

# SVM parameters (gamma is only searched for the RBF kernel)
param_grid4 = [{'SVM__kernel': ['rbf'],
                'SVM__C': np.power(10., np.arange(-4, 4)),
                'SVM__gamma': np.power(10., np.arange(-5, 0))},
               {'SVM__kernel': ['linear'],
                'SVM__C': np.power(10., np.arange(-4, 4))}]

# Random Forest parameters
param_grid5 = [{'RF__n_estimators': [10, 100, 500, 1000, 10000]}]

# XGBoost parameters
param_grid6 = [{'XGB__max_depth': range (2, 10, 1),
               'XGB__n_estimators': range(60, 220, 40),
               'XGB__learning_rate': [0.1, 0.01, 0.05]}]

# Gaussian Naive Bayes parameters
param_grid7 = [{'GNB__var_smoothing': np.logspace(0,-9, num=100)}]

# Nearest Centroid parameters
# NOTE(review): the range starts at 0.0 and is combined with both metrics -
# confirm the installed scikit-learn accepts every combination.
param_grid8 = [{'NC__shrink_threshold': np.arange(0, 1.01, 0.01),
                'NC__metric': ['euclidean', 'manhattan']}]


In [None]:
# scoring = {
#     'accuracy' : make_scorer(accuracy_score), 
#     'precision' : make_scorer(precision_score),
#     'recall' : make_scorer(recall_score), 
#     'f1_score' : make_scorer(f1_score),
#     'mcc': make_scorer(matthews_corrcoef),
#     'roc_auc_score': make_scorer(roc_auc_score)
# }

#### 5.4. Set up multiple GridSearchCV objects, 1 for each algorithm

In [None]:
# Inner (hyper-parameter tuning) loop of the nested cross-validation.
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

# Algorithms taking part in the comparison: name -> (pipeline, parameter grid).
candidates = {
    'DT': (pipe3, param_grid3),
    'GNB': (pipe7, param_grid7),
    'NC': (pipe8, param_grid8),
    'RF': (pipe5, param_grid5),
    'LR': (pipe1, param_grid1),
    'XGB': (pipe6, param_grid6),
}

# One F1-scored grid search per algorithm; refit=True makes each search retrain
# its best configuration once the search is done.
gridcvs = {
    name: GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='f1',
                       n_jobs=-1,
                       cv=inner_cv,
                       verbose=0,
                       refit=True)
    for name, (est, pgrid) in candidates.items()
}

#### 5.5 Train models and find best parameters

In [None]:
# Nested cross-validation: for every algorithm the outer loop holds out one
# fold, the inner GridSearchCV tunes hyper-parameters on the remaining folds,
# and the refitted best model is scored on the held-out fold.

# The splitter has a fixed seed, so creating it once still gives every
# algorithm the same outer folds. It stays defined at notebook level because
# the model-selection cell reads it.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Only the features selected in section 4.7 are used for modelling.
X_train_selected = X_train[common_features]

for name in sorted(gridcvs):
    gs_est = gridcvs[name]

    print(50 * '-', '\n')
    print('Classification algorithm:', name)
    print('    Inner loop:')

    outer_scores = []

    for train_idx, valid_idx in outer_cv.split(X_train_selected, y_train):

        # run inner loop hyperparam tuning
        gs_est.fit(X_train_selected.iloc[train_idx], y_train.iloc[train_idx])
        print('\n        Best F1 score (avg. of inner test folds) %.4f' % (gs_est.best_score_))
        print('        Best parameters:', gs_est.best_params_)

        # Performance on the held-out fold. GridSearchCV.score applies the
        # scorer configured on the search, whereas best_estimator_.score would
        # return the classifier's default accuracy and contradict the
        # "F1 score" labels printed here.
        outer_scores.append(gs_est.score(X_train_selected.iloc[valid_idx], y_train.iloc[valid_idx]))
        print('        F1 score (on outer test fold) %.4f' % (outer_scores[-1]))

    print('\n    Outer Loop:')
    print('        F1 score %.4f +/- %.4f' % (np.mean(outer_scores), np.std(outer_scores)))

### 6. Best model training

In [None]:
# Final model selection for the algorithm chosen from the nested-CV comparison
# (the XGBoost pipeline, pipe6): tune it on the whole training set.

# Build the 5-fold splitter here, with the same settings as the nested-CV cell,
# so this cell does not depend on a variable that only exists after that
# cell's loop has run.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

gcv_model_select = GridSearchCV(estimator=pipe6,
                                param_grid=param_grid6,
                                scoring='f1',
                                n_jobs=-1,
                                cv=outer_cv,
                                verbose=1,
                                refit=True)

gcv_model_select.fit(X_train[common_features], y_train)
print('Best CV F1 score: %.4f' % (gcv_model_select.best_score_))
print('Best parameters:', gcv_model_select.best_params_)

### 7. Best model evaluation

In [None]:
## No extra fit is needed here: the search was created with refit=True, so
## scikit-learn has already refitted the best configuration on the whole
## training set.


def _classification_scores(model, features, labels):
    """Compute the evaluation metrics of ``model`` on one dataset.

    Predictions are computed once and shared by all label-based metrics.
    ROC AUC is computed from the predicted probability of the positive class
    (column 1 of ``predict_proba``); feeding it hard 0/1 predictions collapses
    the ROC curve to a single operating point.

    Returns a dict mapping metric name to value, in display order.
    """
    predicted = model.predict(features)
    positive_proba = model.predict_proba(features)[:, 1]
    return {
        'ACC': accuracy_score(y_true=labels, y_pred=predicted),
        'Recall': recall_score(y_true=labels, y_pred=predicted),
        'Precision': precision_score(y_true=labels, y_pred=predicted),
        'F1 score': f1_score(y_true=labels, y_pred=predicted),
        'MCC': matthews_corrcoef(y_true=labels, y_pred=predicted),
        'ROC AUC score': roc_auc_score(y_true=labels, y_score=positive_proba),
    }


def _scores_frame(scores, dataset_label):
    """Turn a metric dict into a long-format frame tagged with its dataset."""
    frame = pd.DataFrame(list(scores.items()), columns=['Metric name', 'Metric value'])
    frame['Dataset'] = dataset_label
    return frame


train_scores = _classification_scores(gcv_model_select, X_train[common_features], y_train)
test_scores = _classification_scores(gcv_model_select, X_test[common_features], y_test)

train_scores_df = _scores_frame(train_scores, 'training')
test_scores_df = _scores_frame(test_scores, 'testing')

# Training rows first, then testing rows.
model_scores_df = pd.concat([train_scores_df, test_scores_df], axis=0)

In [None]:
# Display the combined training/testing metric table as the cell output.
model_scores_df

### 8. Save trained best model into pickle

In [None]:
import pickle
import datetime
from pathlib import Path

# Timestamp shared by the model file and the results file written below.
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

# open(..., 'wb') does not create missing folders, so make sure the output
# directory exists before writing into it.
Path('models').mkdir(parents=True, exist_ok=True)

# NOTE(review): the "_randomUndersampling" suffix is fixed text; it does not
# follow the use_oversampling / fix_imbalanced_dataset parameters.
with open(f'models/xgb_best_model_{timestamp}_randomUndersampling.pickle', 'wb') as file:
    pickle.dump(gcv_model_select, file)

In [None]:
# openpyxl is not referenced by name below.
# NOTE(review): presumably imported so a missing Excel writer backend fails
# here, before the export is attempted - confirm.
import openpyxl

# Write the metric table next to the pickled model, reusing its timestamp.
results_path = f'models/xgb_best_model_results_{timestamp}_randomUndersampling.xlsx'
model_scores_df.to_excel(results_path, index=False, header=True)