In [None]:
# standard library
import json
import os
from math import sqrt

# third-party
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel, f_classif

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, \
    matthews_corrcoef, mean_squared_error, r2_score, roc_auc_score, roc_curve, auc

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# project-local
from data_preprocessing import encode_categorical_features
# from sampling import create_train_test_sets, create_stratified_kfolds

In [None]:
# Notebook parameters: every value a reader may want to tune lives in this cell.
target_col = 'HeartDiseaseorAttack'  # label column of the dataset
input_dataset_path = 'data/heart_disease_health_indicators_BRFSS2015.csv'  # CSV loaded with pd.read_csv
k_best_features = 10  # `k` passed to SelectKBest during feature selection
n_splits = 5  # number of StratifiedKFold folds

In [None]:
def calculate_metrics(y_pred: np.ndarray, y_test: pd.Series) -> dict:
    """Compute binary-classification quality metrics for one set of predictions.

    Args:
        y_pred: Labels predicted by the model for the test rows.
        y_test: Expected (ground-truth) labels of the same test rows.

    Returns:
        A dict mapping a human-readable metric name to its value: the four
        confusion-matrix counts, precision, sensitivity (recall), specificity,
        accuracy, F1, RMSE, R squared, MCC and ROC AUC.
    """
    tn, fp, fn, tp = calculate_test_results_from_confusion_matrix(y_test, y_pred)

    model_precision = precision_score(y_test, y_pred)
    model_recall = recall_score(y_test, y_pred)  # sensitivity / true positive rate
    model_specificity = specificity_score(tn, fp)  # true negative rate
    model_acc = accuracy_score(y_test, y_pred)

    model_f1_score = f1_score(y_test, y_pred)
    model_mcc = matthews_corrcoef(y_test, y_pred)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    model_r2 = r2_score(y_test, y_pred)
    # NOTE: when y_pred holds hard class labels (as returned by `predict`) this is a
    # single-threshold AUC; pass scores/probabilities to obtain a full ROC AUC.
    roc_auc = roc_auc_score(y_test, y_pred)

    model_scores = {
        'True Negative': tn,
        'False Positive': fp,
        'False Negative': fn,
        'True Positive': tp,
        'Precision (PPV)': model_precision,
        # Previously this key was filled with the specificity value by mistake.
        'Sensitivity (TPR, Recall)': model_recall,
        'Specificity (TNR)': model_specificity,
        'Accuracy': model_acc,
        'F1 Score': model_f1_score,
        'RMSE': rmse,
        'R Squared': model_r2,
        'Matthews Correlation Coefficient (MCC)': model_mcc,
        'ROC AUC score': roc_auc
    }

    return model_scores

def calculate_test_results_from_confusion_matrix(y_test: pd.Series, y_pred: np.ndarray):
    """Build the binary confusion matrix and return its cells as (tn, fp, fn, tp).

    Args:
        y_test: Expected (ground-truth) labels.
        y_pred: Labels predicted by the model.

    Returns:
        Tuple ``(tn, fp, fn, tp)`` taken from the flattened 2x2 confusion matrix.
    """
    # Pin the label set so the matrix is always 2x2. Without `labels`, inputs that
    # contain a single class produce a 1x1 matrix and the 4-way unpacking below fails.
    # The other metrics in this notebook already treat 1 as the positive label.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = conf_matrix.ravel()

    return tn, fp, fn, tp

def specificity_score(tn: float, fp: float) -> float:
    """Return the specificity (true negative rate): tn / (tn + fp).

    Args:
        tn: Number of true negatives.
        fp: Number of false positives.

    Returns:
        The fraction of actual negatives that were classified as negative, or
        NaN when there are no actual negatives (tn + fp == 0), in which case the
        rate is undefined.
    """
    negatives = tn + fp
    if negatives == 0:
        # Python ints would raise ZeroDivisionError here while NumPy ints yield NaN
        # with a warning; return NaN explicitly so both behave the same.
        return float('nan')
    return tn / negatives

### Loading dataset

In [None]:
# Read the raw CSV and cast the label column to integers in one chained expression
heart_df = pd.read_csv(input_dataset_path).astype({target_col: int})
heart_df.head()

In [None]:
# General dataset descriptors
n_rows, n_cols = heart_df.shape
# Count feature columns from the column index instead of copying the whole frame
n_features = len(heart_df.columns.drop(target_col))
print(f"Input dataset has {n_rows} rows and {n_cols} columns")
print(f"Input dataset consists of {n_features} features and 1 target column")

print(f"Target values are: {heart_df[target_col].unique()}")
# duplicated() scans the full frame, so evaluate it once and derive both counts from it
n_duplicated = int(heart_df.duplicated().sum())
print(f"Input dataset contains {n_duplicated} duplicated rows and {n_rows - n_duplicated} unique rows")

In [None]:
# Print pandas' concise summary of the loaded frame (columns, dtypes, non-null counts)
heart_df.info()

### Data preprocessing

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each
heart_df = heart_df.drop_duplicates()

In [None]:
# Encode categorical features with the project helper from `data_preprocessing`.
# The original note says it uses LabelEncoder and one-hot encoding; the helper's
# implementation is not visible here — TODO confirm, including how it treats `target_col`.
heart_df = encode_categorical_features(heart_df, target_col)

In [None]:
# Split the heart disease dataset into the target vector and the feature matrix
y = heart_df[target_col]
x = heart_df.drop(target_col, axis=1)

### Data stratified sampling

In [None]:
# Per-fold model performance results, keyed by fold number
folds_results = {}

# Some models are sensitive to differences in feature value scale, so a
# StandardScaler instance is created here to rescale the features before training
scaler = StandardScaler()

# Stratified, shuffled split into `n_splits` folds with a fixed seed for reproducibility
k_fold = StratifiedKFold(random_state=42, shuffle=True, n_splits=n_splits)

In [None]:
# Candidate classifiers, keyed by the display name used in the results files.
# Seeds are fixed where the estimator accepts one so runs are repeatable.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=11, criterion='entropy'),
    'Pruning Decision Tree': DecisionTreeClassifier(random_state=11, ccp_alpha=0.02, criterion='entropy'),
    'Logistic Regression': LogisticRegression(random_state=0, C=10, penalty='l2'),
    # NOTE(review): kernel SVC training time grows steeply with the number of rows;
    # consider LinearSVC if this model dominates the runtime — confirm before changing.
    'Support Vector Machine': SVC(kernel='linear', C=0.3),
    'K Nearest Neighbours': KNeighborsClassifier(leaf_size=1, n_neighbors=3),
    # Key was misspelled as 'Gaussina Naive Bayes', which leaked into every results file
    'Gaussian Naive Bayes': GaussianNB(),
    'XGBoost Classifier': XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
    # ,'CatBoost Classifier': CatBoostClassifier(verbose=False)
}

In [None]:
# for fold_num, (train_idx, test_idx) in enumerate(k_fold.split(x, y), 1):
#     train_data = heart_df.iloc[train_idx]
#     test_data = heart_df.iloc[test_idx]

#     # Standardize features using StandardScaler
#     X_train_scaled = scaler.fit_transform(train_data.drop(target_col, axis=1))
#     X_test_scaled = scaler.transform(test_data.drop(target_col, axis=1))

#     # Feature Selection (e.g., using SelectKBest with ANOVA F-statistic)
#     # Replace 'k' with the number of features you want to select
#     selector = SelectKBest(score_func=f_classif, k=k_best_features)
#     X_train_selected = selector.fit_transform(X_train_scaled, train_data[target_col])
#     X_test_selected = selector.transform(X_test_scaled)

#     models_results_dict = {}
#     # Model Building
#     for model_name, model in models.items():
#         # train model
#         model.fit(X_train_selected, train_data[target_col])

#         # Model Evaluation (e.g., accuracy score)
#         y_pred = model.predict(X_test_selected)
#         accuracy = accuracy_score(test_data[target_col], y_pred)
#         models_results_dict[model_name] = accuracy

#     folds_results[fold_num] = models_results_dict

In [None]:
# NOTE: `folds_results` is filled by the accuracy loop in the previous cell, which is
# currently commented out; on a fresh "Run All" the dict is still empty at this point.
# Columns are fold numbers; the index holds the model names (inner dict keys).
df = pd.DataFrame.from_dict(folds_results)
# Keep the index when exporting: with index=False the model names were dropped and
# the rows of the CSV could no longer be attributed to a model.
df.to_csv('accuracy_results.csv', index=True, index_label='model', header=True)

In [None]:
# Dump the nested results (fold number -> model name -> score) as JSON.
# `json` is imported with the other modules in the first cell of the notebook.
# JSON object keys are always strings, so the integer fold numbers are written as "1", "2", ...
with open('accuracy_results.json', 'w', encoding='utf-8') as f:
    json.dump(folds_results, f)

In [None]:
# Create the output directory once, before the loop; exist_ok makes the call
# idempotent and removes the separate check-then-create step.
os.makedirs('folds', exist_ok=True)

for fold_num, (train_idx, test_idx) in enumerate(k_fold.split(x, y), 1):
    # The split yields positional indices, hence iloc
    train_data = heart_df.iloc[train_idx]
    test_data = heart_df.iloc[test_idx]

    # Save each fold as a CSV file
    train_data.to_csv(f'folds/fold_{fold_num}_train.csv', index=False)
    test_data.to_csv(f'folds/fold_{fold_num}_test.csv', index=False)

    # Standardize features: the scaler is fitted on the training part only and
    # then applied unchanged to the test part
    X_train_scaled = scaler.fit_transform(train_data.drop(target_col, axis=1))
    X_test_scaled = scaler.transform(test_data.drop(target_col, axis=1))

    # Feature selection with SelectKBest (ANOVA F-statistic), also fitted on the
    # training part only; `k_best_features` controls how many features are kept
    selector = SelectKBest(score_func=f_classif, k=k_best_features)
    X_train_selected = selector.fit_transform(X_train_scaled, train_data[target_col])
    X_test_selected = selector.transform(X_test_scaled)

    models_results_dict = {}
    # Train and evaluate every candidate model on this fold
    for model_name, model in models.items():
        model.fit(X_train_selected, train_data[target_col])

        y_pred = model.predict(X_test_selected)
        model_scores = calculate_metrics(y_test=test_data[target_col], y_pred=y_pred)
        models_results_dict[model_name] = model_scores

    folds_results[fold_num] = models_results_dict

# Columns are fold numbers, the index holds the model names and every cell is the
# metrics dict of that model on that fold.
df_2 = pd.DataFrame.from_dict(folds_results)
# Keep the index when exporting: with index=False the model names were dropped and
# the rows of the CSV could no longer be attributed to a model.
df_2.to_csv('model_evaluation_results.csv', index=True, index_label='model', header=True)

In [None]:
# Display the nested results mapping: fold number -> model name -> metric scores
folds_results