In [1]:
from __future__ import print_function
from packaging.version import parse as Version
from platform import python_version

# ANSI-coloured status tags: green background for success, red for failure.
OK = '\x1b[42m[ OK ]\x1b[0m'
FAIL = '\x1b[41m[FAIL]\x1b[0m'

# importlib is used further down to import each dependency by its name.
try:
    import importlib
except ImportError:
    # Report the interpreter through python_version(), which is imported at
    # the top of this cell. `sys` is never imported here, so formatting
    # `sys.version` would raise NameError instead of printing the message.
    print(FAIL, "Python version 3.11 is required,"
                " but %s is installed." % python_version())

def import_version(pkg, min_ver, fail_msg=""):
    """Import ``pkg`` and print whether its version matches ``min_ver``.

    Despite the parameter name, the installed version must be *equal* to
    ``min_ver`` for the check to pass; any other version prints a FAIL line.

    Parameters
    ----------
    pkg : str
        Import name of the package (e.g. ``'sklearn'``).
    min_ver : str
        Version string the installed package is compared against.
    fail_msg : str, optional
        Extra text appended to the message printed when the import fails.

    Returns
    -------
    module or None
        The imported module, or ``None`` when the package cannot be imported.
    """
    mod = None
    try:
        mod = importlib.import_module(pkg)
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
        return mod

    # PIL is special-cased to read its version from `VERSION`.
    if pkg in {'PIL'}:
        ver = mod.VERSION
    else:
        ver = mod.__version__

    # Report with the `pkg` argument rather than a global loop variable, so
    # the function also works when called outside the requirements loop.
    if Version(ver) == Version(min_ver):
        print(OK, "%s version %s is installed."
              % (pkg, min_ver))
    else:
        print(FAIL, "%s version %s is required, but %s installed."
              % (pkg, min_ver, ver))
    return mod


# Check the interpreter first, then every pinned dependency.
pyversion = Version(python_version())

if pyversion < Version("3.11"):
    # Anything older than 3.11 is rejected outright.
    print(FAIL, "Python version 3.11 is required,"
                " but %s is installed." % pyversion)
elif pyversion >= Version("3.11.4"):
    print(OK, "Python version is %s" % pyversion)
else:
    # Versions from 3.11 up to (but not including) 3.11.4 land here.
    print(FAIL, "Unknown Python version: %s" % pyversion)

print()

# Pinned versions to check, keyed by import name.
requirements = {
    'numpy': "1.24.4",
    'matplotlib': "3.7.2",
    'sklearn': "1.3.0",
    'pandas': "2.0.3",
    'xgboost': "1.7.6",
    'shap': "0.42.1",
    'seaborn': "0.12.2",
}

# Import each dependency and report whether its version matches the pin.
for lib, required_version in requirements.items():
    import_version(lib, required_version)

[42m[ OK ][0m Python version is 3.11.4

[42m[ OK ][0m numpy version 1.24.4 is installed.
[42m[ OK ][0m matplotlib version 3.7.2 is installed.
[42m[ OK ][0m sklearn version 1.3.0 is installed.
[42m[ OK ][0m pandas version 2.0.3 is installed.
[42m[ OK ][0m xgboost version 1.7.6 is installed.


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


[42m[ OK ][0m shap version 0.42.1 is installed.
[42m[ OK ][0m seaborn version 0.12.2 is installed.


In [2]:
# Data Handling and Array Manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Preprocessing and Pipeline Tools
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Machine Learning Models
from sklearn.linear_model import LogisticRegression

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, fbeta_score, auc,
    precision_recall_curve, average_precision_score
)

# Miscellaneous
import warnings
#warnings.filterwarnings('ignore')
import pickle

In [10]:
# Load the cleaned CVD data; 'Heart_Disease' is the target column.
# (GridSearchCV and make_pipeline are already imported in the imports cell.)
df = pd.read_csv('data/CVD_cleaned.csv')
ftrs = df.columns

# Single seed used by the seeded call in this cell.
random_state = 56

X = df.drop(labels=['Heart_Disease'], axis=1)
y = df['Heart_Disease']

# Encode the target as 0/1 so that 'Yes' maps to 1.
label_mapping = {'No': 0, 'Yes': 1}
y = y.map(label_mapping)

# Stratified 1% subsample (test_size=0.99 sets the other 99% aside).
X_subset, _, y_subset, _ = train_test_split(
    X, y, test_size=0.99, stratify=y, random_state=random_state)

print("Full Set")
print('X:',X.shape)
print('y:',y.shape)

# Feature groups, one per encoding strategy.
cat_ftrs = ['Checkup','Exercise','Skin_Cancer','Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex','Smoking_History']
ordinal_ftrs = ['General_Health', 'Age_Category']
# Category order per ordinal feature, lowest to highest, in the same order as
# ordinal_ftrs.
ordinal_cats = [
    ['Poor','Fair','Good','Very Good','Excellent'],
    ['18-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79','80+'],
]
num_ftrs = ['Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption', 'Fruit_Consumption',
       'Green_Vegetables_Consumption', 'FriedPotato_Consumption']

# one-hot encoder (categories unseen during fit are encoded as all zeros)
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# ordinal encoder
ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories = ordinal_cats))])

# standard scaler
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs),
        ('ord', ordinal_transformer, ordinal_ftrs)])
# Not used by `prep` below; kept in case other cells rely on it.
final_scaler = StandardScaler()
prep = Pipeline(steps=[('preprocessor', preprocessor)])

Full Set
X: (308854, 18)
y: (308854,)


### baseline accuracy

In [4]:
# Look the class counts up by label so the result does not depend on the
# count-sorted order in which value_counts() returns them.
class_counts = df['Heart_Disease'].value_counts()
N, Y = int(class_counts['No']), int(class_counts['Yes'])
# Share of 'Yes' rows, i.e. the accuracy of always predicting 'Yes'.
baseline_accuracy = Y/(N+Y)
print('baseline accuracy score for the entire dataset is', baseline_accuracy)

baseline accuracy score for the entire dataset is 0.08085049894124732


### Logistic Regression

In [5]:
nr_states = 5
lg_scores = []
lg_models = []
lg_test_sets = []

# The search space is the same for every random state, so it is built once.
param_grid = {
    'logisticregression__l1_ratio': [0, 0.25, 0.5, 0.75, 1],  # 0 = pure L2, 1 = pure L1
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],  # inverse of regularization strength
    'logisticregression__class_weight': [None, 'balanced']  # Class imbalance
}

for i in range(nr_states):
    print('Random state', i + 1)
    seed = 56*i

    # Stratified 60/20/20 train/validation/test split
    X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, stratify=y, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=0.5, stratify=y_other, random_state=seed)
    print(X_test.shape)

    # The saga solver shuffles the data while fitting, so it is seeded to make
    # re-runs repeatable.
    pipe = make_pipeline(
        prep,
        LogisticRegression(solver='saga', max_iter=100000, penalty='elasticnet', random_state=seed))

    # Hyper-parameters are chosen by 3-fold CV inside the training split only;
    # the validation split is scored below but plays no part in the selection.
    grid = GridSearchCV(pipe, param_grid, cv=3, scoring='recall', verbose=1, n_jobs=-1)
    grid.fit(X_train, y_train)

    # Best model (refit by GridSearchCV on the data passed to fit)
    best_model = grid.best_estimator_
    lg_models.append(best_model)

    # Score the selected model on each split
    for X_save, y_save, dataset_name_save in [(X_train, y_train, 'train'), (X_val, y_val, 'validation'), (X_test, y_test, 'test')]:
        y_pred = best_model.predict(X_save)
        y_proba = best_model.predict_proba(X_save)[:, 1]  # probability of class 1
        precision, recall, _ = precision_recall_curve(y_save, y_proba)
        auc_pr = auc(recall, precision)
        score = {
            'dataset': dataset_name_save,
            'state': i + 1,
            'precision': precision_score(y_save, y_pred),
            'recall': recall_score(y_save, y_pred),
            'f2': fbeta_score(y_save, y_pred, beta=2),
            'auc-pr': auc_pr
        }
        lg_scores.append(score)

    # Save the test set for later use
    lg_test_sets.append({'X_test': X_test, 'y_test': y_test, 'state': i + 1})


Random state 1
(61771, 18)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Random state 2
(61771, 18)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Random state 3
(61771, 18)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Random state 4
(61771, 18)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Random state 5
(61771, 18)
Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [6]:
# Turn the per-split metric rows into a table and pull out recall by split.
lg_score = pd.DataFrame(lg_scores)
recall_by_split = {
    split: lg_score.loc[lg_score['dataset'] == split, 'recall']
    for split in ('train', 'validation', 'test')
}
lg_train_recall = recall_by_split['train']
lg_val_recall = recall_by_split['validation']
lg_test_recall = recall_by_split['test']

# Mean recall per split, plus the spread of the test recall across states.
test_recall_mean = np.mean(lg_test_recall)
test_recall_std = np.std(lg_test_recall)
print('train recall mean:',np.mean(lg_train_recall))
print('validation recall mean:',np.mean(lg_val_recall))
print('test recall mean:',test_recall_mean)
print('test recall standard deviation:',test_recall_std)
# Distance of the mean test recall from `baseline_accuracy`, in units of the
# test-recall standard deviation.
print(round((test_recall_mean-baseline_accuracy)/test_recall_std,4),'standard deviations above the baseline')

"""High regularization in models can sometimes lead to lower performance on the training set because the model is penalized for complexity and thus is more generalized. This can result in slightly better performance on unseen data (validation set)."""

train recall mean: 0.790842955349396
validation recall mean: 0.7904685622747296
test recall mean: 0.7916700040048057
test recall standard deviation: 0.00889265294437917
79.9333 standard deviations above the baseline


'High regularization in models can sometimes lead to lower performance on the training set because the model is penalized for complexity and thus is more generalized. This can result in slightly better performance on unseen data (validation set).'

In [7]:
# Show the test features stored for the second random state (index 1).
lg_test_sets[1]['X_test']

Unnamed: 0,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
32795,Very Good,Within the past year,Yes,No,No,No,No,No,Male,55-59,167.0,76.00,27.25,No,2.0,5.0,12.0,1.0
265627,Very Good,Within the past 2 years,No,No,No,No,No,No,Male,45-49,185.0,92.53,26.91,No,0.0,4.0,4.0,8.0
114129,Excellent,Within the past 5 years,Yes,No,No,No,No,No,Male,25-29,178.0,61.23,19.37,No,8.0,12.0,8.0,4.0
61434,Excellent,Within the past year,Yes,No,No,No,No,No,Male,55-59,180.0,90.72,27.89,Yes,4.0,8.0,16.0,12.0
230570,Excellent,Within the past year,Yes,No,No,No,No,No,Female,18-24,168.0,54.43,19.37,No,10.0,12.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75070,Very Good,Within the past year,Yes,No,No,No,No,No,Female,65-69,160.0,52.16,20.37,No,2.0,4.0,9.0,3.0
154028,Very Good,Within the past year,Yes,Yes,Yes,No,No,Yes,Female,80+,157.0,58.97,23.78,No,0.0,30.0,8.0,4.0
136888,Excellent,Within the past 2 years,Yes,No,No,No,No,No,Male,55-59,168.0,70.76,25.18,No,0.0,30.0,4.0,1.0
53478,Good,Within the past year,Yes,No,No,No,No,Yes,Female,65-69,155.0,77.11,32.12,No,0.0,30.0,8.0,4.0


### Save for later use

In [8]:
import pickle

# Bundle the scores, fitted models and saved test sets into one payload and
# write it to disk.
lg_results = {'scores': lg_scores, 'models': lg_models, 'test_sets': lg_test_sets}
with open('lg_results.pkl', 'wb') as out_file:
    pickle.dump(lg_results, out_file)

In [9]:
# Reload the saved results. Only unpickle files you trust: pickle.load can
# execute arbitrary code.
with open('lg_results.pkl', 'rb') as in_file:
    data = pickle.load(in_file)

lg_scores, lg_models, lg_test_sets = data['scores'], data['models'], data['test_sets']


## Correct Version: hyperparameters selected on the fixed validation split

In [18]:
from sklearn.model_selection import PredefinedSplit

nr_states = 5
lg_scores = []
lg_models = []
lg_test_sets = []

# The search space is the same for every random state, so it is built once.
param_grid = {
    'logisticregression__l1_ratio': [0, 0.25, 0.5, 0.75, 1],
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__class_weight': [None, 'balanced']
}

for i in range(nr_states):
    print('Random state', i + 1)
    seed = 56*i

    # Stratified 60/20/20 train/validation/test split
    X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, stratify=y, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=0.5, stratify=y_other, random_state=seed)

    # Stack training and validation rows into one frame. pd.concat keeps each
    # column's dtype, whereas np.concatenate on DataFrames collapses everything
    # to a single object-dtype array.
    X_combined = pd.concat([X_train, X_val], ignore_index=True)
    y_combined = np.concatenate([y_train, y_val])
    # -1 keeps a row in the training fold for every split; 0 puts it in the
    # single validation fold.
    split_index = [-1]*len(X_train) + [0]*len(X_val)
    fixedval = PredefinedSplit(test_fold=split_index)

    print(X_test.shape)

    # The saga solver shuffles the data while fitting, so it is seeded to make
    # re-runs repeatable.
    pipe = make_pipeline(
        prep,
        LogisticRegression(solver='saga', max_iter=100000, penalty='elasticnet', random_state=seed))

    # Each candidate is fit on the training rows and scored on the validation
    # rows (one predefined fold).
    # NOTE: with GridSearchCV's default refit=True the winning configuration is
    # refit on all of X_combined (train + validation), so the 'train' and
    # 'validation' scores recorded below are in-sample; only 'test' is held out.
    grid = GridSearchCV(pipe, param_grid, cv=fixedval, scoring='recall', verbose=1, n_jobs=-1)
    grid.fit(X_combined, y_combined)

    # Best model
    best_model = grid.best_estimator_
    lg_models.append(best_model)

    # Score the selected model on each split
    for X_save, y_save, dataset_name_save in [(X_train, y_train, 'train'), (X_val, y_val, 'validation'), (X_test, y_test, 'test')]:
        y_pred = best_model.predict(X_save)
        y_proba = best_model.predict_proba(X_save)[:, 1]  # probability of class 1
        precision, recall, _ = precision_recall_curve(y_save, y_proba)
        auc_pr = auc(recall, precision)
        score = {
            'dataset': dataset_name_save,
            'state': i + 1,
            'precision': precision_score(y_save, y_pred),
            'recall': recall_score(y_save, y_pred),
            'f2': fbeta_score(y_save, y_pred, beta=2),
            'auc-pr': auc_pr
        }
        lg_scores.append(score)

    # Save the test set for later use
    lg_test_sets.append({'X_test': X_test, 'y_test': y_test, 'state': i + 1})


Random state 1
(61771, 18)
Fitting 1 folds for each of 50 candidates, totalling 50 fits
Random state 2
(61771, 18)
Fitting 1 folds for each of 50 candidates, totalling 50 fits
Random state 3
(61771, 18)
Fitting 1 folds for each of 50 candidates, totalling 50 fits
Random state 4
(61771, 18)
Fitting 1 folds for each of 50 candidates, totalling 50 fits
Random state 5
(61771, 18)
Fitting 1 folds for each of 50 candidates, totalling 50 fits


In [19]:
# Show the test features stored for the second random state (index 1).
lg_test_sets[1]['X_test']

Unnamed: 0,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
32795,Very Good,Within the past year,Yes,No,No,No,No,No,Male,55-59,167.0,76.00,27.25,No,2.0,5.0,12.0,1.0
265627,Very Good,Within the past 2 years,No,No,No,No,No,No,Male,45-49,185.0,92.53,26.91,No,0.0,4.0,4.0,8.0
114129,Excellent,Within the past 5 years,Yes,No,No,No,No,No,Male,25-29,178.0,61.23,19.37,No,8.0,12.0,8.0,4.0
61434,Excellent,Within the past year,Yes,No,No,No,No,No,Male,55-59,180.0,90.72,27.89,Yes,4.0,8.0,16.0,12.0
230570,Excellent,Within the past year,Yes,No,No,No,No,No,Female,18-24,168.0,54.43,19.37,No,10.0,12.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75070,Very Good,Within the past year,Yes,No,No,No,No,No,Female,65-69,160.0,52.16,20.37,No,2.0,4.0,9.0,3.0
154028,Very Good,Within the past year,Yes,Yes,Yes,No,No,Yes,Female,80+,157.0,58.97,23.78,No,0.0,30.0,8.0,4.0
136888,Excellent,Within the past 2 years,Yes,No,No,No,No,No,Male,55-59,168.0,70.76,25.18,No,0.0,30.0,4.0,1.0
53478,Good,Within the past year,Yes,No,No,No,No,Yes,Female,65-69,155.0,77.11,32.12,No,0.0,30.0,8.0,4.0


In [20]:
# Tabulate the metric rows and select the recall column for each split.
lg_score = pd.DataFrame(lg_scores)
split_col = lg_score['dataset']
lg_train_recall = lg_score.loc[split_col.eq('train'), 'recall']
lg_val_recall = lg_score.loc[split_col.eq('validation'), 'recall']
lg_test_recall = lg_score.loc[split_col.eq('test'), 'recall']

# Mean recall for every split, in train / validation / test order.
for split_label, split_recall in (('train', lg_train_recall),
                                  ('validation', lg_val_recall),
                                  ('test', lg_test_recall)):
    print('%s recall mean:' % split_label, np.mean(split_recall))

# Spread of the test recall across random states, and how far its mean sits
# from `baseline_accuracy` in units of that spread.
test_recall_std = np.std(lg_test_recall)
print('test recall standard deviation:',test_recall_std)
print(round((np.mean(lg_test_recall)-baseline_accuracy)/test_recall_std,4),'standard deviations above the baseline')

"""High regularization in models can sometimes lead to lower performance on the training set because the model is penalized for complexity and thus is more generalized. This can result in slightly better performance on unseen data (validation set)."""

train recall mean: 0.7906026830407795
validation recall mean: 0.7902282739287144
test recall mean: 0.7912294753704445
test recall standard deviation: 0.008056132421279184
88.1787 standard deviations above the baseline


'High regularization in models can sometimes lead to lower performance on the training set because the model is penalized for complexity and thus is more generalized. This can result in slightly better performance on unseen data (validation set).'

In [21]:
import pickle

# Persist the scores, fitted models and saved test sets of the
# validation-split run.
final_results = dict(scores=lg_scores, models=lg_models, test_sets=lg_test_sets)
with open('lg_results_final.pkl', 'wb') as out_file:
    pickle.dump(final_results, out_file)

In [22]:
# Read the saved results back in. Only unpickle files you trust: pickle.load
# can execute arbitrary code.
with open('lg_results_final.pkl', 'rb') as in_file:
    data = pickle.load(in_file)

lg_scores = data['scores']
lg_models = data['models']
lg_test_sets = data['test_sets']
