In [1]:
from __future__ import print_function
from packaging.version import parse as Version
from platform import python_version

OK = '\x1b[42m[ OK ]\x1b[0m'
FAIL = "\x1b[41m[FAIL]\x1b[0m"

try:
    import importlib
except ImportError:
    print(FAIL, "Python version 3.11 is required,"
                " but %s is installed." % sys.version)

def import_version(pkg, min_ver, fail_msg=""):
    mod = None
    try:
        mod = importlib.import_module(pkg)
        if pkg in {'PIL'}:
            ver = mod.VERSION
        else:
            ver = mod.__version__
        if Version(ver) == Version(min_ver):
            print(OK, "%s version %s is installed."
                  % (lib, min_ver))
        else:
            print(FAIL, "%s version %s is required, but %s installed."
                  % (lib, min_ver, ver))
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
    return mod


# first check the python version
pyversion = Version(python_version())

if pyversion >= Version("3.11.4"):
    print(OK, "Python version is %s" % pyversion)
elif pyversion < Version("3.11"):
    print(FAIL, "Python version 3.11 is required,"
                " but %s is installed." % pyversion)
else:
    print(FAIL, "Unknown Python version: %s" % pyversion)


print()
requirements = {'numpy': "1.24.4", 'matplotlib': "3.7.2",'sklearn': "1.3.0",
                'pandas': "2.0.3",'xgboost': "1.7.6", 'shap': "0.42.1", 'seaborn': "0.12.2"}

# now the dependencies
for lib, required_version in list(requirements.items()):
    import_version(lib, required_version)

[42m[ OK ][0m Python version is 3.11.4

[42m[ OK ][0m numpy version 1.24.4 is installed.
[42m[ OK ][0m matplotlib version 3.7.2 is installed.
[42m[ OK ][0m sklearn version 1.3.0 is installed.
[42m[ OK ][0m pandas version 2.0.3 is installed.
[42m[ OK ][0m xgboost version 1.7.6 is installed.


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


[42m[ OK ][0m shap version 0.42.1 is installed.
[42m[ OK ][0m seaborn version 0.12.2 is installed.


In [2]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import PredefinedSplit

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, fbeta_score, auc,
    precision_recall_curve, average_precision_score
)

import warnings
import pickle

In [3]:
df = pd.read_csv('../data/CVD_cleaned.csv')
df.head()

random_state = 56

X = df.drop(labels=['Heart_Disease'], axis=1)
y = df['Heart_Disease']

label_mapping = {'No': 0, 'Yes': 1}
y = y.map(label_mapping)

X_subset, _, y_subset, _ = train_test_split(X, y, test_size=0.6, stratify=y, random_state=56)

print("Full Set")
print('X:',X.shape)
print('y:',y.shape)
print("Subset Set")
print('X_subset:',X_subset.shape)
print('y_subset:',y_subset.shape)

cat_ftrs = ['Checkup','Exercise','Skin_Cancer','Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex','Smoking_History']
ordinal_ftrs = ['General_Health','Age_Category',]
ordinal_cats = [['Poor','Fair','Good','Very Good','Excellent'],\
               ['18-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79','80+']]
num_ftrs = ['Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption', 'Fruit_Consumption',
       'Green_Vegetables_Consumption', 'FriedPotato_Consumption']

# one-hot encoder
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# ordinal encoder
ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories = ordinal_cats))])

# standard scaler
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs),
        ('ord', ordinal_transformer, ordinal_ftrs)])
final_scaler = StandardScaler()
prep = Pipeline(steps=[('preprocessor', preprocessor)])

Full Set
X: (308854, 18)
y: (308854,)
Subset Set
X_subset: (123541, 18)
y_subset: (123541,)


In [7]:
#save Subset
subset = pd.concat([X_subset, y_subset], axis = 1).to_csv('../data/svm_subset.csv')

### baseline recall

In [9]:
df = pd.read_csv('../data/svm_subset.csv')
X_subset = df.drop(labels=['Heart_Disease'], axis=1)
y_subset = df['Heart_Disease']

In [10]:
N, Y = df['Heart_Disease'].value_counts()
baseline_recall = y_subset/(N+Y)

### SVM Classifier

In [128]:
nr_states = 5
svc_scores = []
svc_models = []
svc_test_sets = []

for i in range(nr_states):
    print('Random state', i + 1)

    X_train, X_other, y_train, y_other = train_test_split(X_subset, y_subset, train_size=0.6, stratify=y_subset, random_state=56*i)
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=0.5, stratify=y_other, random_state=56*i)
    column_names = X.columns

    X_combined = np.concatenate([X_train, X_val])
    X_combined = pd.DataFrame(X_combined, columns=column_names)
    y_combined = np.concatenate([y_train, y_val])
    split_index = [-1]*len(X_train) + [0]*len(X_val)
    fixedval = PredefinedSplit(test_fold=split_index)

    print(X_test.shape)

    pipe = make_pipeline(prep, SVC(probability=True, random_state=56*i))

    param_grid = {
        'svc__C': [0.01, 0.1, 1],
        'svc__gamma': [0.001, 0.01],
        'svc__class_weight': ['balanced']
    }

    grid = GridSearchCV(pipe, param_grid, cv=fixedval, scoring='recall', verbose=1, n_jobs=-1)
    grid.fit(X_combined, y_combined)

    best_model = grid.best_estimator_
    svc_models.append(best_model)
    
    for X_save, y_save, dataset_name_save in [(X_train, y_train, 'train'), (X_val, y_val, 'validation'), (X_test, y_test, 'test')]:
        y_pred = best_model.predict(X_save)
        precision, recall, _ = precision_recall_curve(y_save, best_model.predict_proba(X_save)[:, 1])
        auc_pr = auc(recall, precision)
        score = {
            'dataset': dataset_name_save,
            'state': i + 1,
            'precision': precision_score(y_save, y_pred, zero_division=0),
            'recall': recall_score(y_save, y_pred),
            'f2': fbeta_score(y_save, y_pred, beta=2),
            'auc-pr': auc_pr
        }
        svc_scores.append(score)

    svc_test_sets.append({'X_test': X_test, 'y_test': y_test, 'state': i + 1})

Random state 1
(24709, 18)
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Random state 2
(24709, 18)
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Random state 3
(24709, 18)
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Random state 4
(24709, 18)
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Random state 5
(24709, 18)
Fitting 1 folds for each of 6 candidates, totalling 6 fits


In [129]:
svc_score = pd.DataFrame(svc_scores)
svc_score
svc_train_recall = svc_score[svc_score['dataset'] == 'train']['recall']
svc_val_recall = svc_score[svc_score['dataset'] == 'validation']['recall']
svc_test_recall = svc_score[svc_score['dataset'] == 'test']['recall']

print('train recall mean:',np.mean(svc_train_recall))
print('validation recall mean:',np.mean(svc_train_recall))
print('test recall mean:',np.mean(svc_test_recall))
print('test recall standard deviation:',np.std(svc_test_recall))
print(round((np.mean(svc_test_recall)-baseline_recall)/np.std(svc_test_recall),4),'standard deviations above the baseline')

train recall mean: 0.8589687969297515
validation recall mean: 0.8589687969297515
test recall mean: 0.8576576576576576
test recall standard deviation: 0.005358862388117758
144.9575 standard deviations above the baseline


In [130]:
pd.DataFrame(svc_scores)

Unnamed: 0,dataset,state,precision,recall,f2,auc-pr
0,train,1,0.170297,0.855832,0.474117,0.279777
1,validation,1,0.171491,0.863295,0.477801,0.287455
2,test,1,0.171978,0.858859,0.477462,0.28129
3,train,2,0.171017,0.859336,0.476094,0.278594
4,validation,2,0.168365,0.852278,0.470244,0.282467
5,test,2,0.170865,0.858859,0.475742,0.284166
6,train,3,0.169733,0.854497,0.472914,0.280559
7,validation,3,0.173685,0.861292,0.480689,0.282697
8,test,3,0.17007,0.862863,0.475481,0.283596
9,train,4,0.1701,0.858335,0.474425,0.28687


In [131]:
with open('svc_results_24709.pkl', 'wb') as file:
    pickle.dump({'scores': svc_scores, 'models': svc_models, 'test_sets': svc_test_sets}, file)