# IMPORTING REQUIRED LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler 
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report

# LOADING DATASET

In [2]:
# Read the raw dataset from a CSV file in the working directory.
data = pd.read_csv(
    'data.csv',
)

# PREPROCESSING 

In [3]:
# Encode the binary text answers as integers everywhere in the frame.
# Every code is an int so that no column ends up holding a mix of strings and
# numbers: 'No, borderline diabetes' is folded into 0 and
# 'Yes (during pregnancy)' into 1.
# NOTE(review): this is a frame-wide replacement, so any column containing
# these exact strings is affected, not only the ones intended.
data = data.replace({
    'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0,
    'No, borderline diabetes': 0,
    'Yes (during pregnancy)': 1,
})
# Convert the now all-numeric object columns to numeric dtypes explicitly
# instead of relying on replace() to downcast them implicitly.
data = data.infer_objects()

  data = data.replace({


In [4]:
# Make sure the Diabetic column is stored as an integer dtype.
data = data.astype({'Diabetic': int})

In [5]:
# Age brackets from youngest to oldest; the position in this list becomes the
# ordinal code. The first bracket and the open-ended last one are spelled out,
# the five-year brackets in between (25-29 ... 75-79) are generated.
age_order = (
    ['18-24']
    + [f'{start}-{start + 4}' for start in range(25, 80, 5)]
    + ['80 or older']
)

# Replace the age bracket labels with their rank in age_order.
ordinal_encoder = OrdinalEncoder(categories=[age_order])
encoded_age = ordinal_encoder.fit_transform(data[['AgeCategory']])
data['AgeCategory'] = encoded_age

In [6]:
# One-hot encode the two nominal columns; drop_first=True omits the first
# level of each so it acts as the implicit baseline category.
data = pd.get_dummies(
    data,
    columns=['Race', 'GenHealth'],
    drop_first=True,
)

In [7]:
# Correlate every column with every other column, then rank all columns by
# their correlation with the HeartDisease target (largest value first).
correlation_matrix = data.corr()
correlation_with_target = (
    correlation_matrix
    .loc[:, 'HeartDisease']
    .sort_values(ascending=False)
)
print(
    "Correlation of features with 'HeartDisease':",
    correlation_with_target,
    sep="\n",
)

Correlation of features with 'HeartDisease':
HeartDisease           1.000000
AgeCategory            0.423665
DiffWalking            0.292551
Diabetic               0.271339
PhysicalHealth         0.238557
Stroke                 0.237228
GenHealth_Fair         0.211676
GenHealth_Poor         0.207479
KidneyDisease          0.183720
Smoking                0.182721
Sex                    0.126838
Race_White             0.120470
BMI                    0.120264
SkinCancer             0.119374
GenHealth_Good         0.075173
Asthma                 0.059514
MentalHealth           0.046395
SleepTime             -0.000654
Race_Other            -0.003094
Race_Black            -0.008838
Race_Asian            -0.057138
AlcoholDrinking       -0.062526
Race_Hispanic         -0.121603
PhysicalActivity      -0.175646
GenHealth_Very good   -0.186716
Name: HeartDisease, dtype: float64


In [8]:
# Keep the columns whose absolute correlation with the target exceeds 0.1.
# The target itself (correlation 1.0 with itself) is excluded from the list.
is_significant = correlation_with_target.abs().gt(0.1)
significant_features = [
    column_name
    for column_name in correlation_with_target[is_significant].index
    if column_name != 'HeartDisease'
]

In [9]:
# Feature matrix: the selected columns. Target vector: the HeartDisease column.
X = data.loc[:, significant_features]
y = data.loc[:, 'HeartDisease']

In [10]:
# Stage 1: hold out 40% of the rows; the remaining 60% is the training split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=True,
    random_state=42,
)

# Stage 2: cut the held-out 40% in half to get the validation and test splits.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [11]:
# Balance the classes of the training split with SMOTE oversampling.
# Only the training split is resampled; validation and test are left as they are.
sm = SMOTE(random_state=42)
resampled_features, resampled_labels = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled_features, resampled_labels

# Show the class counts after resampling.
print(Counter(y_train))

Counter({1: 19020, 0: 19020})


In [12]:
# Give the validation and test frames exactly the training frame's columns:
# a left join on the column axis keeps X_train's columns, and any column
# missing from the other frame is created and filled with 0.
align_options = dict(join='left', axis=1, fill_value=0)
X_train, X_val = X_train.align(X_val, **align_options)
X_train, X_test = X_train.align(X_test, **align_options)

In [13]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to all three splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [14]:
# Hyperparameter tuning for an RBF-kernel SVM using cross-validated grid search.
# NOTE(review): X_train has already been SMOTE-resampled at this point, so the
# cross-validation folds contain synthetic samples and the fold scores are
# likely optimistic.
param_grid = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']}

# refit=True retrains the best parameter combination on all of X_train, so
# `grid` can be used directly for prediction afterwards.
# n_jobs=-1 runs the individual fits in parallel on all available cores.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_estimator_)

# Score the tuned model on the validation split. The test split is deliberately
# not touched while choosing hyperparameters, so it stays an unbiased estimate
# for the final model.
grid_predictions = grid.predict(X_val)
print(confusion_matrix(y_val, grid_predictions))
print(classification_report(y_val, grid_predictions))
print(accuracy_score(y_val, grid_predictions))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.710 total time=  34.3s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.708 total time=  36.5s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.709 total time=  35.8s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.705 total time=  36.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.707 total time=  37.7s
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.744 total time=23.5min
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.748 total time=23.2min
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.753 total time=23.9min
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.749 total time=24.1min


In [38]:
# Train the final SVM with fixed hyperparameters and predict on every split.
svm_params = {'C': 10, 'gamma': 0.001, 'kernel': 'rbf', 'random_state': 42}
svm_model = SVC(**svm_params).fit(X_train, y_train)

y_train_pred, y_val_pred, y_test_pred = (
    svm_model.predict(split) for split in (X_train, X_val, X_test)
)

In [39]:
# One row per split:
# (name used in the accuracy line, name used in the report headers,
#  features, true labels, SVM predictions)
evaluation_splits = [
    ("training", "Train", X_train, y_train, y_train_pred),
    ("validation", "Validation", X_val, y_val, y_val_pred),
    ("test", "Test", X_test, y_test, y_test_pred),
]

# Accuracy of the SVM on each split.
for set_name, split_title, features, labels, predicted in evaluation_splits:
    print(f"Accuracy on {set_name} set: ", svm_model.score(features, labels))

# Per-class metrics and confusion matrix for each split.
for set_name, split_title, features, labels, predicted in evaluation_splits:
    print(f"\nClassification Report ({split_title}):\n", classification_report(labels, predicted))
    print(f"\nConfusion Matrix ({split_title}):\n", confusion_matrix(labels, predicted))

Accuracy on training set:  0.753391167192429
Accuracy on validation set:  0.755290333502624
Accuracy on test set:  0.7594379549686813

Classification Report (Train):
               precision    recall  f1-score   support

           0       0.77      0.72      0.74     19020
           1       0.74      0.79      0.76     19020

    accuracy                           0.75     38040
   macro avg       0.75      0.75      0.75     38040
weighted avg       0.75      0.75      0.75     38040


Confusion Matrix (Train):
 [[13654  5366]
 [ 4015 15005]]

Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.80      0.72      0.76      6268
           1       0.71      0.80      0.75      5546

    accuracy                           0.76     11814
   macro avg       0.76      0.76      0.76     11814
weighted avg       0.76      0.76      0.76     11814


Confusion Matrix (Validation):
 [[4492 1776]
 [1115 4431]]

Classification Report

In [272]:
# Train a depth-limited random forest and predict on every split.
rf_params = {'n_estimators': 50, 'max_depth': 9, 'random_state': 42}
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

y_train_pred, y_val_pred, y_test_pred = (
    rf.predict(split) for split in (X_train, X_val, X_test)
)

In [273]:
# One row per split:
# (name used in the accuracy line, name used in the report headers,
#  features, true labels, random forest predictions)
evaluation_splits = [
    ("training", "Train", X_train, y_train, y_train_pred),
    ("validation", "Validation", X_val, y_val, y_val_pred),
    ("test", "Test", X_test, y_test, y_test_pred),
]

# Accuracy of the random forest on each split.
for set_name, split_title, features, labels, predicted in evaluation_splits:
    print(f"Accuracy on {set_name} set: ", rf.score(features, labels))

# Per-class metrics and confusion matrix for each split.
for set_name, split_title, features, labels, predicted in evaluation_splits:
    print(f"\nClassification Report ({split_title}):\n", classification_report(labels, predicted))
    print(f"\nConfusion Matrix ({split_title}):\n", confusion_matrix(labels, predicted))

Accuracy on training set:  0.7629796839729119
Accuracy on validation set:  0.7539360081259523
Accuracy on test set:  0.760284408329101

Classification Report (Train):
               precision    recall  f1-score   support

           0       0.79      0.77      0.78     19020
           1       0.74      0.76      0.75     16420

    accuracy                           0.76     35440
   macro avg       0.76      0.76      0.76     35440
weighted avg       0.76      0.76      0.76     35440


Confusion Matrix (Train):
 [[14620  4400]
 [ 4000 12420]]

Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.77      0.76      0.77      6268
           1       0.73      0.75      0.74      5546

    accuracy                           0.75     11814
   macro avg       0.75      0.75      0.75     11814
weighted avg       0.75      0.75      0.75     11814


Confusion Matrix (Validation):
 [[4746 1522]
 [1385 4161]]

Classification Repor