In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw heart-attack dataset; the path is relative to the current working directory.
df = pd.read_csv(
    filepath_or_buffer="..//Datasets//heart_attack_prediction_dataset.csv",
)

In [None]:
# Preview the first rows (pandas shows 5 by default) to sanity-check the load.
df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [None]:

# Identifier and geography columns are removed before modelling
# (treated here as non-predictive or high-cardinality features).
dropped_columns = ['Patient ID', 'Country', 'Continent', 'Hemisphere']
df = df.drop(dropped_columns, axis=1)

In [None]:
# 'Blood Pressure' holds "systolic/diastolic" strings (e.g. "158/88"). One-hot
# encoding it would create one almost-always-False column per distinct reading,
# so split it into two numeric columns instead.
bp_parts = df['Blood Pressure'].str.split('/', n=1, expand=True)
df = df.assign(
    **{
        'Systolic BP': pd.to_numeric(bp_parts[0]),
        'Diastolic BP': pd.to_numeric(bp_parts[1]),
    }
).drop(columns=['Blood Pressure'])

# One-hot encode the remaining low-cardinality categorical columns;
# drop_first=True removes one redundant dummy per feature.
df = pd.get_dummies(df, columns=['Sex', 'Diet'], drop_first=True)

In [None]:
# Heuristic: a column is treated as categorical when it has at most 10 distinct values.
categorical_columns = []
for name in df.columns:
    distinct_values = df[name].unique()
    if len(distinct_values) <= 10:
        categorical_columns.append(name)
categorical_columns

['Diabetes',
 'Family History',
 'Smoking',
 'Obesity',
 'Alcohol Consumption',
 'Previous Heart Problems',
 'Medication Use',
 'Stress Level',
 'Physical Activity Days Per Week',
 'Sleep Hours Per Day',
 'Heart Attack Risk',
 'Sex_Male',
 'Blood Pressure_100/102',
 'Blood Pressure_100/103',
 'Blood Pressure_100/104',
 'Blood Pressure_100/105',
 'Blood Pressure_100/106',
 'Blood Pressure_100/107',
 'Blood Pressure_100/108',
 'Blood Pressure_100/109',
 'Blood Pressure_100/110',
 'Blood Pressure_100/60',
 'Blood Pressure_100/61',
 'Blood Pressure_100/63',
 'Blood Pressure_100/64',
 'Blood Pressure_100/65',
 'Blood Pressure_100/66',
 'Blood Pressure_100/67',
 'Blood Pressure_100/68',
 'Blood Pressure_100/69',
 'Blood Pressure_100/71',
 'Blood Pressure_100/72',
 'Blood Pressure_100/73',
 'Blood Pressure_100/74',
 'Blood Pressure_100/75',
 'Blood Pressure_100/76',
 'Blood Pressure_100/78',
 'Blood Pressure_100/79',
 'Blood Pressure_100/80',
 'Blood Pressure_100/81',
 'Blood Pressure_100/83'

In [None]:
# Print the frequency table of every column flagged as categorical.
for column_name in categorical_columns:
    frequency_table = df[column_name].value_counts()
    print(frequency_table)

Diabetes
1    5716
0    3047
Name: count, dtype: int64
Family History
0    4443
1    4320
Name: count, dtype: int64
Smoking
1    7859
0     904
Name: count, dtype: int64
Obesity
1    4394
0    4369
Name: count, dtype: int64
Alcohol Consumption
1    5241
0    3522
Name: count, dtype: int64
Previous Heart Problems
0    4418
1    4345
Name: count, dtype: int64
Medication Use
0    4396
1    4367
Name: count, dtype: int64
Stress Level
2     913
4     910
7     903
9     887
8     879
3     868
1     865
5     860
6     855
10    823
Name: count, dtype: int64
Physical Activity Days Per Week
3    1143
1    1121
2    1109
7    1095
5    1079
4    1077
6    1074
0    1065
Name: count, dtype: int64
Sleep Hours Per Day
10    1293
8     1288
6     1276
7     1270
5     1263
9     1192
4     1181
Name: count, dtype: int64
Heart Attack Risk
0    5624
1    3139
Name: count, dtype: int64
Sex_Male
True     6111
False    2652
Name: count, dtype: int64
Blood Pressure_100/102
False    8762
True        1
N

In [None]:
# Every column that was not flagged as categorical is treated as numerical,
# keeping the original column order.
numerical_columns = []
for name in df.columns:
    if name in categorical_columns:
        continue
    numerical_columns.append(name)
numerical_columns

['Age',
 'Cholesterol',
 'Heart Rate',
 'Exercise Hours Per Week',
 'Sedentary Hours Per Day',
 'Income',
 'BMI',
 'Triglycerides']

In [None]:

# Separate the target column from the predictors.
y = df['Heart Attack Risk']
X = df.drop('Heart Attack Risk', axis=1)

In [None]:
# Inspect which distinct label values are present in the target.
y.unique()

array([0, 1], dtype=int64)

In [None]:
# Cast the labels to int without shifting them: they are already 0/1, and
# subtracting 1 would turn them into {-1, 0}, which is not a valid 0-based class
# encoding (XGBClassifier, imported later in this notebook, rejects such labels).
y = y.astype(int)

In [None]:
# Re-check the distinct label values after the integer cast.
y.unique()

array([-1,  0])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratify on the label so both splits keep
# the same class ratio (the target is imbalanced, roughly 64% / 36%).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Standardise only the numerical columns; every other column is passed through unchanged.
numeric_transformer = StandardScaler()
scaling_step = ("StandardScaler", numeric_transformer, numerical_columns)

preprocessor = ColumnTransformer(
    transformers=[scaling_step],
    remainder='passthrough',
)

In [None]:
# Display the (still unfitted) ColumnTransformer to review its configuration.
preprocessor

In [None]:
# Rebuild features and label straight from df so the labels are the original 0/1 values.
X = df.drop(columns=['Heart Attack Risk'])  # Keep it a DataFrame: the transformer selects columns by name
y = df['Heart Attack Risk']

# Stratify on the label so train and test keep the same class ratio
# (the target is imbalanced, roughly 64% / 36%).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the preprocessor on the training split only, then reuse the fitted
# statistics on the test split to avoid leaking test information.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)


In [None]:
import pickle
import os

# Location of the pickled preprocessor (relative path from notebooks/ to models/)
save_path = '../models/preprocessor.pkl'

# Create the target folder first if it is missing
target_dir = os.path.dirname(save_path)
os.makedirs(target_dir, exist_ok=True)

# Serialise the preprocessor to disk
with open(save_path, mode='wb') as out_file:
    pickle.dump(preprocessor, out_file)

## Model Building for Prediction

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score