In [1]:
import pandas as pd

# Path to the heart-attack risk CSV inside the Kaggle input directory.
DATA = '/kaggle/input/heart-attack-risk-assessment-dataset/updated_version.csv'

# Read the whole file into a DataFrame and preview the first five rows.
df = pd.read_csv(DATA)
df.head(5)

Unnamed: 0,age,sex,total_cholesterol,ldl,hdl,systolic_bp,diastolic_bp,smoking,diabetes,heart_attack
0,57,1,229.463642,175.879129,39.225687,124.070127,91.37878,0,0,0
1,58,1,186.46412,128.984916,34.950968,95.492552,64.35504,1,0,0
2,37,1,251.300719,152.347592,45.913288,99.519335,64.953147,0,1,0
3,55,1,192.058908,116.803684,67.208925,122.460002,73.821382,0,0,0
4,53,1,151.203448,107.017396,60.693838,123.022257,81.121946,0,1,0


We know our data is synthetic, so we expect there to be essentially no predictive signal in the data. Let's do some EDA on the features first.

In [2]:
from plotly import express
from plotly.offline import init_notebook_mode

# Initialise plotly's notebook mode before any figure is drawn.
init_notebook_mode(connected=True)

# Age distribution, one facet per value of `sex`.
(
    express.histogram(data_frame=df, x='age', facet_col='sex')
    .show(renderer='iframe_connected')
)

Our age data looks normally distributed.

In [3]:
# Total cholesterol distribution, one facet per value of `sex`.
(
    express.histogram(data_frame=df, x='total_cholesterol', facet_col='sex')
    .show(renderer='iframe_connected')
)

In [4]:
# LDL distribution, one facet per value of `sex`.
(
    express.histogram(data_frame=df, x='ldl', facet_col='sex')
    .show(renderer='iframe_connected')
)

In [5]:
# HDL distribution, one facet per value of `sex`.
(
    express.histogram(data_frame=df, x='hdl', facet_col='sex')
    .show(renderer='iframe_connected')
)

Our cholesterol data is also roughly normally distributed.

In [6]:
# Systolic blood pressure distribution, one facet per value of `sex`.
(
    express.histogram(data_frame=df, x='systolic_bp', facet_col='sex')
    .show(renderer='iframe_connected')
)

In [7]:
# Diastolic blood pressure distribution, one facet per value of `sex`.
(
    express.histogram(data_frame=df, x='diastolic_bp', facet_col='sex')
    .show(renderer='iframe_connected')
)

Our blood pressure data is also roughly normally distributed.

In [8]:
# Count of each smoking value, transposed into a single-row table.
(
    df['smoking']
    .value_counts()
    .to_frame()
    .transpose()
)

smoking,0,1
count,798,202


In [9]:
# Count of each diabetes value, transposed into a single-row table.
(
    df['diabetes']
    .value_counts()
    .to_frame()
    .transpose()
)

diabetes,0,1
count,910,90


Our smoking and diabetes data looks like it was meant to be 80/20 and 90/10 or so, respectively.

Let's use dimensionality reduction to see whether our data clusters at all and, if so, whether those clusters align with our target variable.

In [10]:
from plotly import express
from sklearn.manifold import TSNE

# Feature columns: everything in the CSV except its index column and the target.
COLUMNS = [
    'age',
    'sex',
    'total_cholesterol',
    'ldl',
    'hdl',
    'systolic_bp',
    'diastolic_bp',
    'smoking',
    'diabetes',
]
RANDOM_STATE = 2025  # shared seed so the embedding and the later split are repeatable
TARGET = 'heart_attack'

# Project the feature columns down to two dimensions with t-SNE.
# NOTE(review): the features are passed in unscaled, so columns with large
# numeric ranges presumably dominate the pairwise distances -- consider
# standardising before embedding.
reducer = TSNE(random_state=RANDOM_STATE)
embedding = reducer.fit_transform(X=df[COLUMNS])

# Attach the target so each embedded point can be coloured by its label.
reduced_df = pd.DataFrame(data=embedding, columns=['x', 'y']).assign(
    **{TARGET: df[TARGET].tolist()}
)

(
    express.scatter(data_frame=reduced_df, x='x', y='y', color=TARGET)
    .show(renderer='iframe_connected')
)

Our data looks fairly randomly distributed with maybe some local clustering. Let's build a model and see what happens.

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows; stratify on the target so both splits keep the
# same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=df[TARGET],
)

# k-NN is distance-based, and the raw features sit on very different scales
# (cholesterol in the hundreds, binary flags in {0, 1}). Without scaling, the
# large-magnitude columns decide the neighbours almost on their own, so
# standardise first. Putting the scaler in the pipeline means it is fitted on
# the training split only, keeping test-set statistics out of the model.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))
knn.fit(X=X_train, y=y_train)
y_pred = knn.predict(X=X_test)

# zero_division=0 reports 0.0 for a class that is never predicted.
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.90      1.00      0.94       179
           1       0.00      0.00      0.00        21

    accuracy                           0.90       200
   macro avg       0.45      0.50      0.47       200
weighted avg       0.80      0.90      0.85       200



Our model acts like it almost always predicts class 0. What would it look like if we had a model that always predicted class 0?

In [12]:
# Count of each target class over the full dataset, as a single-row table.
(
    df[TARGET]
    .value_counts()
    .to_frame()
    .transpose()
)

heart_attack,0,1
count,896,104


In [13]:
# Baseline: predict class 0 for every test sample and score it the same way.
always_zero = [0 for _ in range(len(y_test))]
print(classification_report(y_true=y_test, y_pred=always_zero, zero_division=0))

              precision    recall  f1-score   support

           0       0.90      1.00      0.94       179
           1       0.00      0.00      0.00        21

    accuracy                           0.90       200
   macro avg       0.45      0.50      0.47       200
weighted avg       0.80      0.90      0.85       200



Our model is, not surprisingly, indistinguishable from a dummy model.