## Heart attack risk prediction



In [4]:
# %pip install xgboost  # uncomment and run once if xgboost is not installed in this kernel's environment

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import seaborn as sns
import pickle

In [6]:
# Load the dataset; the path is resolved relative to the current working
# directory of the kernel.
DATASET_CSV = 'heart_attack_prediction_dataset.csv'
df = pd.read_csv(DATASET_CSV)

In [7]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,Heart Rate,Oxygen Level,Systolic BP,Diastolic BP,Body Temperature,Heart Attack Risk
0,94.057309,82.28809,148.520782,96.05421,101.393674,High Risk
1,86.218737,99.350164,104.080648,83.746346,99.342905,Low Risk
2,98.358473,88.627742,161.040283,98.157421,100.486419,High Risk
3,70.394968,98.654891,117.246391,70.330055,98.121317,Low Risk
4,62.798371,95.317593,117.201748,75.925147,98.855159,Low Risk


In [8]:
# Drop every row that contains at least one missing value.
df = df.dropna()

In [9]:
# Check column dtypes and non-null counts after removing incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Heart Rate         20000 non-null  float64
 1   Oxygen Level       20000 non-null  float64
 2   Systolic BP        20000 non-null  float64
 3   Diastolic BP       20000 non-null  float64
 4   Body Temperature   20000 non-null  float64
 5   Heart Attack Risk  20000 non-null  object 
dtypes: float64(5), object(1)
memory usage: 937.6+ KB


In [10]:
# Preview the frame again now that rows with missing values have been dropped.
df.head()

Unnamed: 0,Heart Rate,Oxygen Level,Systolic BP,Diastolic BP,Body Temperature,Heart Attack Risk
0,94.057309,82.28809,148.520782,96.05421,101.393674,High Risk
1,86.218737,99.350164,104.080648,83.746346,99.342905,Low Risk
2,98.358473,88.627742,161.040283,98.157421,100.486419,High Risk
3,70.394968,98.654891,117.246391,70.330055,98.121317,Low Risk
4,62.798371,95.317593,117.201748,75.925147,98.855159,Low Risk


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Heart Rate,Oxygen Level,Systolic BP,Diastolic BP,Body Temperature
count,20000.0,20000.0,20000.0,20000.0,20000.0
mean,82.55699,92.961272,135.041389,87.525062,99.551289
std,15.960827,6.288189,18.076023,9.039751,1.239251
min,31.436247,69.724567,78.422661,57.685157,96.747425
25%,70.002081,87.97122,120.037204,79.965473,98.562437
50%,82.593428,95.123447,135.101234,87.603329,99.2362
75%,95.131451,98.084141,150.102482,95.118387,100.49234
max,134.262377,106.458123,185.365317,112.68024,104.329782


In [12]:
# Numeric measurement columns used as model inputs. The order here is the
# column order of the feature matrix built from this list.
features = [
    'Heart Rate',
    'Body Temperature',
    'Oxygen Level',
    'Systolic BP',
    'Diastolic BP',
]

In [13]:
# Separate the model inputs (X) from the label column (y).
target_column = 'Heart Attack Risk'
X, y = df[features], df[target_column]

In [14]:
# Count the rows per class to check how balanced the labels are.
print(y.value_counts())

Heart Attack Risk
High Risk    10000
Low Risk     10000
Name: count, dtype: int64


In [15]:
# Convert the string labels to integer class ids. LabelEncoder numbers the
# classes in sorted order, so 'High Risk' -> 0 and 'Low Risk' -> 1.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [16]:
# 60/20/20 split: hold out 40% of the rows first, then cut that hold-out
# in half to obtain the validation and test sets. Both splits use a fixed
# seed so they are reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y_encoded,
    test_size=0.4,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [17]:
# Feature scaling is currently disabled; the model below is fit on the
# unscaled features.
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_val_scaled = scaler.transform(X_val)
#X_test_scaled = scaler.transform(X_test)

In [18]:
# Hyperparameters for the gradient-boosted tree classifier, gathered in one
# mapping so they can be inspected and tuned in a single place.
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
model = xgb.XGBClassifier(**xgb_params)

# The validation split is supplied as an eval_set so that its metric is
# logged during boosting (verbose=True).
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

[0]	validation_0-logloss:0.60052
[1]	validation_0-logloss:0.52416
[2]	validation_0-logloss:0.46016
[3]	validation_0-logloss:0.40636
[4]	validation_0-logloss:0.36036
[5]	validation_0-logloss:0.32062
[6]	validation_0-logloss:0.28625
[7]	validation_0-logloss:0.25613
[8]	validation_0-logloss:0.22960
[9]	validation_0-logloss:0.20593
[10]	validation_0-logloss:0.18524
[11]	validation_0-logloss:0.16686
[12]	validation_0-logloss:0.15059
[13]	validation_0-logloss:0.13617
[14]	validation_0-logloss:0.12318
[15]	validation_0-logloss:0.11170
[16]	validation_0-logloss:0.10118
[17]	validation_0-logloss:0.09210
[18]	validation_0-logloss:0.08361
[19]	validation_0-logloss:0.07605
[20]	validation_0-logloss:0.06923
[21]	validation_0-logloss:0.06324
[22]	validation_0-logloss:0.05762
[23]	validation_0-logloss:0.05264
[24]	validation_0-logloss:0.04805
[25]	validation_0-logloss:0.04397
[26]	validation_0-logloss:0.04038
[27]	validation_0-logloss:0.03705
[28]	validation_0-logloss:0.03404
[29]	validation_0-loglos

In [19]:
# 10-fold cross-validated accuracy, computed on the training split only.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)

In [20]:
# Summarize the per-fold accuracies by their mean and standard deviation.
mean_accuracy = cv_scores.mean()
accuracy_std = cv_scores.std()

print("Cross-validation scores:", cv_scores)
print("\nMean accuracy:", mean_accuracy)
print("\nStandard deviation:", accuracy_std)

Cross-validation scores: [0.99833333 0.99916667 0.99916667 0.99916667 0.99916667 0.99916667
 0.99916667 0.9975     1.         0.9975    ]

Mean accuracy: 0.9988333333333335

Standard deviation: 0.0007637626158259522


In [21]:
# Evaluate the fitted model on the validation split: compute every metric
# first, then print them together.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("Validation Accuracy:", accuracy)
print("\nValidation Confusion Matrix:\n", val_confusion)
print("\nValidation Classification Report:\n", val_report)

Validation Accuracy: 0.99875

Validation Confusion Matrix:
 [[1954    2]
 [   3 2041]]

Validation Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1956
           1       1.00      1.00      1.00      2044

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000



In [None]:
# Evaluate the fitted model on the held-out test split: compute every
# metric first, then print them together.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)

print("Test Accuracy:", test_accuracy)
print("\nTest Classification Report:\n", test_report)
print("\nTest Confusion Matrix:\n", test_confusion)

Test Accuracy: 0.99925

Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2040
           1       1.00      1.00      1.00      1960

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000


Test Confusion Matrix:
 [[2039    1]
 [   2 1958]]


In [24]:
# Persist the trained classifier to disk. The context manager ensures the
# file handle is flushed and closed even if pickling raises, instead of
# leaving an open handle for the garbage collector to clean up.
# NOTE(review): only `model` is written to this file; the fitted
# label_encoder is not saved with it, so consumers of the pickle get
# integer class ids from predict() — confirm that is intended.
with open('heart_attack_prediction.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)