In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
DATASET_CSV = 'heart_attack_prediction_dataset.csv'
df = pd.read_csv(DATASET_CSV)

FileNotFoundError: [Errno 2] No such file or directory: 'heart_attack_prediction_dataset.csv'

In [None]:
# Remove the 'Patient ID' column from the working frame.
df = df.drop(columns=['Patient ID'])

In [None]:
# Preview the first rows of the frame (head() shows five by default).
df.head()

In [None]:
# Print a per-column summary: dtype and non-null count for every column.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the frame.
df.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Frequency of each distinct value in the 'Sex' column.
df['Sex'].value_counts()

In [None]:
# Frequency of each distinct value in the 'Continent' column.
df['Continent'].value_counts()

In [None]:
# Frequency of each distinct value in the 'Smoking' column.
df['Smoking'].value_counts()

In [None]:
# Class balance of 'Heart Attack Risk', the column later used as the model target.
df['Heart Attack Risk'].value_counts()

In [None]:
# Inspect the raw 'Blood Pressure' column before it is parsed.
# NOTE(review): presumably strings of the form "<a>/<b>", since the next cell splits on '/' — confirm.
df['Blood Pressure']

In [None]:
# Split the "a/b" blood-pressure string into its two parts, cast both to int32
# in one step, store them as separate columns and discard the original column.
bp_parts = df['Blood Pressure'].str.split('/', expand=True).astype('int32')
df[['Max Blood Pressure', 'Min Blood Pressure']] = bp_parts
df = df.drop(columns='Blood Pressure')

In [None]:
# Partition the feature columns by type. The target is removed from the numeric
# set; a numeric column counts as "binary" when it has exactly two distinct values.
numerical_columns = df.select_dtypes(include=np.number).columns.drop('Heart Attack Risk')
categorical_columns = df.select_dtypes(exclude=np.number).columns

binary_columns, nonbinary_columns = [], []
for column in numerical_columns:
    bucket = binary_columns if df[column].nunique() == 2 else nonbinary_columns
    bucket.append(column)

# Show the four groups in the same order as before.
for group in (numerical_columns, nonbinary_columns, binary_columns, categorical_columns):
    print(group)

In [None]:
# Subset of rows whose 'Heart Attack Risk' equals 1.
at_risk_mask = df['Heart Attack Risk'].eq(1)
df_risk = df.loc[at_risk_mask]

In [None]:
# Mean of every numeric feature per 'Heart Attack Risk' class,
# transposed so features are rows and classes are columns.
df.groupby('Heart Attack Risk')[list(numerical_columns)].mean().T

In [None]:
# Histogram (with KDE) of every non-binary numeric feature, at-risk rows only.
# The grid height is derived from the number of columns instead of being fixed
# at 8 rows, so the cell keeps working if the feature list grows or shrinks.
plt.figure(figsize=(20, 15))
n_rows = (len(nonbinary_columns) + 1) // 2  # two plots per row, rounded up
for i, col in enumerate(nonbinary_columns, 1):
    ax = plt.subplot(n_rows, 2, i)
    sns.histplot(df_risk[col], kde=True, ax=ax)
# Layout only needs to be computed once, after all axes exist.
plt.tight_layout()
plt.show()

In [None]:
# Count plot of every categorical column, split by 'Heart Attack Risk'.
plt.figure(figsize=(20, 15))
n_rows = (len(categorical_columns) + 1) // 2  # two plots per row, rounded up
for i, col in enumerate(categorical_columns, 1):
    ax = plt.subplot(n_rows, 2, i)
    # Pass the frame by keyword: on older seaborn releases the first positional
    # argument of countplot is `x`, not `data`.
    sns.countplot(data=df, x=col, hue='Heart Attack Risk', ax=ax)
    plt.xticks(rotation=45)
# Layout only needs to be computed once, after all axes exist.
plt.tight_layout()
plt.show()

In [None]:
# For every categorical column: at-risk rows per category, expressed as a
# percentage of ALL rows in the dataset.
plt.figure(figsize=(20, 15))
n_total = len(df)  # replaces the hard-coded row count 8763
n_rows = (len(categorical_columns) + 1) // 2  # two plots per row, rounded up
for i, col in enumerate(categorical_columns, 1):
    risk_share = df_risk[col].value_counts() / n_total * 100
    plt.subplot(n_rows, 2, i)
    plt.bar(risk_share.index, risk_share)
    plt.title(f"{col}")
    plt.ylabel('% of all rows')
    plt.xticks(rotation=45)
# Layout only needs to be computed once, after all axes exist.
plt.tight_layout()
plt.show()

In [None]:
# Pie chart of the value distribution of every binary numeric feature.
plt.figure(figsize=(20, 15))
n_rows = (len(binary_columns) + 1) // 2  # two plots per row, rounded up
for i, col in enumerate(binary_columns, 1):
    # value_counts() orders by frequency, so a fixed ['Yes', 'No'] label list
    # mislabels the slices whenever the first value is not the "yes" one.
    # Derive each label from the value it belongs to instead.
    # NOTE(review): assumes the binary columns are coded 1 = yes, anything else = no — confirm.
    counts = df[col].value_counts().sort_index()
    labels = ['Yes' if value == 1 else 'No' for value in counts.index]
    plt.subplot(n_rows, 2, i)
    plt.pie(counts, labels=labels, autopct='%1.1f%%')
    plt.title(f"{col}")
# Layout only needs to be computed once, after all axes exist.
plt.tight_layout()
plt.show()

In [None]:
# Ordinal-encode 'Diet'. The codes follow the natural order of the categories
# (Unhealthy < Average < Healthy) so that models which read the column as a
# magnitude see a monotone scale; the previous mapping put 'Average' below
# 'Unhealthy'.
diet_order = {'Unhealthy': 0, 'Average': 1, 'Healthy': 2}
diet_codes = df['Diet'].map(diet_order)
if diet_codes.isna().any():
    # Fail with a readable message instead of an opaque NaN-to-int cast error
    # (also triggered when this cell is re-run on already-encoded data).
    unexpected = df.loc[diet_codes.isna(), 'Diet'].unique().tolist()
    raise ValueError(f"Unexpected values in 'Diet' column: {unexpected}")
df['Diet'] = diet_codes.astype('int32')

In [None]:
# One-hot encode every categorical column except 'Diet', which is excluded
# from the list passed to get_dummies.
one_hot_targets = categorical_columns.drop('Diet')
df = pd.get_dummies(df, columns=one_hot_targets)

In [None]:
# Convert every boolean column to int32 so all features are numeric.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({column: 'int32' for column in bool_columns})

In [None]:
# Coefficient of variation (population std / mean) of each non-binary numeric
# feature, largest first.
nonbinary_frame = df[nonbinary_columns]
variation = nonbinary_frame.std(ddof=0) / nonbinary_frame.mean()
variation.sort_values(ascending=False)

In [None]:
# Engineered feature: the part of a 24-hour day not counted as sedentary.
df['Active Hours'] = df['Sedentary Hours Per Day'].rsub(24)

In [None]:
# Columns that will be passed to the scaler: every non-binary numeric feature.
cols_to_scale = df[nonbinary_columns].columns
# 'Active Hours' is created after `nonbinary_columns` was computed, so it would
# otherwise stay unscaled while every other continuous feature is scaled.
# union(..., sort=False) keeps the existing order and never adds a duplicate.
if 'Active Hours' in df.columns:
    cols_to_scale = cols_to_scale.union(['Active Hours'], sort=False)

# Feature matrix (everything but the target) and target vector.
x = df.drop('Heart Attack Risk', axis=1)
y = df['Heart Attack Risk']

In [None]:
# Class counts of the target column, shown right before the resampling step.
df['Heart Attack Risk'].value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

# Seeded SMOTE oversampler, so the synthetic samples are reproducible.
smote = SMOTE(random_state=42)

# Resample the complete feature matrix / target pair.
# NOTE(review): x and y here are the full dataset, not a training fold.
X_smote, y_smote = smote.fit_resample(X=x, y=y)

In [None]:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL data first. Splitting the SMOTE-resampled data would put
# synthetic points interpolated from test rows into the training set (and
# synthetic rows into the test set), which inflates every reported metric.
# stratify keeps the class ratio equal in both folds; random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample the minority class in the training fold only, reusing the seeded
# `smote` instance created earlier in the notebook. The test fold stays
# untouched, real data.
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.preprocessing import RobustScaler

# Learn the scaling statistics from the training fold only, then apply the
# same fitted transform to the test fold.
scaler = RobustScaler()
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Candidate classifiers. The estimators with internal randomness (random
# forest, decision tree) are seeded with the same value used for SMOTE so a
# Restart & Run All reproduces the same fitted models and metrics.
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)
nb = GaussianNB()

# Fit every model on the same training fold.
for clf in (lr, rf, knn, dt, nb):
    clf.fit(X_train, y_train)



In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

models = [lr, dt, rf, nb, knn]
# Metric functions, in the same order as the row labels of the results table.
metric_functions = (accuracy_score, f1_score, precision_score, recall_score)

model_results = {}     # estimator name -> [test-set predictions]
model_evaluation = {}  # estimator name -> [accuracy, f1, precision, recall] in percent

# Predict once per model and score the predictions against the test labels.
for estimator in models:
    # The repr of an estimator starts with its class name, e.g. "GaussianNB()".
    label = str(estimator).split("(")[0]
    predictions = estimator.predict(X_test)
    model_results[label] = [predictions]
    model_evaluation[label] = [
        round(metric(y_test, predictions) * 100, 2) for metric in metric_functions
    ]

results_df = pd.DataFrame(model_evaluation, index=["Accuracy", "F-1 Score", "Precision Score", "Recall Score"])
results_df.style.background_gradient(axis=None, cmap='pink')