In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw dataset. The hard-coded /content path (presumably a Colab upload
# directory) raised FileNotFoundError in the recorded run, so fall back to a copy
# of the file in the current working directory when that location is missing.
DATA_FILE = 'heart_attack_prediction_dataset.csv'
try:
    df = pd.read_csv('/content/' + DATA_FILE)
except FileNotFoundError:
    df = pd.read_csv(DATA_FILE)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/heart_attack_prediction_dataset.csv'

# Introduction
    This synthetic dataset provides a comprehensive array of features relevant to heart health and lifestyle choices, encompassing patient-specific details such as age, gender, cholesterol levels, blood pressure, heart rate, and indicators like diabetes, family history, smoking habits, obesity, and alcohol consumption. Additionally, lifestyle factors like exercise hours, dietary habits, stress levels, and sedentary hours are included. Medical aspects comprising previous heart problems, medication usage, and triglyceride levels are considered. Socioeconomic aspects such as income and geographical attributes like country, continent, and hemisphere are incorporated. The dataset, consisting of 8763 records from patients around the globe, culminates in a crucial binary classification feature denoting the presence or absence of a heart attack risk, providing a comprehensive resource for predictive analysis and research in cardiovascular health.

### Dataset Glossary (column-wise)
    - Patient ID: Unique identifier for each patient
    - Age: Age of the patient
    - Sex: Gender of the patient (Male/Female)
    - Cholesterol: Cholesterol levels of the patient
    - Blood Pressure: Blood pressure of the patient (systolic/diastolic)
    - Heart Rate: The heart rate of the patient
    - Diabetes: Whether the patient has diabetes (Yes/No)
    - Family History: Family history of heart-related problems (1: Yes, 0: No)
    - Smoking: Smoking status of the patient (1: Smoker, 0: Nonsmoker)
    - Obesity: Obesity status of the patient (1: Obese, 0: Not obese)
    - Alcohol Consumption: Level of alcohol consumption by the patient (None/Light/Moderate/Heavy)
    - Exercise Hours Per Week: Number of exercise hours per week
    - Diet: Dietary habits of the patient (Healthy/Average/Unhealthy)
    - Previous Heart Problems: Previous heart problems of the patient (1:Yes, 0:No)
    - Medication Use: Medication usage by the patient (1: Yes, 0: No)
    - Stress Level: Stress level reported by the patient (1-10)
    - Sedentary Hours Per Day: Hours of sedentary activity per day
    - Income: The income level of the patient
    - BMI: Body Mass Index (BMI) of the patient
    - Triglycerides: Triglyceride levels of the patient
    - Physical Activity Days Per Week: Days of physical activity per week
    - Sleep Hours Per Day: Hours of sleep per day
    - Country: Country of the patient
    - Continent: Continent where the patient resides
    - Hemisphere: Hemisphere where the patient resides
    - Heart Attack Risk: Presence of heart attack risk (1: Yes, 0: No)

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
def data_info(data):
    """Summarise every column of ``data`` in one overview table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the fields:

        - ``Col``: column label
        - ``Dtypes``: dtype of the column
        - ``N Uniques``: result of ``nunique()`` for the column
        - ``Null``: number of null entries
        - ``Uniques``: array with (at most) the 10 most frequent values
        - ``duplicated``: number of duplicated rows in the *whole* frame,
          repeated on every row of the summary
    """
    cols = data.columns
    # The duplicate-row count is a property of the whole frame, not of a single
    # column, so compute it once instead of once per column.
    n_duplicated_rows = data.duplicated().sum()

    unique_val = [data[col].value_counts().head(10).index.to_numpy() for col in cols]
    n_uniques = [data[col].nunique() for col in cols]
    dtypes = [data[col].dtype for col in cols]
    nulls = [data[col].isnull().sum() for col in cols]
    dup = [n_duplicated_rows] * len(cols)
    return pd.DataFrame({"Col": cols, "Dtypes": dtypes, "N Uniques": n_uniques,
                         "Null": nulls, "Uniques": unique_val, "duplicated": dup})


# Build the per-column overview of the raw data and show it as the cell output.
df_info = data_info(df)
df_info

In [None]:
# Show the column labels of the frame.
df.columns

In [None]:
# Drop the id feature because it's 100% unique and therefore carries no signal.
# Reassign instead of dropping in place, and ignore an already-missing column,
# so the cell can be re-run without raising KeyError.
df = df.drop(columns='Patient ID', errors='ignore')

### Exploratory Data Analysis

In [None]:
# Bar chart of the category frequencies for each categorical feature.
ctg_df = df[['Sex', 'Diet', 'Continent', 'Country', 'Hemisphere']]
for i in ctg_df.columns:
    ctg_num = ctg_df[i].value_counts()
    # Create the figure inside the loop: after plt.show() the next plot starts on
    # a fresh default-sized figure, so a single figure created before the loop
    # would apply the requested size to the first chart only.
    plt.figure(figsize=(8, 7))
    chart = sns.barplot(x=ctg_num.index, y=ctg_num, palette='viridis')
    plt.title(i)
    plt.xlabel(i)
    plt.ylabel('Count')
    # Rotate the existing tick labels (needed for the long 'Country' axis)
    # without re-assigning the label texts.
    chart.tick_params(axis='x', labelrotation=90)
    plt.show()

### Correlation between Variables using HeatMap

In [None]:
# Columns entering the correlation matrix; the target is listed last so its
# correlations end up in the last row/column of the heatmap.
corr_columns = [
    'Age', 'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity',
    'Cholesterol', 'Alcohol Consumption', 'Exercise Hours Per Week',
    'Previous Heart Problems', 'Medication Use', 'Stress Level',
    'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
    'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Heart Attack Risk',
]
corr_matrix = df[corr_columns].corr()

plt.figure(figsize=(19, 10))
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)

#### Key Findings

  - Heart Attack Risk has its highest correlations with Diabetes, Cholesterol and Exercise Hours Per Week
  - Heart Attack Risk is not much dependent on Sedentary Hours Per Day
  - Alcohol Consumption has no strong link with Heart Attack Risk
  - Smoking does not appear to be a major driver of Heart Attack Risk in this dataset

In [None]:
# Cholesterol distribution per continent, split by sex.
plt.figure(figsize=(9, 4))
sns.set_theme(style="ticks", palette="pastel")
sns.boxplot(
    data=df,
    x="Continent",
    y='Cholesterol',
    hue="Sex",
    palette=["m", "g"],
)
plt.show()


In [None]:
# Exercise Hours Per Week distribution per continent, split by sex.
plt.figure(figsize=(9, 4))
sns.set_theme(style="ticks", palette="pastel")
sns.boxplot(x="Continent", y='Exercise Hours Per Week', hue="Sex",
            palette=["b", "r"], data=df)
# End with plt.show(), like the cholesterol box plot cell, so the cell renders
# the figure only instead of also echoing the returned Axes object.
plt.show()

# Check outliers

In [None]:
# Continuous features that are inspected for outliers (and capped further below).
continuous_Features = [
    "Age", "Cholesterol", "Heart Rate", "Exercise Hours Per Week",
    "Sedentary Hours Per Day", "Income", "BMI", "Triglycerides",
]

plt.figure(figsize=(15, 15))
# One panel per feature on a 2 x 4 grid; subplot positions are 1-based.
for position, feature in enumerate(continuous_Features, start=1):
    plt.subplot(2, 4, position)
    sns.boxenplot(df[feature])
    plt.title(f"Box plot for {feature}")
plt.show()

# Cap outliers (values beyond the 1.5 * IQR fences are set to the fence value)

In [None]:
# Cap outliers of every continuous feature at the Tukey fences
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR); no rows are removed.
for col in continuous_Features:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # clip() sets values below/above the fences to the fence value in one
    # vectorised pass. Assign the result back explicitly: calling
    # replace(..., inplace=True) on df[col] is chained in-place mutation, which
    # pandas deprecates and which leaves df unchanged under Copy-on-Write.
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

## FEATURE ENGINEERING

- Splitting the 'Blood Pressure' column ("systolic/diastolic") into separate systolic and diastolic blood pressure columns

In [None]:
# 'Blood Pressure' is stored as "systolic/diastolic". Split it once with the
# vectorised string accessor instead of running two row-wise apply(lambda) passes.
# The parts are still strings here; a later cell converts them to int.
bp_parts = df['Blood Pressure'].str.split('/', expand=True)
df['BP_Systolic'] = bp_parts[0]
df['BP_Diastolic'] = bp_parts[1]

- Ordinal encoding for Diet: 0: Unhealthy, 1: Average, 2: Healthy

In [None]:
# The position in this list is the code assigned by the encoder:
# Unhealthy -> 0, Average -> 1, Healthy -> 2.
diet_levels = ["Unhealthy", "Average", "Healthy"]
oe_encoder = OrdinalEncoder(categories=[diet_levels])
diet_codes = oe_encoder.fit_transform(df[['Diet']])
df['Diet'] = diet_codes

- Label encoding for 'Sex' (Gender)

In [None]:
# Map the 'Sex' labels to integer codes; LabelEncoder numbers the sorted classes.
la = LabelEncoder()
la.fit(df['Sex'])
df['Sex'] = la.transform(df['Sex'])

In [None]:
# Inspect the dtype of every column after the encoding steps.
df.dtypes

- Converting the new blood pressure columns from 'object' dtype to int

In [None]:
# The split blood pressure parts are strings; cast each column to int.
cat_columns = ['BP_Systolic', 'BP_Diastolic']
for bp_col in cat_columns:
    df[bp_col] = df[bp_col].astype(int)

In [None]:
# Re-check the dtypes after the integer conversion.
df.dtypes

In [None]:
# Drop the raw 'Blood Pressure' string (already split into BP_Systolic and
# BP_Diastolic) together with the geographic columns, which are not encoded.
# Reassign instead of dropping in place, and ignore already-missing columns,
# so the cell can be re-run without raising KeyError.
df = df.drop(columns=['Blood Pressure', 'Hemisphere', 'Continent', 'Country'],
             errors='ignore')
df.head()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Separate the target column (y) from the feature matrix (x).
target_col = 'Heart Attack Risk'
y = df[target_col]
x = df.drop(columns=[target_col])

In [None]:
# Preview the first rows of the feature matrix.
x.head()

# Splitting data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Gaussian Naive Bayes; fit() returns the estimator itself, which is shown as the cell output.
model1 = GaussianNB().fit(x_train, y_train)
model1

In [None]:
# Test-set accuracy of the Gaussian Naive Bayes model.
y_pred = model1.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Decision tree. Tree construction involves random tie-breaking, so pin
# random_state (same seed as the train/test split) to make the reported
# accuracy reproducible across runs.
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(x_train, y_train)

In [None]:
# Test-set accuracy of the decision tree.
y_pred = model2.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Random forest. Bootstrapping and feature sub-sampling are random, so pin
# random_state (same seed as the train/test split) to make the reported
# accuracy reproducible across runs.
model3 = RandomForestClassifier(random_state=42)
model3.fit(x_train, y_train)

In [None]:
# Test-set accuracy of the random forest.
y_pred = model3.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# k-nearest neighbours with the default settings; fit() returns the estimator itself,
# which is shown as the cell output.
model4 = KNeighborsClassifier().fit(x_train, y_train)
model4

In [None]:
# Test-set accuracy of the default k-nearest-neighbours model.
y_pred = model4.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

- Hyperparameter tuning for KNN (grid search)

In [None]:
# 5-fold cross-validated grid search over the neighbourhood size (n_neighbors)
# and the Minkowski power parameter (p) of the KNN classifier.
Knn = KNeighborsClassifier()
param_grid = dict(n_neighbors=[100, 1000], p=[1, 2])
grid_search = GridSearchCV(estimator=Knn, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

In [None]:
# Show the estimator configured with the best parameter combination from the grid search.
grid_search.best_estimator_

In [None]:
# Test-set accuracy of the tuned k-nearest-neighbours model.
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)