In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('heart_disease_dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,0,75,Female,228,119,66,Current,Heavy,1,No,No,Yes,8,119,Yes,Atypical Angina,Yes
1,1,48,Male,204,165,62,Current,,5,No,No,No,9,70,Yes,Typical Angina,No
2,2,53,Male,234,91,67,Never,Heavy,3,Yes,No,Yes,5,196,Yes,Atypical Angina,Yes
3,3,69,Female,192,90,72,Current,,4,No,Yes,No,7,107,Yes,Non-anginal Pain,No
4,4,62,Female,172,163,93,Never,,6,No,Yes,No,2,183,Yes,Asymptomatic,No


In [4]:
df.isnull().sum()

Unnamed: 0                   0
Age                          0
Gender                       0
Cholesterol                  0
Blood Pressure               0
Heart Rate                   0
Smoking                      0
Alcohol Intake             340
Exercise Hours               0
Family History               0
Diabetes                     0
Obesity                      0
Stress Level                 0
Blood Sugar                  0
Exercise Induced Angina      0
Chest Pain Type              0
Heart Disease                0
dtype: int64

In [5]:
df.shape

(1000, 17)

In [6]:
# finding most frequent value to replace Alcohol with
alcohol_intake_mode = df['Alcohol Intake'].mode()[0]
alcohol_intake_mode

'Heavy'

In [7]:
# replaceing most frequent value to NaN
df['Alcohol Intake'] = df['Alcohol Intake'].fillna(alcohol_intake_mode)

In [8]:
df.isnull().sum() # all NaN values are replaced

Unnamed: 0                 0
Age                        0
Gender                     0
Cholesterol                0
Blood Pressure             0
Heart Rate                 0
Smoking                    0
Alcohol Intake             0
Exercise Hours             0
Family History             0
Diabetes                   0
Obesity                    0
Stress Level               0
Blood Sugar                0
Exercise Induced Angina    0
Chest Pain Type            0
Heart Disease              0
dtype: int64

<h3>Train Test Split</h3>

In [9]:
df.drop(columns=['Unnamed: 0'], inplace=True) # droping index column

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Heart Disease'), df['Heart Disease'], test_size=0.1)

In [11]:
X_train.shape, X_test.shape

((900, 15), (100, 15))

<h3>Encoding Categorical Data</h3>

In [12]:
df.head(3)

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,Female,228,119,66,Current,Heavy,1,No,No,Yes,8,119,Yes,Atypical Angina,Yes
1,48,Male,204,165,62,Current,Heavy,5,No,No,No,9,70,Yes,Typical Angina,No
2,53,Male,234,91,67,Never,Heavy,3,Yes,No,Yes,5,196,Yes,Atypical Angina,Yes


In [13]:
df.groupby('Chest Pain Type').size()

Chest Pain Type
Asymptomatic        248
Atypical Angina     246
Non-anginal Pain    256
Typical Angina      250
dtype: int64

In [14]:
# using column transformer for all encoding

transformer = ColumnTransformer([
    ('ord1', OrdinalEncoder(categories=[['Never', 'Former', 'Current'], ['Moderate', 'Heavy'], ['Asymptomatic', 'Non-anginal Pain', 'Atypical Angina', 'Typical Angina']]),
                            ['Smoking', 'Alcohol Intake', 'Chest Pain Type']),
    # Asymptomatic (no symptoms)
    # Non-anginal Pain (pain not related to angina)
    # Atypical Angina (atypical or less common form of angina)
    # Typical Angina (most common form of angina)

    ('ohe1', OneHotEncoder(drop='first', sparse_output=False), ['Gender', 'Family History', 'Diabetes', 'Obesity', 'Exercise Induced Angina']),
], remainder='passthrough',
   force_int_remainder_cols=False)

In [15]:
# applying label Encoder on y_train

le = LabelEncoder()

y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

In [16]:
y_train_transformed.shape

(900,)

In [17]:
# applying label encoding
le = LabelEncoder()

y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

In [18]:
y_train_transformed.shape, y_test_transformed.shape

((900,), (100,))

<h3>Training Model Using Pipeline</h3>

In [19]:
pipe = Pipeline(steps=[
    ('preprocessing', transformer),
    ('scaling', StandardScaler()),
    ('training', LogisticRegression())
])

In [20]:
pipe.fit(X_train, y_train_transformed)

In [21]:
# predicting from the test case
y_pred = pipe.predict(X_test)

y_pred

array([1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0])

<h3>Finding Accuracy of Model</h3>

In [22]:
print(f"The Accuracy of Model is: {np.around(accuracy_score(y_test_transformed, y_pred) * 100, 2)}%")

The Accuracy of Model is: 85.0%


<h3>Finding Cross Validation Score of Model</h3>

In [23]:
print(f"The Cross Validation Score of Model is: {np.around(cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=10, scoring='accuracy').mean() * 100, 2)}%")

The Cross Validation Score of Model is: 86.78%
