# Importing Dependencies

In [4]:
import numpy as np
import pandas as pd

# For Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To create Pipeline
from sklearn.pipeline import Pipeline, make_pipeline

# To create a function Transformer
from sklearn.preprocessing import FunctionTransformer

# To create a Column Transformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector

# For Missing Values
from sklearn.impute import SimpleImputer

# Getting the recall score on out train set
from sklearn.metrics import recall_score

# Getting the accuracy score on our train set
from sklearn.metrics import accuracy_score

## Getting the accuracy score on train set
from sklearn.metrics import accuracy_score

## Getting the classification report from our train set
from sklearn.metrics import classification_report

## Gridsearch CV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

## Loading the Dataset

In [6]:
df=pd.read_csv('Data/Cardio_Vascular.csv')

In [7]:
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


## Description of the Dataset

In [10]:
import pandas as pd

data = {
    "Attribute": [
        "General_Health",
        "Checkup",
        "Exercise",
        "Heart_Disease",
        "Skin_Cancer",
        "Other_Cancer",
        "Depression",
        "Diabetes",
        "Arthritis",
        "Sex",
        "Age_Category",
        "Height_(cm)",
        "Weight_(kg)",
        "BMI",
        "Smoking_History",
        "Alcohol_Consumption",
        "Fruit_Consumption",
        "Green_Vegetables_Consumption",
        "FriedPotato_Consumption"
    ],
    "Description": [
        "Self-reported general health status",
        "Time since last routine checkup visit to a doctor",
        "Participation in physical activities",
        "Reporting coronary heart disease or infarction",
        "Reporting having skin cancer",
        "Reporting having other types of cancer",
        "Reporting having a depressive disorder",
        "Reporting having diabetes and its type",
        "Reporting having arthritis",
        "Gender of the respondent",
        "Age category of the respondent",
        "Height of the respondent (cm)",
        "Weight of the respondent (kg)",
        "Body Mass Index (calculated)",
        "Smoking history information",
        "Alcohol consumption information",
        "Fruit consumption information",
        "Consumption of green vegetables",
        "Consumption of fried potatoes"
    ]
}

pd.DataFrame(data)


Unnamed: 0,Attribute,Description
0,General_Health,Self-reported general health status
1,Checkup,Time since last routine checkup visit to a doctor
2,Exercise,Participation in physical activities
3,Heart_Disease,Reporting coronary heart disease or infarction
4,Skin_Cancer,Reporting having skin cancer
5,Other_Cancer,Reporting having other types of cancer
6,Depression,Reporting having a depressive disorder
7,Diabetes,Reporting having diabetes and its type
8,Arthritis,Reporting having arthritis
9,Sex,Gender of the respondent


In [11]:
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


## Data Understanding

In [12]:
#Setting up the Target Variable
target='Heart_Disease'

In [14]:
## Creating Numerical and Categorical Columns
numerical=df.select_dtypes(include=['float64']).columns.sort_values()
categorical=df.select_dtypes(include=['object']).columns.sort_values()

In [15]:
print(f'There are {len(numerical)} Numerical Columns')
print(f'There are {len(categorical) } Categorical Columns')

There are 7 Numerical Columns
There are 12 Categorical Columns


In [18]:
df.describe()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
count,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0
mean,170.615249,83.588655,28.626211,5.096366,29.8352,15.110441,6.296616
std,10.658026,21.34321,6.522323,8.199763,24.875735,14.926238,8.582954
min,91.0,24.95,12.02,0.0,0.0,0.0,0.0
25%,163.0,68.04,24.21,0.0,12.0,4.0,2.0
50%,170.0,81.65,27.44,1.0,30.0,12.0,4.0
75%,178.0,95.25,31.85,6.0,30.0,20.0,8.0
max,241.0,293.02,99.33,30.0,120.0,128.0,128.0


In [19]:
df.describe(include=object)

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Smoking_History
count,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854
unique,5,5,2,2,2,2,2,4,2,2,13,2
top,Very Good,Within the past year,Yes,No,No,No,No,No,No,Female,65-69,No
freq,110395,239371,239381,283883,278860,278976,246953,259141,207783,160196,33434,183590


## Let's Proceed with the basic EDA

## Feature Engineering

In [20]:
df['Heart_Disease']=df['Heart_Disease'].map({'Yes':1,'No':0})
df['Heart_Disease'].value_counts()

0    283883
1     24971
Name: Heart_Disease, dtype: int64

## Splitting the Dataset

In [21]:
from sklearn.model_selection import train_test_split
train,test=train_test_split(df,test_size=0.2,random_state=22,stratify=df['Heart_Disease'])
print(train.shape)
print(test.shape)

(247083, 19)
(61771, 19)


In [23]:
yes = train['Heart_Disease'].value_counts()[0]/len(train['Heart_Disease'])*100
no = train['Heart_Disease'].value_counts()[1]/len(train['Heart_Disease'])*100
print('Train Set')
print(f'ratio of people with heart disease to total is {yes}')
print(f'ratio of people that dont have heart disease to total is {no}')
print('')

yes = test['Heart_Disease'].value_counts()[0]/len(test['Heart_Disease'])*100
no = test['Heart_Disease'].value_counts()[1]/len(test['Heart_Disease'])*100
print('Test Set')
print(f'ratio of people with heart disease to total is {yes}')
print(f'ratio of people that dont have heart disease to total is {no}')

Train Set
ratio of people with heart disease to total is 91.91486261701533
ratio of people that dont have heart disease to total is 8.085137382984666

Test Set
ratio of people with heart disease to total is 91.91530005989866
ratio of people that dont have heart disease to total is 8.084699940101341


In [24]:
## Splitting the X and y variables in the train set
X_train = train.drop("Heart_Disease", axis=1)
y_train = train["Heart_Disease"].copy()

## Splitting the X and y variables in the test set
X_test = test.drop("Heart_Disease", axis=1)
y_test = test["Heart_Disease"].copy()

## Creating Pipelines

In [26]:
from sklearn.preprocessing import OneHotEncoder

cat_pipeline=make_pipeline(OneHotEncoder(handle_unknown='ignore',drop='first'))

In [27]:
from sklearn.preprocessing import StandardScaler

num_pipeline=make_pipeline(FunctionTransformer(np.log1p,feature_names_out='one-to-one'),StandardScaler())

In [28]:
from sklearn.preprocessing import OrdinalEncoder

agecat_pipeline=make_pipeline(OrdinalEncoder())

genhealth_pipeline=make_pipeline(OrdinalEncoder(categories=[['Poor','Fair','Good','Very Good','Excellent']]))

checkup_pipeline = make_pipeline(OrdinalEncoder(categories=[['Within the past year','Within the past 2 years','Within the past 5 years','5 or more years ago','Never']]))

In [30]:
## Setting each column to the pipeline where they will be used
num_pipe_col = numerical

cat_pipe_col = ['Arthritis', 'Depression', 'Diabetes',
       'Exercise', 'Other_Cancer', 'Sex',
       'Skin_Cancer', 'Smoking_History']

### Finalizing the preprocessing pipeline

In [34]:
## Combining all the pipelines and creating a main pipeline to enter all the data
preprocessor = ColumnTransformer([
    ('Categorical', cat_pipeline,   cat_pipe_col),
    ('Age_Category',agecat_pipeline,['Age_Category']),
    ('Checkup',checkup_pipeline,['Checkup']),
    ('Gen_health',genhealth_pipeline,['General_Health']),
    ('Numerical',   num_pipeline,  num_pipe_col),
],remainder='passthrough')
preprocessor

In [35]:
## Using preprocessing pipeline
print(f'Shape before the preprocessing: {X_train.shape}')
X_train_preprocessed=preprocessing.fit_transform(X_train)
print(f'Shape after the preprocessing: {X_train_preprocessed.shape}')

Shape before the preprocessing: (247083, 18)
Shape after the preprocessing: (247083, 20)


In [36]:
# Create pipleine with preprocessing and model
pipeline=Pipeline(steps=[('Preprocessor',preprocessor),('model',RandomForestClassifier())])

In [39]:
# Fit pipeline on training data
pipeline.fit(X_train, y_train)

In [40]:
# Predict on test data
y_pred = pipeline.predict(X_test)

In [41]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.92
