Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

Data Collection and Processing

In [2]:
# Read the training CSV into a pandas DataFrame.
# NOTE(review): this is an absolute path; adjust DATA_PATH when the file lives elsewhere.
DATA_PATH = '/content/cvd_train.csv'
heart_data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check the column layout and value formats
heart_data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# Preview the last five rows of the dataset
heart_data.tail()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1
69999,99999,20540,1,170,72.0,120,80,2,1,0,0,1,0


In [5]:
# (rows, columns) of the dataset as loaded, before any filtering
heart_data.shape

(70000, 13)

In [6]:
# Per-column non-null counts and dtypes, plus overall memory usage
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [7]:
# Count missing values in each column
heart_data.isnull().sum()

Unnamed: 0,0
id,0
age,0
gender,0
height,0
weight,0
ap_hi,0
ap_lo,0
cholesterol,0
gluc,0
smoke,0


In [8]:
# Count rows that exactly repeat an earlier row.
# Note: duplicated() compares every column, including 'id', so rows that
# differ only in their id are NOT counted as duplicates here.
heart_data.duplicated().sum()

np.int64(0)

Adding BMI, converting age to years, and filtering blood-pressure and age ranges

In [9]:
# Add a body-mass-index column right after 'weight' (position 5):
# weight / (height / 100) ** 2, rounded to 2 decimals.
# (presumably weight is in kg and height in cm - confirm against the data source)
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run without restarting the kernel.
if 'bmi' not in heart_data.columns:
    heart_data.insert(5, 'bmi', round((heart_data['weight']/(heart_data['height']/100)**2), 2))

# Drop rows whose blood-pressure readings fall outside the accepted limits
# (ap_hi within 40..220, ap_lo within 40..180).
heart_data = heart_data.drop(heart_data.query('ap_hi>220 or ap_lo>180 or ap_hi<40 or ap_lo<40').index, axis=0)

# Drop rows where ap_hi is lower than ap_lo (inconsistent pair of readings).
heart_data = heart_data.drop(heart_data.query('ap_hi<ap_lo').index, axis=0)

In [10]:
# 'age' is stored in days in the raw file; convert it to years (2 decimals).
# Re-running an unguarded conversion would divide by 365.25 a second time and
# the filter below would then discard every row, so convert only while the
# column still holds day counts (values far above any plausible age in years).
MAX_PLAUSIBLE_AGE_YEARS = 150
if heart_data['age'].max() > MAX_PLAUSIBLE_AGE_YEARS:
    heart_data['age'] = round(heart_data['age']/365.25,2)

# Keep only people aged 30 to 60 years (inclusive).
heart_data = heart_data.drop(heart_data.query('age<30 or age>60').index, axis=0)

In [11]:
# Show the filtered frame (now with 'bmi' added and 'age' expressed in years)
heart_data

Unnamed: 0,id,age,gender,height,weight,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50.36,2,168,62.0,21.97,110,80,1,1,0,0,1,0
1,1,55.38,1,156,85.0,34.93,140,90,3,1,0,0,1,1
2,2,51.63,1,165,64.0,23.51,130,70,3,1,0,0,0,1
3,3,48.25,2,169,82.0,28.71,150,100,1,1,0,0,1,1
4,4,47.84,1,156,56.0,23.01,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69993,99991,53.93,1,172,70.0,23.66,130,90,1,1,0,0,1,1
69994,99992,57.70,1,165,80.0,29.38,150,80,1,1,0,0,1,1
69995,99993,52.68,2,168,76.0,26.93,120,80,1,1,1,0,1,0
69997,99996,52.20,2,183,105.0,31.35,180,90,3,1,0,1,0,1


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the filtered dataset
heart_data.describe()

Unnamed: 0,id,age,gender,height,weight,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0
mean,49972.222319,51.291317,1.346267,164.563371,73.970531,27.395412,125.668292,80.93614,1.332366,1.21002,0.091626,0.055833,0.80493,0.456367
std,28865.40711,5.769708,0.475784,8.139999,14.373405,6.053078,16.441005,9.445798,0.648131,0.551714,0.2885,0.229602,0.396258,0.498097
min,0.0,30.02,1.0,57.0,11.0,3.47,60.0,40.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,24941.0,47.37,1.0,159.0,65.0,23.81,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50021.0,52.06,1.0,165.0,72.0,26.22,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74861.0,55.99,2.0,170.0,82.0,30.06,135.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,60.0,2.0,250.0,200.0,298.67,220.0,170.0,3.0,3.0,1.0,1.0,1.0,1.0


In [13]:
# Class balance of the target column (cardio) after the filtering above
heart_data['cardio'].value_counts()

Unnamed: 0_level_0,count
cardio,Unnamed: 1_level_1
0,30544
1,25641


0 represents a person without heart disease

1 represents a person with heart disease



Splitting the Features and Target

In [14]:
# Target vector: the 'cardio' label column.
Y = heart_data['cardio']

# Feature matrix: every remaining column except the row identifier 'id',
# which carries no predictive information about the person.
X = heart_data.drop(columns=['cardio', 'id'])

In [15]:
# Plain-text dump of the feature matrix, followed by per-column summary
# statistics (only the last expression, X.describe(), is rendered as a table).
print(X)
X.describe()

         age  gender  height  weight    bmi  ap_hi  ap_lo  cholesterol  gluc  \
0      50.36       2     168    62.0  21.97    110     80            1     1   
1      55.38       1     156    85.0  34.93    140     90            3     1   
2      51.63       1     165    64.0  23.51    130     70            3     1   
3      48.25       2     169    82.0  28.71    150    100            1     1   
4      47.84       1     156    56.0  23.01    100     60            1     1   
...      ...     ...     ...     ...    ...    ...    ...          ...   ...   
69993  53.93       1     172    70.0  23.66    130     90            1     1   
69994  57.70       1     165    80.0  29.38    150     80            1     1   
69995  52.68       2     168    76.0  26.93    120     80            1     1   
69997  52.20       2     183   105.0  31.35    180     90            3     1   
69999  56.24       1     170    72.0  24.91    120     80            2     1   

       smoke  alco  active  
0         

Unnamed: 0,age,gender,height,weight,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
count,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0,56185.0
mean,51.291317,1.346267,164.563371,73.970531,27.395412,125.668292,80.93614,1.332366,1.21002,0.091626,0.055833,0.80493
std,5.769708,0.475784,8.139999,14.373405,6.053078,16.441005,9.445798,0.648131,0.551714,0.2885,0.229602,0.396258
min,30.02,1.0,57.0,11.0,3.47,60.0,40.0,1.0,1.0,0.0,0.0,0.0
25%,47.37,1.0,159.0,65.0,23.81,120.0,80.0,1.0,1.0,0.0,0.0,1.0
50%,52.06,1.0,165.0,72.0,26.22,120.0,80.0,1.0,1.0,0.0,0.0,1.0
75%,55.99,2.0,170.0,82.0,30.06,135.0,90.0,1.0,1.0,0.0,0.0,1.0
max,60.0,2.0,250.0,200.0,298.67,220.0,170.0,3.0,3.0,1.0,1.0,1.0


In [16]:
# Plain-text dump of the target labels, indexed like the rows of X
print(Y)

0        0
1        1
2        1
3        1
4        0
        ..
69993    1
69994    1
69995    0
69997    1
69999    0
Name: cardio, Length: 56185, dtype: int64


Splitting data into training and testing sets and standardizing the features

In [17]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class ratio the
# same in both splits; random_state pins the shuffle so the split, and every
# metric computed from it, is reproducible after a kernel restart.
# (StandardScaler is already imported in the notebook's first cell.)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, train_size=0.8, stratify=Y, random_state=42
)

# Work on explicit copies: assigning scaled columns into frames that pandas
# tracks as derived from X can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Columns to standardize; the 0/1 flags (smoke, alco, active) are left untouched.
numeric=['age', 'gender', 'height', 'weight', 'bmi', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc']
sc=StandardScaler()

# Fit the scaler on the training split only, then reuse the same mean/std on
# the test split so no test-set statistics leak into training.
X_train[numeric]=sc.fit_transform(X_train[numeric])
X_test[numeric]=sc.transform(X_test[numeric])

In [18]:
# Shapes of the full feature matrix, the training split and the test split
print(X.shape, X_train.shape, X_test.shape)

(56185, 12) (44948, 12) (11237, 12)


Model Training

Logistic Regression

In [19]:
# Logistic-regression classifier created with scikit-learn's default settings
model=LogisticRegression()

In [20]:
# Fit the classifier on the standardized training features and their labels
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [21]:
# Accuracy on training data.
# accuracy_score expects (y_true, y_pred); keeping that order matters as soon
# as this call is swapped for an asymmetric metric such as precision or recall.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [22]:
# Report the fraction of training rows classified correctly
print('Accuracy on training data : ', training_data_accuracy)

Accuracy on training data :  0.7345599359259589


In [23]:
# Accuracy on the held-out testing data.
# accuracy_score expects (y_true, y_pred); keeping that order matters as soon
# as this call is swapped for an asymmetric metric such as precision or recall.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [24]:
# Report the fraction of held-out test rows classified correctly
print('Accuracy on testing data : ', test_data_accuracy)

Accuracy on testing data :  0.7421909762392097


Building a predictive system

In [25]:

# Raw (unscaled) measurements for one person, in the same column order as X:
# age, gender, height, weight, bmi, ap_hi, ap_lo, cholesterol, gluc, smoke, alco, active
input_data = (55.38,1,156,85,34.93,140,90,3,1,0,0,1)

# Changing data to numpy array
input_data_arr = np.asarray(input_data)

# Reshaping the numpy array for only one instance
input_data_reshaped = input_data_arr.reshape(1,-1)

# Label the single row with the training column names so the model receives
# the same feature layout it was fitted on.
input_df = pd.DataFrame(input_data_reshaped, columns=X_train.columns)

# The model was trained on standardized features, so the same fitted scaler
# must be applied here; feeding raw values would put the input on a completely
# different scale from the training data and make the prediction meaningless.
input_df[numeric] = sc.transform(input_df[numeric])

prediction = model.predict(input_df)
print(prediction)

if(prediction[0]==0):
  print('Person does not have a Heart Disease')
else:
  print('Person has Heart Disease')

[1]
Person has Heart Disease


