Importing Dependencies

In [20]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np 
from sklearn.metrics import accuracy_score

Data Collection and Processing

In [21]:
heart_data = pd.read_csv('heart_disease_data.csv')

In [22]:
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [23]:
heart_data.shape

(303, 14)

In [24]:
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [25]:
heart_data.duplicated().sum()

np.int64(1)

In [26]:
heart_data = heart_data.drop_duplicates()

In [27]:
heart_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [28]:
heart_data['target'].value_counts()
#164 have a heart disease
#138 have no heart disease
# 0 = no heart disease, 1 = heart disease

target
1    164
0    138
Name: count, dtype: int64

Splitting the Features and Target 

In [29]:
X = heart_data.drop(columns='target', axis=1)
Y = heart_data['target']

Splitting Data into Training and Test Data

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


In [31]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((241, 13), (241,), (61, 13), (61,))

Model Training

Logistic Regression

In [32]:
model = LogisticRegression()

In [33]:
#Training the model 
model.fit(X_train, Y_train) #It finds the relationship between the features and the target variable

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [34]:
#Accuracy of training data 
X_train_prediction = model.predict(X_train) #Predict the target variable using the training data
training_data_accuracy = accuracy_score(X_train_prediction, Y_train) #Compare the predicted target variable with the actual target variable
print('Accuracy of training data : ', training_data_accuracy) #Print the accuracy of the training data


Accuracy of training data :  0.8506224066390041


In [35]:
#Accuracy on test data 
X_test_prediction = model.predict(X_test) #Predict the target variable using the test data
test_data_accuracy = accuracy_score(X_test_prediction, Y_test) #Compare the predicted target variable with the actual target variable
print('Accuracy of test data : ', test_data_accuracy) #Print the accuracy of the test data

Accuracy of test data :  0.819672131147541


Building a Predictive System

In [None]:
input_data = (41,0,1,130,204,0,0,172,0,1.4,2,0,2)

#Change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

#Reshape the numpy array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1) #This reshapes the array to 1 row and 13 columns, -1 does not change the number of columns, it just makes it 1 row

prediction = model.predict(input_data_reshaped) #Predict the target variable using the reshaped input data

if prediction[0] == 0: #If the prediction is 0, it means no heart disease
    print('The person does not have heart disease')
else:
    print('The person has heart disease')

The person has heart disease


