In [1]:
## ==> Imports

# %pip install kagglehub
import kagglehub
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset",)

print(f"Dataset downloaded to: {path}")


  from .autonotebook import tqdm as notebook_tqdm


Dataset downloaded to: C:\Users\saadz\.cache\kagglehub\datasets\johnsmith88\heart-disease-dataset\versions\2


In [2]:
## ==> Data Collection and Processing

heart_data = pd.read_csv(path + "/heart.csv")
print(f"Data shape: {heart_data.shape}")
heart_data.tail()


Data shape: (1025, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1
1024,54,1,0,120,188,0,1,113,0,1.4,1,1,3,0


In [3]:
heart_data.info()
heart_data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [4]:
heart_data['target'].value_counts()

target
1    526
0    499
Name: count, dtype: int64

## 1-->Heart Disease Patients
## 0 --> No Heart Disease Patients

In [5]:
## ==> Train Test Split

X = heart_data.drop('target', axis=1)
y = heart_data['target']
print(X)


      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0        1      113      0      1.4   

      slope  ca  thal  
0         2   2     3  
1         0   0     3  
2  

In [6]:
print(y)

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify= y, random_state=42)


In [8]:
print(X_train.shape, X_test.shape, X.shape)


(820, 13) (205, 13) (1025, 13)


In [9]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [10]:
X_train_predictions = model.predict(X_train)
training_accuracy = accuracy_score(y_train, X_train_predictions)
print(f"Training accuracy: {training_accuracy:.5f}")

Training accuracy: 0.84878


In [11]:
X_test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, X_test_predictions)
print(f"Test accuracy: {test_accuracy:.5f}")


Test accuracy: 0.81463


## Prediction System

In [12]:
input_data = (62, 0, 0, 140, 268, 0, 0, 160, 0, 3.6, 0, 2, 2)

# Convert input data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)
# print(f"Prediction: {prediction[0]}")

if prediction[0] == 1:
    print("The person is likely to have heart disease.")
else:
    print("The person is unlikely to have heart disease.")



The person is unlikely to have heart disease.
