Importing the dependencies

In [69]:
import numpy as np #used to create numpy arrays
import pandas as pd #used to create dataframes i.e. structured tables
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split #divide the data into training data and testing data
from sklearn.preprocessing import StandardScaler #helps to standardize values to a common range
from sklearn.metrics import mean_squared_error , accuracy_score # to evaluate how good our model is 
# mean_squared_error gives the average of the square of the difference between the observed and predicted values of a variable.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

Data collection and pre-processing


In [47]:
data = pd.read_csv('heart.csv')

In [48]:
data.head() #print first 5 rows of the dataset
# data.head(10) #prints first 10 rows of the dataset

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [49]:
data.tail() #print last 5 rows of the dataset
# data.tail(10) #prints last 10 rows

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [50]:
data.shape #tells the size of our dataset

(303, 14)

In [51]:
data.info() #used to print the info about the columns of the dataset where non-null tells that there are no null values in the column and dtype tells the data-type of that particular column.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [52]:
#another method to find out the number of null values in a column.
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [53]:
#gives the statistical measures about the data
#25% tells that 25% of values in the age column are less than the 47.5
#mean is the mean of all the data
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [54]:
#gives the value count on the basis of values in the given column
data['target'].value_counts()

target
1    165
0    138
Name: count, dtype: int64

Splitting the data into train and test

In [55]:
#features
# X contains all the columns except the target column.
X = data.drop(columns = 'target' , axis = 1) #axis is 1 while dropping a column and axis is 0 while dropping a row.
#target
# Y contains only the target column
Y = data['target']

In [56]:
print(X)
print(Y)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak   
0     63    1   3       145   233    1        0      150      0      2.3  \
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  
0        0   0     1  
1        0   0     2  
2        2   0    

In [57]:
#X_train contains all the train data whereas X_test contains all the test data.
#Y_train contains the target data for training data and Y_test contains target for the test data.
#test_size tells that the test data is 20% of all the data hence train_data is 80%
#stratify tells that the two values i.e. 0 and 1 must be evenly spread across the training data and test data.
# random_state tells that ones data will be split in the same way other one's data is splitted who uses random_state as 2
X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [58]:
print(X.shape , X_train.shape , X_test.shape)

(303, 13) (242, 13) (61, 13)


In [59]:
scaler = StandardScaler()

In [60]:
# fitting the data to the standardScaler function
# standardize the data using this X_train and transform the data based on this scalar
# when we are fitting this X_train to a scalar it will transform all the values of this X_train to a common range
scaler.fit(X_train)

In [61]:
# now since we have fitted the data and now need to transform the data (converts all the values to a same range)
X_train = scaler.transform(X_train)
# transforming the X_test data
# fit only the X_train data and on that basis transform the X_test data 
# becoz we don't want to show the test to our ml model in prior.  
X_test = scaler.transform(X_test)

In [62]:
print(X_train)

[[-0.04180248  0.69617712  0.04467671 ...  0.96628239 -0.69876652
   1.18825929]
 [ 2.48724773  0.69617712 -0.93821081 ...  0.96628239  2.28537756
  -0.50326276]
 [ 0.17811493  0.69617712  0.04467671 ... -2.30421185 -0.69876652
  -0.50326276]
 ...
 [ 1.49761939  0.69617712 -0.93821081 ... -0.66896473  1.29066287
   1.18825929]
 [ 0.61794975  0.69617712  1.02756422 ... -0.66896473 -0.69876652
  -0.50326276]
 [-0.59159601 -1.43641607  0.04467671 ... -0.66896473 -0.69876652
  -0.50326276]]


In [63]:
model = RandomForestClassifier()
model.fit(X_train , Y_train)
X_test_prediction = model.predict(X_test)
print(accuracy_score(X_test_prediction , Y_test))

0.7868852459016393


In [64]:
model2 = SVC()
model2.fit(X_train , Y_train)
X_test_prediction2 = model2.predict(X_test)
print(accuracy_score(X_test_prediction2 , Y_test))

0.8032786885245902


In [65]:
model3 = GaussianNB()
model3.fit(X_train , Y_train)
X_test_prediction3 = model3.predict(X_test)
print(accuracy_score(X_test_prediction3 , Y_test))

0.819672131147541


In [72]:
model4 = LogisticRegression()
model4.fit(X_train , Y_train)
X_test_prediction4 = model4.predict(X_test)
print(accuracy_score(X_test_prediction4 , Y_test))

0.7868852459016393


In [74]:
import pickle

# Save the trained model as a pickle string.

filename = 'heart.pkl'

pickle.dump(model2,open(filename,'wb'))


In [85]:
input_data = (62,0,0,140,268,0,0,160,0,3.6,0,2,2)


# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model2.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

[1]
The Person has Heart Disease
