## Data Processing

In [1]:
import numpy as np
import pandas as pd

### Reading Dataset

In [20]:
# Load Heart.csv (path relative to the working directory) into a DataFrame.
# NOTE(review): the 'Unnamed: 0' column looks like a row index that was saved
# into the file; consider index_col=0 if it is not a real feature — confirm.
ds = pd.read_csv('Heart.csv')
# A bare trailing expression renders the frame as the cell output.
ds

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,299,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
299,300,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
300,301,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes
301,302,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal,Yes


### Shape and Dtypes of Data

In [14]:
# (number of rows, number of columns) of the frame.
ds.shape

(297, 15)

In [15]:
# Per-column dtypes; the text columns (ChestPain, Thal, AHD) are stored as object.
ds.dtypes

Unnamed: 0      int64
Age             int64
Sex             int64
ChestPain      object
RestBP          int64
Chol            int64
Fbs             int64
RestECG         int64
MaxHR           int64
ExAng           int64
Oldpeak       float64
Slope           int64
Ca            float64
Thal           object
AHD            object
dtype: object

### Counting and Removing Null Values

In [24]:
# Total number of cells in the frame whose value equals 0.
# Note: this tallies zeros, not missing (NaN) entries; missing values are
# counted separately with isnull()/isna().
ds.eq(0).sum().sum()

985

In [22]:
# Number of missing values in each column (isna is the same check as isnull).
ds.isna().sum()

Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
AHD           0
dtype: int64

In [7]:
# Drop every row that contains at least one missing value and rebind `ds`
# to the cleaned frame (the unfiltered frame is no longer reachable by name).
ds = ds.dropna(axis='index', how='any')

In [8]:
# Re-check the per-column missing-value counts; every entry should now be 0.
ds.isna().sum()

Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            0
Thal          0
AHD           0
dtype: int64

#### Note: Instead of removing rows that contain null values, the nulls can be replaced (imputed) using `ds.fillna()`.

### Calculation of Mean Values

In [12]:
# Arithmetic mean of the Age column.
MeanAge = ds.loc[:, 'Age'].mean()
print("Mean Age : ", MeanAge)

Mean Age :  54.54208754208754


In [30]:
# Predictor columns used as model inputs.
feature_cols = ['Age', 'Sex', 'RestBP', 'Chol']
x = ds[feature_cols]

# Target labels ("Yes"/"No").
# NOTE(review): the saved output for y shows 303 rows although the dropna
# step leaves 297, and the execution counts are out of order — it looks like
# the CSV was re-read after the null rows were dropped. Re-run the notebook
# top to bottom so x/y are built from the cleaned frame.
y = ds['AHD']

In [31]:
# Show the feature frame (Age, Sex, RestBP, Chol).
x

Unnamed: 0,Age,Sex,RestBP,Chol
0,63,1,145,233
1,67,1,160,286
2,67,1,120,229
3,37,1,130,250
4,41,0,130,204
...,...,...,...,...
298,45,1,110,264
299,68,1,144,193
300,57,1,130,131
301,57,0,130,236


In [32]:
# Show the target series taken from the AHD column.
y

0       No
1      Yes
2      Yes
3       No
4       No
      ... 
298    Yes
299    Yes
300    Yes
301    Yes
302     No
Name: AHD, Length: 303, dtype: object

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [34]:
# Show the training portion of the features returned by train_test_split.
x_train

Unnamed: 0,Age,Sex,RestBP,Chol
287,58,1,125,220
282,55,0,128,205
197,45,0,138,236
158,60,1,140,293
164,48,1,124,255
...,...,...,...,...
188,54,1,192,283
71,67,1,125,254
106,59,1,140,177
270,61,1,140,207


In [35]:
# Show the training portion of the target labels.
y_train

287     No
282    Yes
197     No
158    Yes
164     No
      ... 
188    Yes
71     Yes
106    Yes
270    Yes
102     No
Name: AHD, Length: 227, dtype: object

In [36]:
# Show the held-out (test) portion of the features.
x_test

Unnamed: 0,Age,Sex,RestBP,Chol
179,53,1,130,246
228,54,1,110,206
111,56,1,125,249
246,58,1,100,234
60,51,0,130,305
...,...,...,...,...
22,58,1,120,284
258,70,1,156,245
56,50,1,140,233
242,49,0,130,269


In [37]:
# Show the held-out (test) portion of the target labels.
y_test

179     No
228    Yes
111    Yes
246    Yes
60     Yes
      ... 
22     Yes
258     No
56     Yes
242     No
114    Yes
Name: AHD, Length: 76, dtype: object