In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.shape

(918, 12)

In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


### *Treating outliers with 3-Sigma Rule*

In [5]:
df.shape

(918, 12)

#### *1. Cholesterol*

In [6]:
df[df['Cholesterol']>df['Cholesterol'].mean()+(3*df['Cholesterol'].std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,0


In [7]:
df1 = df[df['Cholesterol']<=df['Cholesterol'].mean()+(3*df['Cholesterol'].std())]
df1.shape

(915, 12)

#### *2. RestingBP*

In [8]:
df[df['RestingBP']>df['RestingBP'].mean()+(3*df['RestingBP'].std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
109,39,M,ATA,190,241,0,Normal,106,N,0.0,Up,0
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1
399,61,M,NAP,200,0,1,ST,70,N,0.0,Flat,1
592,61,M,ASY,190,287,1,LVH,150,Y,2.0,Down,1
732,56,F,ASY,200,288,1,LVH,133,Y,4.0,Down,1
759,54,M,ATA,192,283,0,LVH,195,N,0.0,Up,1


In [9]:
df2 = df1[df1['RestingBP']<=df1['RestingBP'].mean()+(3*df1['RestingBP'].std())]

In [10]:
df2.shape

(908, 12)

#### *3. FastingBS*

In [11]:
df[df['FastingBS']>df['FastingBS'].mean()+(3*df['FastingBS'].std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


#### *4. MaxHR*

In [12]:
df[df['MaxHR']>df['MaxHR'].mean()+(3*df['MaxHR'].std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


#### *5. Oldpeak*

In [13]:
df[df['Oldpeak']>df['Oldpeak'].mean()+(3*df['Oldpeak'].std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1


In [14]:
df3 = df2[df2['Oldpeak']<=df2['Oldpeak'].mean()+(3*df2['Oldpeak'].std())]

In [15]:
df3.shape

(902, 12)

### *Handling string values*

In [17]:
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

#### *Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope object needs to be handled*   

In [18]:
df.Sex.unique()

array(['M', 'F'], dtype=object)

In [19]:
df.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [20]:
df.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [21]:
df.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

In [22]:
df.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [23]:
df4 = df3.copy()
df4.Sex.replace(
    {
        'F': 0,
        'M': 1
    },
    inplace=True
)
df4.ChestPainType.replace(
    {
        'ATA': 1,
        'NAP': 2,
        'ASY': 3,
        'TA': 4
    },
    inplace=True
)
df4.RestingECG.replace(
    {
        'Normal': 1,
        'ST': 2,
        'LVH': 3
    },
    inplace=True
)
df4.ExerciseAngina.replace(
    {
        'N': 0,
        'Y': 1
    },
    inplace=True
)
df4.ST_Slope.replace(
    {
        'Down': 1,
        'Flat': 2,
        'Up': 3
    },
    inplace=True
)
df4.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,3,0
1,49,0,2,160,180,0,1,156,0,1.0,2,1
2,37,1,1,130,283,0,2,98,0,0.0,3,0
3,48,0,3,138,214,0,1,108,1,1.5,2,1
4,54,1,2,150,195,0,1,122,0,0.0,3,0


In [24]:
df4.shape

(902, 12)

### *Handling string values with dummies* 

In [25]:
dataframe = pd.get_dummies(df, drop_first=True)
dataframe.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,1,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,0,True,False,True,False,True,False,False,False,True


In [26]:
dataframe.shape

(918, 16)

In [27]:
inputs = dataframe.drop('HeartDisease',axis=1)
outputs = dataframe.HeartDisease

### *Input Output and scalling*

In [28]:
X = df4.drop('HeartDisease',axis=1)
y = df4.HeartDisease

In [29]:
X.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,1,172,0,0.0,3
1,49,0,2,160,180,0,1,156,0,1.0,2
2,37,1,1,130,283,0,2,98,0,0.0,3
3,48,0,3,138,214,0,1,108,1,1.5,2
4,54,1,2,150,195,0,1,122,0,0.0,3


In [30]:
y.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled[:3]

array([[-1.42896269,  0.51485643, -1.6990351 ,  0.46089071,  0.85238015,
        -0.55087949, -0.74461081,  1.38191324, -0.82065181, -0.84676261,
         1.0456339 ],
       [-0.47545956, -1.94228905, -0.52558207,  1.5925728 , -0.16132855,
        -0.55087949, -0.74461081,  0.75291107, -0.82065181,  0.14079864,
        -0.62072967],
       [-1.74679706,  0.51485643, -1.6990351 , -0.10495034,  0.79657967,
        -0.55087949,  0.50147259, -1.5272218 , -0.82065181, -0.84676261,
         1.0456339 ]])

### *train_test_split*

In [33]:
y.value_counts()

HeartDisease
1    495
0    407
Name: count, dtype: int64

In [34]:
407/495

0.8222222222222222

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, random_state=88)

In [38]:
X_train.shape

(676, 11)

In [39]:
X_test.shape

(226, 11)

In [40]:
y_train.value_counts()

HeartDisease
1    371
0    305
Name: count, dtype: int64

In [41]:
305/371

0.8221024258760108

In [43]:
X.shape

(902, 11)

## *SVM without Bagging*

In [45]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [48]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_model.score(X_test, y_test)

0.8672566371681416

In [56]:
svm_scores = cross_val_score(SVC(), X_scaled, y, cv=5)
svm_scores

array([0.85082873, 0.87292818, 0.81111111, 0.80555556, 0.76111111])

In [57]:
svm_scores.mean()

np.float64(0.8203069367710251)

## *SVM with Bagging*

In [78]:
from sklearn.ensemble import BaggingClassifier
svm_bag_model = BaggingClassifier(
    estimator = SVC(),
    n_estimators = 500,
    max_samples = 0.8,
    oob_score = True,
    random_state = 8
)
svm_bag_model.fit(X_train, y_train)

In [79]:
svm_bag_model.oob_score_

0.8579881656804734

In [80]:
svm_bag_model.score(X_test, y_test)

0.8716814159292036

In [81]:
svm_bag_model = BaggingClassifier(
    estimator = SVC(),
    n_estimators = 1000,
    max_samples = 0.8,
    oob_score = True,
    random_state = 8
)
svm_bag_scores = cross_val_score(svm_bag_model, X_scaled, y, cv=5)

In [82]:
svm_bag_scores.mean()

np.float64(0.8225537139349294)

## *Decision Tree without Bagging*

In [83]:
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier()

In [84]:
tree_model.fit(X_train, y_train)
tree_model.score(X_test, y_test)

0.7964601769911505

In [85]:
tree_scores = cross_val_score(DecisionTreeClassifier(), X_scaled, y, cv=5)
tree_scores

array([0.74033149, 0.77900552, 0.78888889, 0.8       , 0.62222222])

In [86]:
tree_scores.mean()

np.float64(0.7460896255371394)

## *Decision Tree with Bagging*

In [109]:
tree_bag_model = BaggingClassifier(
    estimator = DecisionTreeClassifier(),
    n_estimators = 888,
    max_samples = 0.8,
    oob_score = True,
    random_state = 8
)
tree_bag_model.fit(X_train, y_train)

In [110]:
tree_bag_model.oob_score_

0.856508875739645

In [111]:
tree_bag_model.score(X_test, y_test)

0.8584070796460177

In [112]:
tree_bag_model = BaggingClassifier(
    estimator = DecisionTreeClassifier(),
    n_estimators = 1000,
    max_samples = 0.8,
    oob_score = True,
    random_state = 8
)
tree_bag_scores = cross_val_score(tree_bag_model, X_scaled, y, cv=5)

In [113]:
tree_bag_scores.mean()

np.float64(0.8092387968078576)

## *After all of this we come to a conclution that:*
### *1.Bagging works good on High Variance model eg:*
- *Decision Tree*
- *KNN*
- *High-Degree*
### *2.Bagging doesn't work on Low Variance model eg:* 
- *SVM*
- *KNN*
- *High-Degree*