### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, log_loss


### Data Importing

In [2]:
df = pd.read_csv("/kaggle/input/heart-failure-prediction/heart.csv")
data = df.copy()

#### Exploratery Data Analysis (EDA)

In [3]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


#### Define features and Target

In [4]:
X = data.iloc[:, :-1]
y= data.iloc[:, -1]

In [5]:
X.sample(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
390,51,M,ASY,140,0,0,Normal,60,N,0.0,Flat
699,57,M,ASY,110,201,0,Normal,126,Y,1.5,Flat
698,41,M,NAP,130,214,0,LVH,168,N,2.0,Flat
387,53,M,ASY,130,0,0,LVH,135,Y,1.0,Flat
806,70,M,ASY,145,174,0,Normal,125,Y,2.6,Down


#### Encode Object Data 

In [6]:
encoder = LabelEncoder()
object_cols = X.select_dtypes(include=['object'])
object_cols

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,M,ATA,Normal,N,Up
1,F,NAP,Normal,N,Flat
2,M,ATA,ST,N,Up
3,F,ASY,Normal,Y,Flat
4,M,NAP,Normal,N,Up
...,...,...,...,...,...
913,M,TA,Normal,N,Flat
914,M,ASY,Normal,N,Flat
915,M,ASY,Normal,Y,Flat
916,F,ATA,LVH,N,Flat


In [7]:
for col in object_cols.columns:
    X[col]=encoder.fit_transform(object_cols[col])

In [8]:
X.sample(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
333,40,1,0,95,0,1,2,144,0,0.0,2
818,51,1,0,140,299,0,1,173,1,1.6,2
341,64,1,0,110,0,1,1,114,1,1.3,0
601,61,1,2,140,284,0,1,123,1,1.3,1
279,54,0,1,130,253,0,2,155,0,0.0,2


#### Data splitting for train and test

In [9]:
X_train, X_test,y_train , y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [10]:
X_test

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
668,63,0,1,140,195,0,1,179,0,0.0,2
30,53,1,2,145,518,0,1,130,0,0.0,1
377,65,1,0,160,0,1,2,122,0,1.2,1
535,56,1,0,130,0,0,0,122,1,1.0,1
807,54,1,1,108,309,0,1,156,0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...
468,62,1,0,152,153,0,2,97,1,1.6,2
66,45,0,0,132,297,0,1,144,0,0.0,2
332,38,1,2,100,0,0,1,179,0,-1.1,2
375,73,0,2,160,0,0,2,121,0,0.0,2


#### Data Scaling 

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)
X_train

array([[ 0.26942112,  0.52786151,  1.30345598, ...,  1.20157687,
         0.85419999, -0.61468478],
       [-0.79895068,  0.52786151,  0.24687635, ...,  1.20157687,
        -0.8288434 ,  1.05547847],
       [-1.54681094,  0.52786151,  0.24687635, ...,  1.20157687,
         0.1061807 , -0.61468478],
       ...,
       [ 0.69676984,  0.52786151, -0.80970327, ...,  1.20157687,
         0.48019035,  1.05547847],
       [ 0.69676984,  0.52786151, -0.80970327, ...,  1.20157687,
        -0.8288434 ,  1.05547847],
       [-1.43997376, -1.8944363 , -0.80970327, ..., -0.83223972,
         1.04120481, -0.61468478]])

In [12]:
X_test

array([[ 1.01728138, -1.8944363 ,  0.24687635, ..., -0.83223972,
        -0.8288434 ,  1.05547847],
       [-0.05109042,  0.52786151,  1.30345598, ..., -0.83223972,
        -0.8288434 , -0.61468478],
       [ 1.23095574,  0.52786151, -0.80970327, ..., -0.83223972,
         0.29318553, -0.61468478],
       ...,
       [-1.65364812,  0.52786151,  1.30345598, ..., -0.83223972,
        -1.85736992,  1.05547847],
       [ 2.08565318, -1.8944363 ,  1.30345598, ..., -0.83223972,
        -0.8288434 ,  1.05547847],
       [-0.37160196,  0.52786151, -0.80970327, ...,  1.20157687,
        -0.36133135, -0.61468478]])

### Machine Learning Algorithms

#### Logistic Regression

In [13]:
lr = LogisticRegression(penalty='l2', C=1, max_iter=100)
lr.fit(X_train,y_train)

In [14]:
print("Train Accuracy:",lr.score(X_train, y_train))

Train Accuracy: 0.860774818401937


In [15]:
y_pred = lr.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))

Test Accuracy: 0.8586956521739131


In [16]:
print("Precision Score", precision_score(y_test, y_pred))
print("Recall Score", recall_score(y_test, y_pred))
print("F1 Score", f1_score(y_test, y_pred))

Precision Score 0.8867924528301887
Recall Score 0.8703703703703703
F1 Score 0.8785046728971964


#### SGD Classifier

In [17]:
sgd = SGDClassifier(penalty='l1',learning_rate='adaptive', eta0=0.0001)
sgd.fit(X_train, y_train)

##### Model Evaluation

In [18]:
print("Train Accuracy:",sgd.score(X_train, y_train))
y_pred = sgd.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.8535108958837773
Test Accuracy: 0.8369565217391305
Classification Report               precision    recall  f1-score   support

           0       0.78      0.84      0.81        38
           1       0.88      0.83      0.86        54

    accuracy                           0.84        92
   macro avg       0.83      0.84      0.83        92
weighted avg       0.84      0.84      0.84        92



#### SVM Classifier
[Link](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)

In [19]:
svc = SVC(C=3)
svc.fit(X_train, y_train)

##### Model Evaluation

In [20]:
print("Train Accuracy:",svc.score(X_train, y_train))
y_pred = svc.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.9249394673123487
Test Accuracy: 0.8913043478260869
Classification Report               precision    recall  f1-score   support

           0       0.83      0.92      0.88        38
           1       0.94      0.87      0.90        54

    accuracy                           0.89        92
   macro avg       0.89      0.90      0.89        92
weighted avg       0.90      0.89      0.89        92



#### Decision Tree Classifier
[Link](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)

In [21]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

##### Model Evaluation

In [22]:
print("Train Accuracy:",dtree.score(X_train, y_train))
y_pred = dtree.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 1.0
Test Accuracy: 0.8369565217391305
Classification Report               precision    recall  f1-score   support

           0       0.77      0.87      0.81        38
           1       0.90      0.81      0.85        54

    accuracy                           0.84        92
   macro avg       0.83      0.84      0.83        92
weighted avg       0.84      0.84      0.84        92



The decision tree model has been overfitted

#### K-Neighbors Classifier
[Link](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier)

In [23]:
knn = KNeighborsClassifier(n_neighbors=13)
knn.fit(X_train, y_train)

##### Model Evaluation

In [24]:
print("Train Accuracy:",knn.score(X_train, y_train))
y_pred = knn.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.8753026634382567
Test Accuracy: 0.8695652173913043
Classification Report               precision    recall  f1-score   support

           0       0.84      0.84      0.84        38
           1       0.89      0.89      0.89        54

    accuracy                           0.87        92
   macro avg       0.87      0.87      0.87        92
weighted avg       0.87      0.87      0.87        92



### Ensemble Models

#### Random Forest Classifier
[Link](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier)

In [25]:
rfc = RandomForestClassifier(n_estimators=70, random_state=42)
rfc.fit(X_train, y_train)

##### Model Evaluation

In [26]:
print("Train Accuracy:",rfc.score(X_train, y_train))
y_pred = rfc.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 1.0
Test Accuracy: 0.8804347826086957
Classification Report               precision    recall  f1-score   support

           0       0.85      0.87      0.86        38
           1       0.91      0.89      0.90        54

    accuracy                           0.88        92
   macro avg       0.88      0.88      0.88        92
weighted avg       0.88      0.88      0.88        92



#### Bagging Classifier
[Link](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier)

In [27]:
bclf_rfc = BaggingClassifier(estimator=rfc,n_estimators=120, random_state=5 )
bclf_rfc.fit(X_train, y_train)

##### Model Evaluation

In [28]:
print("Train Accuracy:",bclf_rfc.score(X_train, y_train))
y_pred = bclf_rfc.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.9757869249394673
Test Accuracy: 0.8804347826086957
Classification Report               precision    recall  f1-score   support

           0       0.89      0.82      0.85        38
           1       0.88      0.93      0.90        54

    accuracy                           0.88        92
   macro avg       0.88      0.87      0.88        92
weighted avg       0.88      0.88      0.88        92



#### Extra Trees Classifier
[Link](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier)

In [29]:
etclf = ExtraTreesClassifier(n_estimators=350, random_state=5 )
etclf.fit(X_train, y_train)

In [30]:
print("Train Accuracy:",etclf.score(X_train, y_train))
y_pred = etclf.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 1.0
Test Accuracy: 0.8913043478260869
Classification Report               precision    recall  f1-score   support

           0       0.87      0.87      0.87        38
           1       0.91      0.91      0.91        54

    accuracy                           0.89        92
   macro avg       0.89      0.89      0.89        92
weighted avg       0.89      0.89      0.89        92



#### Adaptive Boost Classifier
using Bagging Classifier as an estimator


[Link](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier)


In [156]:
adb_clf = AdaBoostClassifier(estimator= bclf_rfc, n_estimators=100)
adb_clf.fit(X_train, y_train)

##### Model Evaluation

In [158]:
print("Train Accuracy:",adb_clf.score(X_train, y_train))
y_pred = adb_clf.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.9830508474576272
Test Accuracy: 0.8913043478260869
Classification Report               precision    recall  f1-score   support

           0       0.87      0.87      0.87        38
           1       0.91      0.91      0.91        54

    accuracy                           0.89        92
   macro avg       0.89      0.89      0.89        92
weighted avg       0.89      0.89      0.89        92



#### Voting Classifier
using SVC and Random Forest as estimators

[Link](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html#sklearn.ensemble.VotingClassifier)

In [57]:
voting_clf = VotingClassifier(estimators=[('SVC',svc),('Random Forest',rfc), ('LR',lr), ('ExtraTreesClf',etclf)], weights=[2,1,1,1]  )
voting_clf.fit(X_train, y_train)

##### Model Evaluation

In [58]:
print("Train Accuracy:",voting_clf.score(X_train, y_train))
y_pred = voting_clf.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.9297820823244553
Test Accuracy: 0.9021739130434783
Classification Report               precision    recall  f1-score   support

           0       0.87      0.89      0.88        38
           1       0.92      0.91      0.92        54

    accuracy                           0.90        92
   macro avg       0.90      0.90      0.90        92
weighted avg       0.90      0.90      0.90        92



#### Stacking Classifier
[Link](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html#sklearn.ensemble.StackingClassifier)

In [70]:
st_clf = StackingClassifier(estimators=[('SVC',svc),("LogisticRegression",lr),('Random Forest',rfc)])
st_clf.fit(X_train, y_train)

##### Model Evaluation

In [71]:
print("Train Accuracy:",st_clf.score(X_train, y_train))
y_pred = st_clf.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.9539951573849879
Test Accuracy: 0.8913043478260869
Classification Report               precision    recall  f1-score   support

           0       0.87      0.87      0.87        38
           1       0.91      0.91      0.91        54

    accuracy                           0.89        92
   macro avg       0.89      0.89      0.89        92
weighted avg       0.89      0.89      0.89        92



#### Extra Gradiant Boost Classifier
[Link](https://xgboost.readthedocs.io/en/stable/parameter.html)

In [107]:
xgbc = XGBClassifier(n_estimators=130, random_state=42, learning_rate=0.01)
xgbc.fit(X_train, y_train)

In [108]:
print("Train Accuracy:",xgbc.score(X_train, y_train))
y_pred = xgbc.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.937046004842615
Test Accuracy: 0.8913043478260869
Classification Report               precision    recall  f1-score   support

           0       0.89      0.84      0.86        38
           1       0.89      0.93      0.91        54

    accuracy                           0.89        92
   macro avg       0.89      0.88      0.89        92
weighted avg       0.89      0.89      0.89        92



#### Cat Boost Classifier
[Link](https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier)

In [131]:
catboost = CatBoostClassifier(learning_rate=0.01,random_seed=42, n_estimators=500)
catboost.fit(X_train, y_train)

0:	learn: 0.6839641	total: 1.5ms	remaining: 749ms
1:	learn: 0.6748070	total: 2.93ms	remaining: 730ms
2:	learn: 0.6669105	total: 4.28ms	remaining: 710ms
3:	learn: 0.6593535	total: 5.65ms	remaining: 701ms
4:	learn: 0.6517178	total: 6.95ms	remaining: 688ms
5:	learn: 0.6438603	total: 8.17ms	remaining: 673ms
6:	learn: 0.6371084	total: 9.42ms	remaining: 664ms
7:	learn: 0.6304239	total: 10.4ms	remaining: 637ms
8:	learn: 0.6230278	total: 11.5ms	remaining: 629ms
9:	learn: 0.6158020	total: 12.7ms	remaining: 622ms
10:	learn: 0.6091513	total: 14ms	remaining: 622ms
11:	learn: 0.6019573	total: 15.2ms	remaining: 619ms
12:	learn: 0.5957118	total: 16.5ms	remaining: 620ms
13:	learn: 0.5902175	total: 17.8ms	remaining: 619ms
14:	learn: 0.5838495	total: 19.1ms	remaining: 618ms
15:	learn: 0.5768315	total: 20.4ms	remaining: 616ms
16:	learn: 0.5709332	total: 21.6ms	remaining: 615ms
17:	learn: 0.5654075	total: 22.8ms	remaining: 611ms
18:	learn: 0.5590252	total: 24ms	remaining: 609ms
19:	learn: 0.5523922	total:

<catboost.core.CatBoostClassifier at 0x7a13661d37f0>

##### Model Evaluation

In [132]:
print("Train Accuracy:",catboost.score(X_train, y_train))
y_pred = catboost.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.940677966101695
Test Accuracy: 0.9130434782608695
Classification Report               precision    recall  f1-score   support

           0       0.92      0.87      0.89        38
           1       0.91      0.94      0.93        54

    accuracy                           0.91        92
   macro avg       0.91      0.91      0.91        92
weighted avg       0.91      0.91      0.91        92



#### LGBMClassifier
[Link](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html)

In [150]:
lgbm = LGBMClassifier(learning_rate=0.01, n_estimators=300, random_state=42)
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 454, number of negative: 372
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 826, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.549637 -> initscore=0.199203
[LightGBM] [Info] Start training from score 0.199203


In [151]:
print("Train Accuracy:",lgbm.score(X_train, y_train))
y_pred = lgbm.predict(X_test)
print("Test Accuracy:",accuracy_score(y_pred, y_test))
print("Classification Report",classification_report(y_test, y_pred))

Train Accuracy: 0.9346246973365617
Test Accuracy: 0.9021739130434783
Classification Report               precision    recall  f1-score   support

           0       0.89      0.87      0.88        38
           1       0.91      0.93      0.92        54

    accuracy                           0.90        92
   macro avg       0.90      0.90      0.90        92
weighted avg       0.90      0.90      0.90        92



### Conclusions:
> * The best test accuracy for this dataset is 91 by using the CatBoostClassifier and also LGBMClassifier
> * The second best accuracy is 89 by : 
SVC, 
Extra Tress Clf, 
AdaBoost CLF, 
vooting and stacking clf using (svc, random forest, LR),
XGBoost