### Exercise 3: 

Compare the performance of the following algorihtms on the Heart Failure Prediction Dataset (`heart.csv`):

+ Logistic Regression
+ k-Nearest-Neighbor Classifier
+ Decision Tree Classifier
+ SVM
+ Gradient Boosting
+ AdaBoost

Optimize the parameters of each algorithm. Use a holdout-validation as test protocol with split size 0.2 for the test set. 

Details on the dataset: [[>]](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)

---
### Imports

In [1]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd

---
### Dataset

In [2]:
df = pd.read_csv('./data/heart.csv', sep=',')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
None


In [3]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sc = StandardScaler()
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

---
### Parameter Optimization and Holdout-Validation

In [4]:
def run(clf, param_grid, X_train, y_train, X_test, y_test):

    if param_grid: 
        grid_search = GridSearchCV(clf, param_grid=param_grid)
        grid_search.fit(X_train, y_train)
        clf.set_params(**grid_search.best_params_)

    clf.fit(X_train, y_train)
    train_acc = 100. * clf.score(X_train, y_train)
    test_acc  = 100. * clf.score(X_test, y_test)
    print(f'train accuracy : {train_acc:.2f}%')
    print(f'test accuracy  : {test_acc:.2f}%')

---
### Logistic Regression

In [5]:
run(LogisticRegression(), None, X_train_scaled, y_train, X_test_scaled, y_test)

train accuracy : 85.12%
test accuracy  : 80.33%


---
### k-Nearest-Neighbor Classifier

In [6]:
max_k = max(1, int(np.sqrt(len(y_train))))
param_grid = {
    'n_neighbors' : np.arange(1, max_k),
    'weights' : ['uniform', 'distance']
}
run(KNeighborsClassifier(), 
    param_grid, 
    X_train_scaled, y_train, 
    X_test_scaled, y_test)

train accuracy : 84.71%
test accuracy  : 81.97%


---
### Decision Tree

In [7]:
param_grid = {
    "max_depth": np.arange(1, 20)
}
run(DecisionTreeClassifier(), param_grid, X_train, y_train, X_test, y_test)

train accuracy : 85.95%
test accuracy  : 78.69%


---
### SVM

In [8]:
param_grid = {
    'gamma':2.**np.arange(-14, 4), 
    'C': 2.**np.arange(-4, 14)
}
run(SVC(), param_grid, X_train, y_train, X_test, y_test)

train accuracy : 86.36%
test accuracy  : 78.69%


---
### Gradient Boosting

In [9]:
params = {
    'n_estimators' : 300,
    'loss' : 'exponential',
    'criterion' : 'squared_error',
}

param_grid = {
    "max_depth": np.arange(1, 6),
    "learning_rate": 2.**np.arange(-10, 1)
}

clf = GradientBoostingClassifier(**params)
run(clf, param_grid, X_train, y_train, X_test, y_test)

train accuracy : 86.36%
test accuracy  : 83.61%


---
### AdaBoost

In [10]:
baselearner = [DecisionTreeClassifier(max_depth=depth) for depth in range(1, 6)]
param_grid = {
    "estimator": baselearner,
    "learning_rate": 2.**np.arange(-10, 1)
}

clf = AdaBoostClassifier(n_estimators=300)
run(clf, param_grid, X_train, y_train, X_test, y_test)

train accuracy : 86.36%
test accuracy  : 83.61%
