# AdaBoost

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from pycaret.datasets import get_data
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Read the source CSV into a DataFrame; the cells below take the feature
# matrix and the target column from it.
dataset = pd.read_csv(
    filepath_or_buffer='../dataset/master_3.csv',
)

In [2]:
# Columns kept out of the feature matrix: the target itself plus four other
# fields (presumably timestamps / outcome-derived values that would leak the
# label -- TODO confirm with the data dictionary).
excluded_columns = [
    'outcome_critical',
    'intime',
    'outtime',
    'ed_los',
    'outcome_icu_transfer_12h',
]

# Index.difference returns the remaining column names in sorted order, so the
# feature matrix is column-sorted rather than in file order.
feature_columns = dataset.columns.difference(excluded_columns)

X = dataset[feature_columns]
y = dataset['outcome_critical']

In [3]:
# Hold out 20% of the rows as a test set (80% train / 20% test).
# random_state makes the partition repeatable across kernel restarts, and
# stratify=y keeps the outcome_critical class ratio the same in both parts,
# which matters because the positive class is a small minority.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Baseline AdaBoost classifier; seeded with the same random_state the other
# cells in this notebook use so the reported score is reproducible.
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=0.1,
                         random_state=1)

# fit() returns the fitted estimator itself, so `model` and `abc` are the
# same object.
model = abc.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Fraction of test rows classified correctly. With an imbalanced target this
# number alone is not very informative.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# print(cross_val_score(AdaBoostClassifier(), X, y)) # cross validation (k = 5)

Accuracy: 0.9439723815357203


In [None]:
# Measure the effect of n_estimators on test accuracy.
#
# NOTE(review): the curve is scored on the test split; choosing a
# hyperparameter from it makes later test metrics optimistic. Consider a
# separate validation split or cross-validation.

# Ensemble sizes to report. Distinct names are used here so the label Series
# `y` from the feature/target cell is not overwritten by a plain list.
estimator_counts = list(range(2, 102, 2))

# Boosting adds estimators one after another, so a single ensemble fitted with
# the largest size contains every smaller ensemble as a prefix. Fit once and
# replay the intermediate ensembles with staged_predict instead of refitting
# one model per size.
# `algorithm` is left at the library default: that is 'SAMME.R' on
# scikit-learn <= 1.5, and passing 'SAMME.R' explicitly is rejected from 1.6.
n_estimators_clf = AdaBoostClassifier(n_estimators=max(estimator_counts),
                                      learning_rate=0.5,
                                      random_state=1)
n_estimators_clf.fit(X_train, y_train)

requested_counts = set(estimator_counts)
plotted_counts = []
n_estimators_accuracies = []
# staged_predict yields the ensemble prediction after each boosting round;
# boosting may stop early, in which case fewer points are plotted.
for n_fitted, stage_pred in enumerate(n_estimators_clf.staged_predict(X_test), start=1):
    if n_fitted in requested_counts:
        plotted_counts.append(n_fitted)
        n_estimators_accuracies.append(metrics.accuracy_score(y_test, stage_pred))

plt.style.use('ggplot')

fig, ax = plt.subplots()
ax.plot(plotted_counts, n_estimators_accuracies)
ax.set_title("Effect of n_estimators", pad=20)
ax.set_xlabel("Number of base estimators")
ax.set_ylabel("Test accuracy of AdaBoost")
plt.show()

In [None]:
# Measure the effect of learning_rate on test accuracy.
#
# NOTE(review): the curve is scored on the test split; choosing a
# hyperparameter from it makes later test metrics optimistic. Consider a
# separate validation split or cross-validation.

# Distinct names are used here so the label Series `y` from the
# feature/target cell is not overwritten by a plain list.
learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
learning_rate_accuracies = []

for rate in learning_rates:
    # A different learning rate changes every boosting round, so each value
    # needs its own fit.
    # `algorithm` is left at the library default: that is 'SAMME.R' on
    # scikit-learn <= 1.5, and passing 'SAMME.R' explicitly is rejected
    # from 1.6.
    rate_clf = AdaBoostClassifier(n_estimators=50,
                                  learning_rate=rate,
                                  random_state=1)
    rate_clf.fit(X_train, y_train)
    learning_rate_accuracies.append(
        metrics.accuracy_score(y_test, rate_clf.predict(X_test))
    )

fig, ax = plt.subplots()
ax.plot(learning_rates, learning_rate_accuracies)
ax.set_title("Effect of learning_rate", pad=20)
ax.set_xlabel("Learning rate")
ax.set_ylabel("Test accuracy of AdaBoost")
plt.show()

In [4]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score, confusion_matrix

def get_clf_eval(y_test, y_pred, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions. Used for the confusion matrix, accuracy,
        precision, recall and F1.
    pred_proba : array-like, optional
        Scores for the positive class, e.g. ``clf.predict_proba(X)[:, 1]``.
        When given, ROC AUC is computed from these scores. When omitted, AUC
        is computed from ``y_pred`` exactly as before.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    # ROC AUC measures how well scores rank positives above negatives. Fed
    # hard 0/1 predictions it has a single operating point and reduces to the
    # mean of recall and specificity, so prefer the scores when available.
    auc_scores = y_pred if pred_proba is None else pred_proba
    AUC = roc_auc_score(y_test, auc_scores)
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))

In [5]:
# Final AdaBoost configuration: 30 estimators, learning rate 0.5, fixed seed.
# `algorithm` is left at the library default: that is 'SAMME.R' on
# scikit-learn <= 1.5 (so results there are unchanged), and passing
# 'SAMME.R' explicitly is rejected from scikit-learn 1.6.
ada_clf = AdaBoostClassifier(
    n_estimators=30,
    learning_rate=0.5,
    random_state=1,
)

In [6]:
# Train on the training split and keep the hard class predictions for the
# test split. fit() returns the estimator itself, so the calls can be chained.
pred = ada_clf.fit(X_train, y_train).predict(X_test)

In [7]:
# Print the confusion matrix and summary metrics for the test-set predictions.
get_clf_eval(y_test=y_test, y_pred=pred)

오차행렬:
 [[83399   392]
 [ 4047  1957]]

정확도: 0.9506
정밀도: 0.8331
재현율: 0.3259
F1: 0.4686
AUC: 0.6606
