In [None]:
# basic library
import numpy as np
import pandas as pd

# model
from sklearn.ensemble import AdaBoostClassifier

# custom modules
from utils import set_seed, get_clf_eval, make_submission, record_experimental_results
import preprocessing as pp

# sampling
from imblearn.under_sampling import RandomUnderSampler

### Global Setting

In [None]:
# Notebook-wide settings; 'seed' is the base random seed reused by every
# experiment below.
hparams = dict(seed=33)

In [None]:
# Apply the notebook-wide seed through the project helper.
# NOTE(review): presumably this seeds the global RNGs (random / numpy) --
# confirm in utils.set_seed.
set_seed(hparams['seed'])

### 실험 01: `AdaBoostClassifier()`

In [None]:
# Keyword arguments for the single-model AdaBoost baseline.
adaboost_params = dict(
    estimator=None,                # None -> fall back to the library's default base estimator
    n_estimators=50,               # number of boosting rounds
    learning_rate=1.0,
    algorithm='SAMME',
    random_state=hparams['seed'],  # same seed as the rest of the notebook
)

In [None]:
# Load the train / test frames and label-encode them in one step;
# load_data() returns the (train, test) pair that label_encoding() expects.
tr_data, tt_data = pp.label_encoding(*pp.load_data())

In [None]:
# Hold out a validation set from the encoded training frame, using the
# notebook-wide seed so the split is repeatable.
split_seed = hparams['seed']
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(
    tr_data,
    seed=split_seed,
)

In [None]:
# Baseline classifier built from the adaboost_params dictionary.
# NOTE(review): the name `abc` is also the name of a standard-library module;
# it is not imported in this notebook, so nothing is shadowed here.
abc = AdaBoostClassifier(**adaboost_params)

In [None]:
# Missing values are replaced with 0 before fitting.
x_tr_filled = x_tr.fillna(0)
abc.fit(x_tr_filled, y_tr)

In [None]:
# Score the baseline on the validation split, using the same zero-imputation
# as in training.
x_val_filled = x_val.fillna(0)
y_val_pred = abc.predict(x_val_filled)
get_clf_eval(y_val, y_val_pred)

In [None]:
# Build the test feature matrix by removing the target and identifier
# columns, then predict with the baseline model.
x_tt = tt_data.drop(columns=['is_converted', 'id'])
y_test_pred = abc.predict(x_tt.fillna(0))

# Sum of the predicted labels, shown as the cell output.
y_test_pred.sum()

### 실험 01: `AdaBoostClassifier()` ensemble

In [None]:
# Shared keyword arguments for every member of the ensemble.
# 'random_state' is deliberately left out: the ensemble loop passes a
# different value to each model.
# NOTE(review): newer scikit-learn releases deprecate 'SAMME.R' -- confirm the
# pinned version before re-running.
adaboost_params01 = dict(
    estimator=None,        # None -> a DecisionTreeClassifier with max_depth=1 is used
    n_estimators=50,
    learning_rate=1.0,
    algorithm='SAMME.R',
)

In [None]:
num_models = 20  # number of models combined in the ensemble

# One row of test-set predictions per model. The row width is taken from the
# test frame loaded in the data-loading cell above instead of a hard-coded
# row count, so the buffer stays correct if the test set changes.
test_results = np.zeros((num_models, len(tt_data)))

In [None]:
# ensemble loop
#
# Trains `num_models` AdaBoost classifiers, each on a differently-seeded
# random undersample of the training data, and stores every model's test-set
# predictions as one row of `test_results`.

# Loading and label encoding do not depend on the loop index, so they are done
# once here rather than once per model.
tr_data, tt_data = pp.load_data()
tr_encoded, tt_encoded = pp.label_encoding(tr_data, tt_data)

# Inputs shared by every model: features / target for undersampling, and the
# zero-imputed test features (target and id columns removed).
train_features = tr_encoded.drop(['is_converted'], axis=1)
train_target = tr_encoded['is_converted']
x_tt = tt_encoded.drop(['is_converted', 'id'], axis=1)
x_tt_filled = x_tt.fillna(0)

for i in range(num_models):
    # The same per-model seed drives both the undersampling and the classifier.
    model_seed = hparams['seed'] + i

    # undersample with a model-specific seed
    rus = RandomUnderSampler(random_state=model_seed)
    x_tr_res, y_tr_res = rus.fit_resample(train_features, train_target)

    # train / validation split; the target is re-attached because the split
    # helper takes a single frame. The split seed is the same for every model.
    x_tr_res['is_converted'] = y_tr_res
    x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(x_tr_res, seed=hparams['seed'])

    # define and train the model
    model = AdaBoostClassifier(**adaboost_params01,
                               random_state=model_seed)
    model.fit(x_tr.fillna(0), y_tr)

    # predict on the test set and store the predictions as row i
    y_test_pred = model.predict(x_tt_filled)
    test_results[i, :] = y_test_pred

    ### print result of current model ###
    print('-' * 20)
    print(f'Model {i + 1} results')
    print('-' * 20)

    print(f'current seed: {model_seed}')

    # check validation score
    y_val_pred = model.predict(x_val.fillna(0))
    get_clf_eval(y_val, y_val_pred)

    # number of positive predictions
    print(sum(y_test_pred))
    print()

In [None]:
# Hard voting: add up each model's 0/1 prediction per test row and predict
# positive (1) only when at least num_models // 2 + 1 models voted positive,
# i.e. a strict majority. With an even number of models an exact tie is
# therefore predicted negative (0).
positive_votes = np.sum(test_results, axis=0, dtype=int)
majority_threshold = num_models // 2 + 1
final_test_pred = np.where(positive_votes >= majority_threshold, 1, 0)

In [None]:
# Number of test rows the ensemble predicts as positive.
final_test_pred.sum()

In [None]:
# Identifier passed to make_submission() and record_experimental_results()
# in the cells below.
model_name = 'params01_adaboost_20'

In [None]:
# Hand the ensemble's final predictions to the project's submission helper.
make_submission(
    model_name=model_name,
    y_pred=final_test_pred,
    dir_name='04_adaboost',
)

**Record**

In [None]:
# Log this experiment's score and a short description.
# NOTE(review): the score is passed as a string rather than a float --
# confirm that record_experimental_results expects text.
recorded_f1 = '0.6070087609511889'
experiment_note = 'params01 세팅의 adaboost를, undersampling을 통해 만든 서로 다른 20개의 subset에 대해 학습시킨 뒤, 최종 앙상블'
record_experimental_results(
    model_name=model_name,
    test_f1_score=recorded_f1,
    description=experiment_note,
)