In [None]:
# basic library
import numpy as np

# model
from sklearn.ensemble import GradientBoostingClassifier

# sampler
from imblearn.under_sampling import RandomUnderSampler

# custom modules
from utils import set_seed, get_clf_eval, make_submission, record_experimental_results
import preprocessing as pp

### Global Setting

In [None]:
# Notebook-wide settings; 'seed' is the base random seed reused by the cells below.
hparams = dict(seed=33)

In [None]:
# Apply the base seed through the project helper `utils.set_seed`.
# NOTE(review): which libraries get seeded is decided inside utils.set_seed, which is not visible here.
set_seed(hparams['seed'])

In [None]:
# Baseline keyword arguments that are unpacked into GradientBoostingClassifier(**gbm_hparams).
# 'random_state' is kept out of this dict: the ensemble loop passes random_state as a
# separate keyword next to the unpacked dict, so having it in both places would raise
# "TypeError: got multiple values for keyword argument".
gbm_hparams = dict(
    loss='log_loss',  # loss function to be optimized
    learning_rate=0.1,  # shrinks the contribution of each tree
    n_estimators=100,  # number of boosting stages to perform
    subsample=1.0,  # fraction of samples used to fit each individual base learner
    criterion='friedman_mse',  # function used to measure the quality of a split
    min_samples_split=2,  # minimum number of samples required to split an internal node
    min_samples_leaf=1,  # minimum number of samples required at a leaf node
    max_depth=3,  # maximum depth of the individual regression estimators
    min_impurity_decrease=0.0,  # split only if the impurity decrease is >= this value
    init=None,  # estimator used to compute the initial predictions
    max_features=None,  # number of features considered when looking for the best split
    verbose=0,  # verbosity of the training output
    max_leaf_nodes=None,  # if set, grow trees best-first with at most this many leaves
    warm_start=False,
    validation_fraction=0.1,  # share of training data held out for early stopping
    n_iter_no_change=None,  # None disables early stopping
    tol=1e-4,  # tolerance for early stopping
    ccp_alpha=0.0,  # complexity parameter for minimal cost-complexity pruning
)

### 실험 01: `GradientBoostingClassifier()`

In [None]:
# Data load, label encoding, and train/validation split, all through the project
# module `preprocessing` (imported as `pp`).
# NOTE(review): the exact return types are defined in preprocessing.py, which is not
# visible here; the cells below treat the frames as pandas objects (.fillna / .drop).
tr_data, tt_data = pp.load_data()
# The encoded frames are bound to new names, so tr_data / tt_data keep referring to
# the objects returned by load_data().
x_tr, x_tt = pp.label_encoding(tr_data, tt_data)
# The split is seeded with the notebook-wide base seed.
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(x_tr, seed=hparams['seed'])

In [None]:
# model
# Build the baseline classifier from the shared hyperparameter dict.
# random_state is passed explicitly (it is not a key of gbm_hparams) so that re-running
# this cell always creates an identically seeded model, in the same way the ensemble
# loop seeds each of its models.
gbm = GradientBoostingClassifier(**gbm_hparams, random_state=hparams['seed'])

In [None]:
# training
# Fit on the training split; missing values are replaced with 0 before fitting.
gbm.fit(X=x_tr.fillna(value=0), y=y_tr)

In [None]:
# Check the validation score: predict on the validation split, with missing values
# replaced by 0 exactly as was done for the training input.
y_val_pred = gbm.predict(x_val.fillna(0))
# get_clf_eval is a project helper from utils; presumably it reports classification
# metrics for (true labels, predicted labels) — verify against utils.py.
get_clf_eval(y_val, y_val_pred)

In [None]:
# test
# Remove the label column ('is_converted') and the 'id' column so that only the
# model features remain in x_tt.
# errors='ignore' keeps this cell re-runnable: x_tt is reassigned from itself, so a
# second execution would otherwise raise KeyError for the already-removed columns.
x_tt = x_tt.drop(columns=['is_converted', 'id'], errors='ignore')

In [None]:
# Predict on the test features, with missing values replaced by 0 as for training.
y_test_pred = gbm.predict(x_tt.fillna(0))
# Displayed as the cell output: the sum of the predictions, i.e. the number of
# positive predictions when the labels are 0/1 (or boolean).
sum(y_test_pred)

### 실험 02: `GradientBoostingClassifier()` ensemble

In [None]:
num_models = 30  # number of models to ensemble
# Hard-coded row count of the prediction buffer.
# NOTE(review): presumably the number of rows in the test set — confirm against the data.
NUM_TEST_ROWS = 5271
# One row per model, one column per test row; holds each model's test predictions.
test_results = np.zeros(shape=(num_models, NUM_TEST_ROWS))

In [None]:
# Variant 02: the baseline hyperparameters with ccp_alpha set to 0.0004.
gbm_hparams02 = {**gbm_hparams, 'ccp_alpha': 0.0004}

In [None]:
# Variant 03: the baseline hyperparameters with max_depth raised to 6.
gbm_hparams03 = {**gbm_hparams, 'max_depth': 6}

In [None]:
# Variant 04: variant 03 with n_estimators set to 200.
gbm_hparams04 = {**gbm_hparams03, 'n_estimators': 200}

In [None]:
# Variant 05: variant 04 with n_estimators set to 400.
gbm_hparams05 = {**gbm_hparams04, 'n_estimators': 400}

In [None]:
# ensemble loop
# Trains `num_models` GradientBoostingClassifier models (gbm_hparams05 settings), each
# on a differently seeded random undersample of the de-duplicated training data.
# Every model's test predictions are stored as one row of `test_results` for the
# voting cell below, and per-model validation precision / recall / F1 are collected
# and averaged at the end.
val_precision, val_recall, val_f1 = [], [], []

# data load & drop duplicates & label encoding
tr_data, tt_data = pp.load_data()
# Reassign instead of using inplace=True so the object returned by load_data() is
# never mutated.
tr_data = tr_data.drop_duplicates()
tr_data, tt_data = pp.label_encoding(tr_data, tt_data)
x_tt = tt_data.drop(['is_converted', 'id'], axis=1)

# Loop-invariant inputs: computed once here instead of once per model.
x_tt_filled = x_tt.fillna(0)
x_all = tr_data.drop(['is_converted'], axis=1)
y_all = tr_data['is_converted']

# Size the prediction buffer from the test set that was just loaded, so this cell
# does not rely on a buffer pre-allocated elsewhere with a hard-coded row count.
test_results = np.zeros((num_models, len(x_tt)))

for i in range(num_models):
    # Each model gets its own seed, used for both the undersampling and the model.
    current_seed = hparams['seed'] + i

    # undersample with a different seed per model
    rus = RandomUnderSampler(random_state=current_seed)
    x_tr_res, y_tr_res = rus.fit_resample(x_all, y_all)

    # train / validation split
    # The label is re-attached because split_train_and_validation takes one frame.
    # The split itself always uses the base seed, not the per-model seed.
    x_tr_res['is_converted'] = y_tr_res
    x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(x_tr_res, seed=hparams['seed'])

    # define a model
    model = GradientBoostingClassifier(**gbm_hparams05, random_state=current_seed)

    # training (missing values are replaced with 0)
    model.fit(x_tr.fillna(0), y_tr)

    # test
    y_test_pred = model.predict(x_tt_filled)

    # store this model's test predictions in its row of the buffer
    test_results[i, :] = y_test_pred

    ### print result of current model ###
    print('-' * 20)
    print(f'Model {i + 1} results')
    print('-' * 20)

    print(f'current seed: {current_seed}')

    # check validation score
    y_val_pred = model.predict(x_val.fillna(0))
    pr, re, f1 = get_clf_eval(y_val, y_val_pred, is_return=True)

    val_precision.append(pr)
    val_recall.append(re)
    val_f1.append(f1)

    # number of positive predictions
    print(sum(y_test_pred))
    print()

print(f'average validation precision score of {num_models} models: {sum(val_precision) / num_models:.6f}')
print(f'average validation recall score of {num_models} models: {sum(val_recall) / num_models:.6f}')
print(f'average validation f1 score of {num_models} models: {sum(val_f1) / num_models:.6f}')

In [None]:
# hard voting -> add up every model's prediction (1 or 0) per test row; a row is
# predicted positive (1) when the sum is at least int(num_models / 2) + 1 and
# negative (0) otherwise, so an exact tie counts as negative.
tmp = test_results.sum(axis=0, dtype=int)
final_test_pred = np.where(tmp >= int(num_models / 2) + 1, 1, 0)

In [None]:
# Displayed as the cell output: number of test rows the ensemble predicts as
# positive (final_test_pred holds only 0s and 1s).
sum(final_test_pred)

In [None]:
# Identifier for this experiment; passed to make_submission and
# record_experimental_results in the cells below.
model_name = 'hparams05_gbm_30_inc_estimators_dropDuplicates'

In [None]:
# Hand the ensemble's final 0/1 predictions to the project helper utils.make_submission.
# NOTE(review): presumably this writes a submission file under '05_gbm' named after
# model_name — confirm in utils.py.
make_submission(dir_name='05_gbm',
                y_pred=final_test_pred,
                model_name=model_name)

**Record**

In [None]:
# Log this experiment through the project helper utils.record_experimental_results.
# test_f1_score is a hard-coded string, not computed in this notebook.
# NOTE(review): presumably it is the score reported after submitting — confirm.
# The description (Korean) reads: "hparams05 setting + removal of duplicate data
# (3000 negative, 200 positive)".
record_experimental_results(model_name=model_name,
                            test_f1_score='0.7433920704845814',
                            description='hparams05 세팅 + 중복 데이터 (negative 3000개, positive 200개) 삭제')