### Library import

In [None]:
# basic library
import numpy as np
import pandas as pd
from collections import Counter

# models
from sklearn.tree import DecisionTreeClassifier

# custom modules
from utils import set_seed, get_clf_eval, make_submission, record_experimental_results
import preprocessing as pp

# preprocessing
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# visualization
import matplotlib.pyplot as plt

### Global Setting

In [None]:
# Global hyper-parameters shared by every experiment in this notebook.
hparams = dict(seed=33)

In [None]:
# Seed the run through the project helper using the notebook-wide seed.
# NOTE(review): which libraries set_seed actually seeds is defined in utils — confirm.
set_seed(hparams['seed'])

### 실험 01: Positive sample oversampling

**Data preprocessing**

In [None]:
# data load & label encoding
# Load the train/test tables, then label-encode both with the project helpers.
tr_data, tt_data = pp.load_data()
x_tr, x_tt = pp.label_encoding(tr_data, tt_data)

In [None]:
# oversampling: resample the features against the `is_converted` target
ros = RandomOverSampler(random_state=hparams['seed'])
x_tr_res, y_tr_res = ros.fit_resample(
    x_tr.drop(columns=['is_converted']),
    x_tr['is_converted'],
)

In [None]:
# Report the training-set size before/after oversampling and the resampled class counts.
print(
    f'[Before oversampling] size of training data : {len(x_tr)}',
    f'[After oversampling] size of training data : {len(x_tr_res)}',
    '-' * 30,
    f'[After oversampling] distribution of training data : {Counter(y_tr_res)}',
    sep='\n',
)

In [None]:
# train / validation split
# Re-attach the resampled target as a column before handing the frame to the split helper.
x_tr_res = x_tr_res.assign(is_converted=y_tr_res)
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(
    x_tr_res,
    seed=hparams['seed'],
)

**Select a model**
- `DecisionTreeClassifier()`

In [None]:
# Decision tree with scikit-learn defaults; no random_state is passed to the estimator here.
model = DecisionTreeClassifier()

**Training**

In [None]:
# Fit on the training split, with missing values replaced by 0.
model.fit(x_tr.fillna(0), y_tr)

**Check validation score**

In [None]:
# Predict on the validation split (NaN -> 0) and pass labels/predictions to the project's evaluation helper.
y_val_pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, y_val_pred)

**Make a submission file**

In [None]:
# Remove the `is_converted` and `id` columns from the test frame before predicting.
x_tt = x_tt.drop(columns=['is_converted', 'id'])

In [None]:
# Predict on the test frame (NaN -> 0); the cell output is the sum of the predicted labels.
y_test_pred = model.predict(x_tt.fillna(0))
sum(y_test_pred)

In [None]:
# Hand the test predictions to the project's submission helper.
# NOTE(review): f1_val is a hard-coded literal — presumably copied from the validation output above; confirm.
make_submission(dir_name='02_use_sampler',
                f1_val=0.9902,
                y_pred=y_test_pred)

**Record**

In [None]:
# Record this experiment (model name, test F1 score, free-text description) via the project helper.
# NOTE(review): test_f1_score is passed as a string literal — confirm that is what the helper expects.
record_experimental_results(model_name='decision_tree_w_oversampling',
                            test_f1_score='0.39189189189189194',
                            description='모든 feature 사용/전처리 X/RandomOverSampler 적용/결측치는 0으로 채움/Seed 33/training data size: 108898')

---

### 실험 02: Negative sample undersampling

In [None]:
# data load & label encoding
# Reload the tables so this experiment starts from frames that were not resampled above.
tr_data, tt_data = pp.load_data()
x_tr, x_tt = pp.label_encoding(tr_data, tt_data)

In [None]:
# undersampling: resample the features against the `is_converted` target
rus = RandomUnderSampler(random_state=hparams['seed'])
x_tr_res, y_tr_res = rus.fit_resample(
    x_tr.drop(columns=['is_converted']),
    x_tr['is_converted'],
)

In [None]:
# Report the training-set size before/after undersampling and the resampled class counts.
# Counter comes from the notebook's import cell, so it is not re-imported here.
print(f'[Before undersampling] size of training data : {len(x_tr)}')
print(f'[After undersampling] size of training data : {len(x_tr_res)}')
print(f'[After undersampling] distribution of training data : {Counter(y_tr_res)}')

In [None]:
# train / validation split
# Re-attach the resampled target as a column before handing the frame to the split helper.
x_tr_res = x_tr_res.assign(is_converted=y_tr_res)
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(
    x_tr_res,
    seed=hparams['seed'],
)

In [None]:
# Show how many rows ended up in the training and validation splits.
print(f'training data size : {len(x_tr)}, validation data size: {len(x_val)}')

**Select a model**
- `DecisionTreeClassifier()`

In [None]:
# Decision tree with scikit-learn defaults; no random_state is passed to the estimator here.
model = DecisionTreeClassifier()

**Training**

In [None]:
# Fit on the training split, with missing values replaced by 0.
model.fit(x_tr.fillna(0), y_tr)

**Check validation score**

In [None]:
# Predict on the validation split (NaN -> 0) and pass labels/predictions to the project's evaluation helper.
y_val_pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, y_val_pred)

**Make a submission file**

In [None]:
# Remove the `is_converted` and `id` columns from the test frame before predicting.
x_tt = x_tt.drop(columns=['is_converted', 'id'])

In [None]:
# Predict on the test frame (NaN -> 0); the cell output is the sum of the predicted labels.
y_test_pred = model.predict(x_tt.fillna(0))
sum(y_test_pred)

In [None]:
# Hand the test predictions to the project's submission helper.
# NOTE(review): f1_val is a hard-coded literal — presumably copied from the validation output above; confirm.
make_submission(dir_name='02_use_sampler',
                f1_val=0.8846,
                y_pred=y_test_pred)

**Record**

In [None]:
# Record this experiment (model name, test F1 score, free-text description) via the project helper.
# NOTE(review): test_f1_score is passed as a string literal — confirm that is what the helper expects.
record_experimental_results(model_name='decision_tree_w_undersampling',
                            test_f1_score='0.5867215645908389',
                            description='모든 feature 사용/전처리 X/RandomUnderSampler 적용/결측치는 0으로 채움/Seed 33/training data size: 7760')

---

### 실험 03: Ensemble with undersampling

In [None]:
# Ensemble configuration. `x_tt` here is the test frame left over from the previous experiment.
num_models = 30 # number of models to ensemble
test_results = np.zeros((num_models, len(x_tt))) # one row of test predictions per model

In [None]:
# ensemble loop: train one decision tree per undersampling seed and keep its test predictions
for i in range(num_models):
    seed_i = hparams['seed'] + i  # per-model seed for the sampler and the tree

    # label encoding (tr_data / tt_data come from an earlier cell)
    x_tr, x_tt = pp.label_encoding(tr_data, tt_data)

    # undersample with this model's seed
    rus = RandomUnderSampler(random_state=seed_i)
    x_tr_res, y_tr_res = rus.fit_resample(
        x_tr.drop(columns=['is_converted']),
        x_tr['is_converted'],
    )

    # train / validation split (the split seed is the same for every model)
    x_tr_res['is_converted'] = y_tr_res  # re-attach the target column
    x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(x_tr_res, seed=hparams['seed'])

    # define and train the model
    model = DecisionTreeClassifier(random_state=seed_i)
    model.fit(x_tr.fillna(0), y_tr)

    # test-set prediction, stored in row i of the results array
    x_tt = x_tt.drop(columns=['is_converted', 'id'])
    y_test_pred = model.predict(x_tt.fillna(0))
    test_results[i, :] = y_test_pred

    ### print result of current model ###
    rule = '-' * 20
    print(rule, f'Model {i + 1} results', rule, sep='\n')
    print(f'current seed: {seed_i}')

    # check validation score
    y_val_pred = model.predict(x_val.fillna(0))
    get_clf_eval(y_val, y_val_pred)

    # number of positive predictions, followed by a blank line
    print(sum(y_test_pred), end='\n\n')

In [None]:
# hard voting -> add up every model's 0/1 prediction per test row; predict 1 (positive) when
# the vote count reaches int(num_models / 2) + 1, otherwise 0 (negative)
tmp = np.sum(test_results, axis=0, dtype=int)
majority = num_models // 2 + 1
final_test_pred = (tmp >= majority).astype(int)

In [None]:
# Number of test rows the ensemble predicts as positive.
sum(final_test_pred)

In [None]:
# Hand the ensemble's test predictions to the project's submission helper.
# NOTE(review): f1_val is a hard-coded literal — presumably the mean validation F1 of the loop above; confirm.
make_submission(dir_name='02_use_sampler',
                f1_val=0.89097,
                y_pred=final_test_pred)

**Record**

In [None]:
# Record this experiment (model name, test F1 score, free-text description) via the project helper.
record_experimental_results(model_name='ensemble_decisiontree_w_undersampling',
                            test_f1_score='0.6696230598669624',
                            description='30개 decision tree 사용/seed33~62/mean validation f1 score 0.89097/hard voting/결측치 0으로 처리')

---

### 실험 04: Decision tree with undersampling & feature preprocessing

**Preprocessing01: `customer_country.1` feature 삭제**

In [None]:
# data load & label encoding & feature deletion
tr_data, tt_data = pp.load_data()
x_tr, x_tt = pp.delete_features(
    *pp.label_encoding(tr_data, tt_data),
    features=['customer_country.1'],
)

# column counts of the loaded frame and of the encoded frame after the deletion
print(tr_data.shape[1], x_tr.shape[1])

**Preprocessing02: feature별 결측치 비율을 확인한 뒤, 비율이 높은 feature 삭제**

In [None]:
# Per-feature share of missing values in the training table, as a one-column frame.
tr_data, tt_data = pp.load_data()
tmp = pd.DataFrame({'nan_ratio': tr_data.isna().sum() / len(tr_data)})

In [None]:
# features whose missing-value ratio is 80% or higher
tmp.loc[tmp['nan_ratio'] >= 0.8].index

In [None]:
# delete the features that are >= 80% missing, plus `customer_country.1`
drop_features = [
    'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
    'product_subcategory', 'product_modelname', 'business_subarea',
    'customer_country.1',
]
x_tr, x_tt = pp.label_encoding(tr_data, tt_data)
x_tr, x_tt = pp.delete_features(x_tr, x_tt, features=drop_features)

**Preprocessing03: `customer_country` feature normalization**

In [None]:
tr_data, tt_data = pp.load_data()

# feature normalization: work on copies so the loaded frames keep the original country names
tr_data_cp = tr_data.assign(customer_country=pp.normalize_country_name(tr_data['customer_country']))
tt_data_cp = tt_data.assign(customer_country=pp.normalize_country_name(tt_data['customer_country']))

# label encoding & deletion of the customer_country.1 feature
x_tr, x_tt = pp.delete_features(
    *pp.label_encoding(tr_data_cp, tt_data_cp),
    features=['customer_country.1'],
)

In [None]:
# Distinct `customer_country` values in the loaded frame vs. the normalized + encoded frame.
print(
    len(tr_data['customer_country'].unique()),
    len(x_tr['customer_country'].unique()),
    sep='\n',
)

In [None]:
# undersampling: resample the features against the `is_converted` target
rus = RandomUnderSampler(random_state=hparams['seed'])
x_tr_res, y_tr_res = rus.fit_resample(
    x_tr.drop(columns=['is_converted']),
    x_tr['is_converted'],
)

In [None]:
# Report the training-set size before/after undersampling and the resampled class counts.
# Counter comes from the notebook's import cell, so it is not re-imported here.
print(f'[Before undersampling] size of training data : {len(x_tr)}')
print(f'[After undersampling] size of training data : {len(x_tr_res)}')
print(f'[After undersampling] distribution of training data : {Counter(y_tr_res)}')

In [None]:
# train / validation split
# Re-attach the resampled target as a column before handing the frame to the split helper.
x_tr_res = x_tr_res.assign(is_converted=y_tr_res)
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(
    x_tr_res,
    seed=hparams['seed'],
)

In [None]:
# Show how many rows ended up in the training and validation splits.
print(f'training data size : {len(x_tr)}, validation data size: {len(x_val)}')

**Select a model**
- `DecisionTreeClassifier()`

In [None]:
# Decision tree with scikit-learn defaults; no random_state is passed to the estimator here.
model = DecisionTreeClassifier()

**Training**

In [None]:
# Fit on the training split, with missing values replaced by 0.
model.fit(x_tr.fillna(0), y_tr)

**Check validation score**

In [None]:
# Predict on the validation split (NaN -> 0) and pass labels/predictions to the project's evaluation helper.
y_val_pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, y_val_pred)

**Make a submission file**

In [None]:
# Remove the `is_converted` and `id` columns from the test frame before predicting.
x_tt = x_tt.drop(columns=['is_converted', 'id'])

In [None]:
# Predict on the test frame (NaN -> 0); the cell output is the sum of the predicted labels.
y_test_pred = model.predict(x_tt.fillna(0))
sum(y_test_pred)

In [None]:
# Hand the test predictions to the project's submission helper.
# NOTE(review): f1_val is a hard-coded literal — presumably copied from the validation output above; confirm.
make_submission(dir_name='02_use_sampler',
                f1_val=0.8907,
                y_pred=y_test_pred)

**Record**

In [None]:
# Record this experiment (model name, test F1 score, free-text description) via the project helper.
record_experimental_results(model_name='decision_tree_w_undersampling_feature_normalization',
                            test_f1_score='0.5541035023523261',
                            description='"customer_country" 정규화/"customer_country.1" feature 삭제/RandomUnderSampler 적용/결측치는 0으로 채움/Seed 33/training data size: 7760')

---

### 실험 05: Cost-Complexity Pruning

**`ccp_alpha` 적정값 찾아보기**

In [None]:
# data load & label encoding
tr_data, tt_data = pp.load_data()
x_tr, x_tt = pp.label_encoding(tr_data, tt_data)

# undersampling with the base seed (a single run, used for the ccp_alpha search)
rus = RandomUnderSampler(random_state=hparams['seed'])
x_tr_res, y_tr_res = rus.fit_resample(
    x_tr.drop(columns=['is_converted']),
    x_tr['is_converted'],
)

# train / validation split, after re-attaching the target column
x_tr_res = x_tr_res.assign(is_converted=y_tr_res)
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(
    x_tr_res,
    seed=hparams['seed'],
)

In [None]:
# Seeded decision tree; used below only to compute the cost-complexity pruning path.
model = DecisionTreeClassifier(random_state=hparams['seed'])

In [None]:
# Compute the cost-complexity pruning path on the NaN-filled training split.
path = model.cost_complexity_pruning_path(x_tr.fillna(0), y_tr)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
# Fit one seeded tree per candidate alpha on the pruning path (fit() returns the estimator).
clfs = [
    DecisionTreeClassifier(random_state=hparams['seed'], ccp_alpha=alpha).fit(x_tr.fillna(0), y_tr)
    for alpha in ccp_alphas
]

In [None]:
# Leave out the final alpha on the pruning path (per the scikit-learn pruning example it
# prunes the tree down to a single node). The slices are bound to new names instead of
# rebinding `clfs` / `ccp_alphas`, so re-running this cell does not trim one more entry each time.
eval_clfs = clfs[:-1]
eval_alphas = ccp_alphas[:-1]

# Accuracy of every candidate tree on the training and validation splits (NaN -> 0, filled once).
x_tr_filled, x_val_filled = x_tr.fillna(0), x_val.fillna(0)
train_scores = [clf.score(x_tr_filled, y_tr) for clf in eval_clfs]
test_scores = [clf.score(x_val_filled, y_val) for clf in eval_clfs]

# Best validation accuracy and the first alpha that reaches it.
print(max(test_scores))
print(f'best ccp_alpha: {eval_alphas[test_scores.index(max(test_scores))]}')

# The curve labelled "test" is computed on the validation split.
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(eval_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
ax.plot(eval_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()

- `0.00045` 값을 ccp_alpha 값으로 설정하여 실험해본다.

---

In [None]:
num_models = 30  # number of models to ensemble
# One row of test predictions per model. The row width follows the current test frame
# instead of a hard-coded 5271, as the first ensemble experiment in this notebook does.
test_results = np.zeros((num_models, len(x_tt)))

In [None]:
# ensemble loop: one pruned decision tree per undersampling seed, test predictions kept per model
for i in range(num_models):
    seed_i = hparams['seed'] + i  # per-model seed for the sampler and the tree

    # data load & label encoding
    tr_data, tt_data = pp.load_data()
    x_tr, x_tt = pp.label_encoding(tr_data, tt_data)

    # undersample with this model's seed
    rus = RandomUnderSampler(random_state=seed_i)
    x_tr_res, y_tr_res = rus.fit_resample(
        x_tr.drop(columns=['is_converted']),
        x_tr['is_converted'],
    )

    # train / validation split (the split seed is the same for every model)
    x_tr_res['is_converted'] = y_tr_res  # re-attach the target column
    x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(x_tr_res, seed=hparams['seed'])

    # define and train the model with the alpha picked in the search above
    model = DecisionTreeClassifier(random_state=seed_i, ccp_alpha=0.00045)
    model.fit(x_tr.fillna(0), y_tr)

    # test-set prediction, stored in row i of the results array
    x_tt = x_tt.drop(columns=['is_converted', 'id'])
    y_test_pred = model.predict(x_tt.fillna(0))
    test_results[i, :] = y_test_pred

    ### print result of current model ###
    rule = '-' * 20
    print(rule, f'Model {i + 1} results', rule, sep='\n')
    print(f'current seed: {seed_i}')

    # check validation score
    y_val_pred = model.predict(x_val.fillna(0))
    get_clf_eval(y_val, y_val_pred)

    # number of positive predictions, followed by a blank line
    print(sum(y_test_pred), end='\n\n')

In [None]:
# hard voting -> add up every model's 0/1 prediction per test row; predict 1 (positive) when
# the vote count reaches int(num_models / 2) + 1, otherwise 0 (negative)
tmp = np.sum(test_results, axis=0, dtype=int)
majority = num_models // 2 + 1
final_test_pred = (tmp >= majority).astype(int)

In [None]:
# Number of test rows the ensemble predicts as positive.
sum(final_test_pred)

In [None]:
# Hand the ensemble's test predictions to the project's submission helper.
# NOTE(review): f1_val is a hard-coded literal — presumably the mean validation F1 of the loop above; confirm.
make_submission(dir_name='02_use_sampler',
                f1_val=0.8960,
                y_pred=final_test_pred)

**Record**

In [None]:
# Record this experiment (model name, test F1 score, free-text description) via the project helper.
record_experimental_results(model_name='ensemble_decisiontree_w_undersampling_and_ccp',
                            test_f1_score='0.7121752419765665',
                            description='30개 decision tree 사용, decision tree마다 seed 세팅/seed33~62/ccp_alpha 0.00045 사용/mean validation f1 score 0.8960/hard voting/결측치 0으로 처리/')

---

### 실험 06: Cost-Complexity Pruning + `customer_country.1` feature 삭제

**`ccp_alpha` 적정값 찾아보기**

In [None]:
# data load & label encoding & feature deletion
tr_data, tt_data = pp.load_data()
x_tr, x_tt = pp.label_encoding(tr_data, tt_data)
x_tr, x_tt = pp.delete_features(x_tr, x_tt, features=['customer_country.1'])

# undersampling with the base seed (a single run, used for the ccp_alpha search)
rus = RandomUnderSampler(random_state=hparams['seed'])
x_tr_res, y_tr_res = rus.fit_resample(
    x_tr.drop(columns=['is_converted']),
    x_tr['is_converted'],
)

# train / validation split, after re-attaching the target column
x_tr_res = x_tr_res.assign(is_converted=y_tr_res)
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(
    x_tr_res,
    seed=hparams['seed'],
)

In [None]:
# Seeded decision tree; used below only to compute the cost-complexity pruning path.
model = DecisionTreeClassifier(random_state=hparams['seed'])

In [None]:
# Compute the cost-complexity pruning path on the NaN-filled training split.
path = model.cost_complexity_pruning_path(x_tr.fillna(0), y_tr)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
# Fit one seeded tree per candidate alpha on the pruning path (fit() returns the estimator).
clfs = [
    DecisionTreeClassifier(random_state=hparams['seed'], ccp_alpha=alpha).fit(x_tr.fillna(0), y_tr)
    for alpha in ccp_alphas
]

In [None]:
# Leave out the final alpha on the pruning path (per the scikit-learn pruning example it
# prunes the tree down to a single node). The slices are bound to new names instead of
# rebinding `clfs` / `ccp_alphas`, so re-running this cell does not trim one more entry each time.
eval_clfs = clfs[:-1]
eval_alphas = ccp_alphas[:-1]

# Accuracy of every candidate tree on the training and validation splits (NaN -> 0, filled once).
x_tr_filled, x_val_filled = x_tr.fillna(0), x_val.fillna(0)
train_scores = [clf.score(x_tr_filled, y_tr) for clf in eval_clfs]
test_scores = [clf.score(x_val_filled, y_val) for clf in eval_clfs]

# Best validation accuracy and the first alpha that reaches it.
print(max(test_scores))
print(f'best ccp_alpha: {eval_alphas[test_scores.index(max(test_scores))]}')

# The curve labelled "test" is computed on the validation split.
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(eval_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
ax.plot(eval_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()

- `ccp_alpha` 값으로 0.0004 사용

---

In [None]:
num_models = 30  # number of models to ensemble
# One row of test predictions per model. The row width follows the current test frame
# instead of a hard-coded 5271, as the first ensemble experiment in this notebook does.
test_results = np.zeros((num_models, len(x_tt)))

In [None]:
# ensemble loop: one pruned decision tree per undersampling seed, trained without `customer_country.1`
for i in range(num_models):
    # data load & label encoding & delete features
    tr_data, tt_data = pp.load_data()
    x_tr, x_tt = pp.label_encoding(tr_data, tt_data)
    # Fix: bind the result to x_tr / x_tt. It used to be assigned to tr_data / tt_data, so
    # unless the helper mutates its inputs the frames used below still held the feature.
    x_tr, x_tt = pp.delete_features(x_tr, x_tt, features=['customer_country.1'])

    # undersample with a different seed for every model
    rus = RandomUnderSampler(random_state=hparams['seed'] + i)
    x_tr_res, y_tr_res = rus.fit_resample(x_tr.drop(['is_converted'], axis=1), x_tr['is_converted'])

    # train / validation split (the split seed is the same for every model)
    x_tr_res['is_converted'] = y_tr_res  # re-attach the target column
    x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(x_tr_res, seed=hparams['seed'])

    # define a model
    model = DecisionTreeClassifier(random_state=hparams['seed'] + i,
                                   ccp_alpha=0.0004)

    # training
    model.fit(x_tr.fillna(0), y_tr)

    # test
    x_tt = x_tt.drop(['is_converted', 'id'], axis=1)
    y_test_pred = model.predict(x_tt.fillna(0))

    # store this model's predictions in row i of the results array
    test_results[i, :] = y_test_pred

    ### print result of current model ###
    print('-' * 20)
    print(f'Model {i + 1} results')
    print('-' * 20)

    print(f'current seed: {hparams["seed"] + i}')

    # check validation score
    y_val_pred = model.predict(x_val.fillna(0))
    get_clf_eval(y_val, y_val_pred)

    # number of positive predictions
    print(sum(y_test_pred))
    print()

In [None]:
# hard voting -> add up every model's 0/1 prediction per test row; predict 1 (positive) when
# the vote count reaches int(num_models / 2) + 1, otherwise 0 (negative)
tmp = np.sum(test_results, axis=0, dtype=int)
majority = num_models // 2 + 1
final_test_pred = (tmp >= majority).astype(int)

In [None]:
# Number of test rows the ensemble predicts as positive.
sum(final_test_pred)

In [None]:
# Hand the ensemble's test predictions to the project's submission helper.
# NOTE(review): f1_val is a hard-coded literal — presumably the mean validation F1 of the loop above; confirm.
make_submission(dir_name='02_use_sampler',
                f1_val=0.89571,
                y_pred=final_test_pred)

**Record**

In [None]:
# Record this experiment (model name, test F1 score, free-text description) via the project helper.
record_experimental_results(model_name='ensemble_decisiontree_w_undersampling_ccp_delFeature',
                            test_f1_score='0.7031170158405723',
                            description='30개 decision tree 사용, decision tree마다 seed 세팅/seed33~62/ccp_alpha 0.0004 사용/"customer_country.1" feature 삭제/mean validation f1 score 0.89571/hard voting/결측치 0으로 처리/')

---

### 실험 06+: CCP + `customer_country.1` feature 삭제 + 모델별 최적의 ccp_alpha 탐색

In [None]:
num_models = 30  # number of models to ensemble
# One row of test predictions per model. The row width follows the current test frame
# instead of a hard-coded 5271, as the first ensemble experiment in this notebook does.
test_results = np.zeros((num_models, len(x_tt)))

In [None]:
# ensemble loop: for every model, search the ccp_alpha with the best validation accuracy,
# then train a tree with that alpha and keep its test predictions
for i in range(num_models):
    seed_i = hparams['seed'] + i  # per-model seed for the sampler and every tree of this model

    # data load & label encoding & delete features
    tr_data, tt_data = pp.load_data()
    x_tr, x_tt = pp.label_encoding(tr_data, tt_data)
    # Fix: bind the result to x_tr / x_tt. It used to be assigned to tr_data / tt_data, so
    # unless the helper mutates its inputs the frames used below still held the feature.
    x_tr, x_tt = pp.delete_features(x_tr, x_tt, features=['customer_country.1'])

    # undersample with a different seed for every model
    rus = RandomUnderSampler(random_state=seed_i)
    x_tr_res, y_tr_res = rus.fit_resample(x_tr.drop(['is_converted'], axis=1), x_tr['is_converted'])

    # train / validation split (the split seed is the same for every model)
    x_tr_res['is_converted'] = y_tr_res  # re-attach the target column
    x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(x_tr_res, seed=hparams['seed'])

    # NaN -> 0 once per model instead of once per candidate tree
    x_tr_filled = x_tr.fillna(0)
    x_val_filled = x_val.fillna(0)

    ### find a best ccp_alpha value ###
    model = DecisionTreeClassifier(random_state=seed_i)

    path = model.cost_complexity_pruning_path(x_tr_filled, y_tr)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities

    # Fix: candidate trees use the same per-model seed as the pruning path and the final
    # tree (they used the base seed), so the alpha is selected on the tree it is applied to.
    clfs = []
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(random_state=seed_i, ccp_alpha=ccp_alpha)
        clf.fit(x_tr_filled, y_tr)
        clfs.append(clf)

    # leave out the final alpha on the path (per the scikit-learn pruning example it
    # prunes the tree down to a single node)
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

    train_scores = [clf.score(x_tr_filled, y_tr) for clf in clfs]
    test_scores = [clf.score(x_val_filled, y_val) for clf in clfs]

    # first alpha that reaches the best validation accuracy
    best_ccp_alpha = ccp_alphas[test_scores.index(max(test_scores))]
    print(f'best ccp_alpha: {best_ccp_alpha}')
    ### find a best ccp_alpha value ###

    # define a model
    model = DecisionTreeClassifier(random_state=seed_i,
                                   ccp_alpha=best_ccp_alpha)

    # training
    model.fit(x_tr_filled, y_tr)

    # test
    x_tt = x_tt.drop(['is_converted', 'id'], axis=1)
    y_test_pred = model.predict(x_tt.fillna(0))

    # store this model's predictions in row i of the results array
    test_results[i, :] = y_test_pred

    ### print result of current model ###
    print('-' * 20)
    print(f'Model {i + 1} results')
    print('-' * 20)

    print(f'current seed: {seed_i}')

    # check validation score
    y_val_pred = model.predict(x_val_filled)
    get_clf_eval(y_val, y_val_pred)

    # number of positive predictions
    print(sum(y_test_pred))
    print()

In [None]:
# hard voting -> add up every model's 0/1 prediction per test row; predict 1 (positive) when
# the vote count reaches int(num_models / 2) + 1, otherwise 0 (negative)
tmp = np.sum(test_results, axis=0, dtype=int)
majority = num_models // 2 + 1
final_test_pred = (tmp >= majority).astype(int)

In [None]:
# Number of test rows the ensemble predicts as positive.
sum(final_test_pred)

In [None]:
# Hand the ensemble's test predictions to the project's submission helper.
# NOTE(review): f1_val is a hard-coded literal — presumably the mean validation F1 of the loop above; confirm.
make_submission(dir_name='02_use_sampler',
                f1_val=0.900290,
                y_pred=final_test_pred)

**Record**

In [None]:
# Record this experiment (model name, test F1 score, free-text description) via the project helper.
record_experimental_results(model_name='ensemble_decisiontree_w_undersampling_bestccpAlpha',
                            test_f1_score='0.6965020576131687',
                            description='30개 decision tree 사용, decision tree마다 seed 세팅/seed33~62/모델마다 best ccp alpha 찾아서 적용/mean validation f1 score 0.90029/hard voting/결측치 0으로 처리/')

---

### 실험 07: CCP + delete features that have a high nan ratio

**`ccp_alpha` 적정값 찾아보기**

In [None]:
# data load & label encoding & feature deletion
tr_data, tt_data = pp.load_data()
x_tr, x_tt = pp.label_encoding(tr_data, tt_data)
x_tr, x_tt = pp.delete_features(
    x_tr,
    x_tt,
    features=['customer_country.1', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver'],
)

# undersampling with the base seed (a single run, used for the ccp_alpha search)
rus = RandomUnderSampler(random_state=hparams['seed'])
x_tr_res, y_tr_res = rus.fit_resample(
    x_tr.drop(columns=['is_converted']),
    x_tr['is_converted'],
)

# train / validation split, after re-attaching the target column
x_tr_res = x_tr_res.assign(is_converted=y_tr_res)
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(
    x_tr_res,
    seed=hparams['seed'],
)

In [None]:
# Seeded decision tree; used below only to compute the cost-complexity pruning path.
model = DecisionTreeClassifier(random_state=hparams['seed'])

In [None]:
# Compute the cost-complexity pruning path on the NaN-filled training split.
path = model.cost_complexity_pruning_path(x_tr.fillna(0), y_tr)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
# Fit one seeded tree per candidate alpha on the pruning path (fit() returns the estimator).
clfs = [
    DecisionTreeClassifier(random_state=hparams['seed'], ccp_alpha=alpha).fit(x_tr.fillna(0), y_tr)
    for alpha in ccp_alphas
]

In [None]:
# Leave out the final alpha on the pruning path (per the scikit-learn pruning example it
# prunes the tree down to a single node). The slices are bound to new names instead of
# rebinding `clfs` / `ccp_alphas`, so re-running this cell does not trim one more entry each time.
eval_clfs = clfs[:-1]
eval_alphas = ccp_alphas[:-1]

# Accuracy of every candidate tree on the training and validation splits (NaN -> 0, filled once).
x_tr_filled, x_val_filled = x_tr.fillna(0), x_val.fillna(0)
train_scores = [clf.score(x_tr_filled, y_tr) for clf in eval_clfs]
test_scores = [clf.score(x_val_filled, y_val) for clf in eval_clfs]

# Best validation accuracy and the first alpha that reaches it.
print(max(test_scores))
print(f'best ccp_alpha: {eval_alphas[test_scores.index(max(test_scores))]}')

# The curve labelled "test" is computed on the validation split.
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(eval_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
ax.plot(eval_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()

- `ccp_alpha` 값으로 0.00046 사용

---

In [None]:
num_models = 30  # number of models to ensemble
# One row of test predictions per model. The row width follows the current test frame
# instead of a hard-coded 5271, as the first ensemble experiment in this notebook does.
test_results = np.zeros((num_models, len(x_tt)))

In [None]:
# ensemble loop: one pruned decision tree per undersampling seed, trained without
# `customer_country.1` and the three *_strategic_ver features
for i in range(num_models):
    # data load & label encoding & delete features
    tr_data, tt_data = pp.load_data()
    x_tr, x_tt = pp.label_encoding(tr_data, tt_data)
    # Fix: bind the result to x_tr / x_tt. It used to be assigned to tr_data / tt_data, so
    # unless the helper mutates its inputs the frames used below still held the features.
    x_tr, x_tt = pp.delete_features(x_tr, x_tt, features=['customer_country.1', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver'])

    # undersample with a different seed for every model
    rus = RandomUnderSampler(random_state=hparams['seed'] + i)
    x_tr_res, y_tr_res = rus.fit_resample(x_tr.drop(['is_converted'], axis=1), x_tr['is_converted'])

    # train / validation split (the split seed is the same for every model)
    x_tr_res['is_converted'] = y_tr_res  # re-attach the target column
    x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(x_tr_res, seed=hparams['seed'])

    # define a model
    model = DecisionTreeClassifier(random_state=hparams['seed'] + i,
                                   ccp_alpha=0.00046)

    # training
    model.fit(x_tr.fillna(0), y_tr)

    # test
    x_tt = x_tt.drop(['is_converted', 'id'], axis=1)
    y_test_pred = model.predict(x_tt.fillna(0))

    # store this model's predictions in row i of the results array
    test_results[i, :] = y_test_pred

    ### print result of current model ###
    print('-' * 20)
    print(f'Model {i + 1} results')
    print('-' * 20)

    print(f'current seed: {hparams["seed"] + i}')

    # check validation score
    y_val_pred = model.predict(x_val.fillna(0))
    get_clf_eval(y_val, y_val_pred)

    # number of positive predictions
    print(sum(y_test_pred))
    print()

In [None]:
# hard voting -> add up every model's 0/1 prediction per test row; predict 1 (positive) when
# the vote count reaches int(num_models / 2) + 1, otherwise 0 (negative).
# Fix: the threshold was a fixed 5, which is not a majority for num_models = 30; it now
# follows num_models, like the other ensemble experiments in this notebook.
# NOTE(review): the recorded description below mentions 10 trees — confirm the intended setup.
tmp = np.sum(test_results, axis=0, dtype=int)
final_test_pred = np.array([1 if x >= int(num_models / 2) + 1 else 0 for x in tmp])

In [None]:
# Number of test rows the ensemble predicts as positive.
sum(final_test_pred)

In [None]:
# Hand the ensemble's test predictions to the project's submission helper.
# NOTE(review): f1_val is a hard-coded literal — presumably the mean validation F1 of the loop above; confirm.
make_submission(dir_name='02_use_sampler',
                f1_val=0.89734,
                y_pred=final_test_pred)

**Record**

In [None]:
# Record this experiment (model name, test F1 score, free-text description) via the project helper.
# NOTE(review): the description says 10 decision trees / seed33~52, while this section sets
# num_models = 30 — confirm which run this record describes.
record_experimental_results(model_name='ensemble_decisiontree_w_undersampling_ccp_delFeatures',
                            test_f1_score='0.693579766536965',
                            description='10개 decision tree 사용, decision tree마다 seed 세팅/seed33~52/ccp_alpha 0.00046 사용/"customer_country.1, id_strategic_ver, it_strategic_ver, idit_strategic_ver" feature 삭제/mean validation f1 score 0.89734/hard voting/결측치 0으로 처리/')

---