### Library import

In [None]:
# basic library
import numpy as np
import pandas as pd

# models
from sklearn.tree import DecisionTreeClassifier

# custom modules
from utils import set_seed, get_clf_eval, make_submission, record_experimental_results
import preprocessing as pp

### Global Setting

In [None]:
# Notebook-wide settings; `seed` is reused for RNG seeding and data splitting.
hparams = dict(seed=33)

In [None]:
# Hand the configured seed to the project's `set_seed` helper (from utils).
# NOTE(review): presumably seeds the global RNGs — confirm which libraries it covers.
set_seed(hparams['seed'])

---

### 실험 01: Baseline

In [None]:
# Load the train / test data
tr_data, tt_data = pp.load_data()

**Data preprocessing**

In [None]:
def baseline_pp():
    """Prepare the baseline inputs: label-encode, then split train / validation.

    Reads the notebook-level ``tr_data``, ``tt_data`` and ``hparams`` rather
    than taking arguments, so those must be defined before calling.

    Returns:
        A pair ``((x_tr, y_tr, x_val, y_val), x_tt)``: the four pieces of the
        train / validation split, followed by the label-encoded test data.
    """
    encoded_train, encoded_test = pp.label_encoding(tr_data, tt_data)

    # The split uses the notebook-wide seed.
    split = pp.split_train_and_validation(encoded_train, seed=hparams['seed'])
    train_x, train_y, valid_x, valid_y = split

    return ((train_x, train_y, valid_x, valid_y), encoded_test)

In [None]:
# Run the baseline preprocessing; it returns ((x_tr, y_tr, x_val, y_val), x_tt).
data_tr, x_tt = baseline_pp()
# Unpack the train / validation pieces into individual names.
x_tr, y_tr, x_val, y_val = data_tr

**Select a model**

In [None]:
# Pin the tree's RNG to the experiment seed so the fit does not depend on the
# global random state (i.e. on which cells happened to run before this one).
# NOTE(review): scores recorded from earlier unseeded runs may shift slightly.
model = DecisionTreeClassifier(random_state=hparams['seed'])

**Training**

In [None]:
# Fit on the training split; missing values are replaced with 0 first.
model.fit(x_tr.fillna(0), y_tr)

**Check validation score**

In [None]:
# Predict on the validation split, filling missing values with 0 as in training.
y_val_pred = model.predict(x_val.fillna(0))

In [None]:
# Evaluate the validation predictions against the true labels with the
# project's `get_clf_eval` helper (from utils).
# NOTE(review): presumably reports precision / recall / F1 — confirm in utils.
get_clf_eval(y_val, y_val_pred)

**Make a submission file**

In [None]:
# Remove the target and id columns from the test data before prediction.
# `errors='ignore'` keeps this cell safe to re-run: x_tt is overwritten by its
# own result, so a second execution would otherwise raise KeyError because the
# two columns are already gone.
x_tt = x_tt.drop(columns=['is_converted', 'id'], errors='ignore')

In [None]:
# Predict on the test features (missing values filled with 0).
y_test_pred = model.predict(x_tt.fillna(0))
# Show the sum of the predictions.
# NOTE(review): this is the number of predicted positives only if labels are 0/1 or bool — confirm.
sum(y_test_pred)

In [None]:
# Hand the test predictions to the project's `make_submission` helper (from utils).
# NOTE(review): f1_val is a hard-coded literal, presumably copied from the
# validation output above — it will not update when the model is re-run.
make_submission(dir_name='01_decision_tree',
                f1_val=0.7917,
                y_pred=y_test_pred)

**Record**

In [None]:
# Log this experiment with the project's `record_experimental_results` helper.
# NOTE(review): test_f1_score is passed as a string literal, not a float —
# confirm that is what the helper expects.
record_experimental_results(model_name='decision_tree',
                            test_f1_score='0.4754558204768583',
                            description='모든 feature 사용. Label encoding 외 다른 전처리 X. 결측치는 0으로 채움. 시드 33 사용. Recall > Precision.')

---

### 실험 02: 결측치 비율이 높거나 correlation이 높은 features 삭제 후 학습
- 결측치 비율 0.5 이상인 features 삭제
- `customer_country.1` feature 삭제 (`customer_country` feature와 correlation이 1에 가까움)

**Data preprocessing**

In [None]:
# Load the train / test data
tr_data, tt_data = pp.load_data()

# Drop the features with a high missing-value ratio
# (which features are removed is decided by the defaults of pp.delete_features)
tr_data, tt_data = pp.delete_features(tr_data, tt_data)

In [None]:
# Collect the names of the remaining categorical (object-dtype) features
cat_features = [
    name for name in tr_data.columns
    if tr_data[name].dtype == 'object'
]

cat_features

In [None]:
# Encode the categorical features as numeric features, then split into train / validation
tr_data, tt_data = pp.label_encoding(tr_data, tt_data, features=cat_features)
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(tr_data, seed=hparams['seed'])

In [None]:
# Sanity check: print the row count of the full training frame and of each split
print(f'number of original data: {len(tr_data)}')
print('-' * 20)
for split_name, split_frame in (('training', x_tr),
                                ('validation', x_val),
                                ('test', tt_data)):
    print(f'number of {split_name} data: {len(split_frame)}')

**Select a model**

In [None]:
# Pin the tree's RNG to the experiment seed so the fit does not depend on the
# global random state (i.e. on which cells happened to run before this one).
# NOTE(review): scores recorded from earlier unseeded runs may shift slightly.
model = DecisionTreeClassifier(random_state=hparams['seed'])

**Training**

In [None]:
# Fit on the training split; missing values are replaced with 0 first.
model.fit(x_tr.fillna(0), y_tr)

**Check validation score**

In [None]:
# Predict on the validation split (missing values filled with 0), then pass
# the true labels and predictions to the project's `get_clf_eval` helper.
y_val_pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, y_val_pred)

**Make a submission file**

In [None]:
# Build the test feature matrix: everything except the target and id columns.
x_tt = tt_data.drop(columns=['is_converted', 'id'])

In [None]:
# Predict on the test features (missing values filled with 0).
y_test_pred = model.predict(x_tt.fillna(0))
# Show the sum of the predictions.
# NOTE(review): this is the number of predicted positives only if labels are 0/1 or bool — confirm.
sum(y_test_pred)

In [None]:
# Hand the test predictions to the project's `make_submission` helper (from utils).
# NOTE(review): f1_val is a hard-coded literal, presumably copied from the
# validation output above. dir_name is the same '01_decision_tree' used by the
# other experiments — confirm earlier submissions are not overwritten.
make_submission(dir_name='01_decision_tree',
                f1_val=0.7553,
                y_pred=y_test_pred)

**Record**

In [None]:
# Log this experiment with the project's `record_experimental_results` helper.
# NOTE(review): test_f1_score is passed as a string literal, not a float —
# confirm that is what the helper expects.
record_experimental_results(model_name='decision_tree',
                            test_f1_score='0.3950995405819296',
                            description='절반 정도의 feature만 사용. Label encoding 외 다른 전처리 X. 결측치는 0으로 채움. 시드 33 사용. Recall > Precision. 이전 실험 대비 validation f1 score 감소')

---

### 실험 03: `customer_country` feature 정규화 후 학습
- `customer_country` feature와 correlation이 높은 `customer_country.1` feature는 삭제
- 그 외 feature는 그대로 사용하며, 결측치는 0으로 채움

**Data preprocessing**

In [None]:
# Reload the train / test data, then drop only the `customer_country.1` feature.
tr_data, tt_data = pp.load_data()
tr_data, tt_data = pp.delete_features(tr_data, tt_data, features=['customer_country.1'])

In [None]:
# Normalize the country names in both frames (train first, then test),
# then print the first rows of each normalized column for a quick visual check.
frames = (tr_data, tt_data)
for frame in frames:
    frame['customer_country'] = pp.normalize_country_name(frame['customer_country'])

for frame in frames:
    print(frame['customer_country'].head())

In [None]:
# Collect the names of the remaining categorical (object-dtype) features
cat_features = [
    name for name in tr_data.columns
    if tr_data[name].dtype == 'object'
]

cat_features

In [None]:
# Label-encode the categorical features, then split into train / validation
# using the notebook-wide seed.
tr_data, tt_data = pp.label_encoding(tr_data, tt_data, features=cat_features)
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(tr_data, seed=hparams['seed'])

**Training**

In [None]:
# Pin the tree's RNG to the experiment seed so the fit does not depend on the
# global random state (i.e. on which cells happened to run before this one).
# NOTE(review): scores recorded from earlier unseeded runs may shift slightly.
model = DecisionTreeClassifier(random_state=hparams['seed'])

In [None]:
# Fit on the training split; missing values are replaced with 0 first.
model.fit(x_tr.fillna(0), y_tr)

**Check validation score**

In [None]:
# Predict on the validation split (missing values filled with 0), then pass
# the true labels and predictions to the project's `get_clf_eval` helper.
y_val_pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, y_val_pred)

**Make a submission file**

In [None]:
# Build the test feature matrix: everything except the target and id columns.
x_tt = tt_data.drop(columns=['is_converted', 'id'])

In [None]:
# Predict on the test features (missing values filled with 0).
y_test_pred = model.predict(x_tt.fillna(0))
# Show the sum of the predictions.
# NOTE(review): this is the number of predicted positives only if labels are 0/1 or bool — confirm.
sum(y_test_pred)

In [None]:
# Hand the test predictions to the project's `make_submission` helper (from utils).
# NOTE(review): f1_val is a hard-coded literal, presumably copied from the
# validation output above. dir_name is the same '01_decision_tree' used by the
# other experiments — confirm earlier submissions are not overwritten.
make_submission(dir_name='01_decision_tree',
                f1_val=0.7780,
                y_pred=y_test_pred)

**Record**

In [None]:
# Log this experiment with the project's `record_experimental_results` helper.
# NOTE(review): test_f1_score is passed as a string literal, not a float —
# confirm that is what the helper expects.
record_experimental_results(model_name='decision_tree',
                            test_f1_score='0.42424242424242425',
                            description='customer_country.1 feature 삭제. customer_country feature 정규화. Label encoding 외 다른 전처리 X. 결측치는 0으로 채움. 시드 33 사용. 실험 01 대비 validation f1 score 감소')

---

### 실험 04: GridSearchCV 이용
- feature 전처리를 하면 할수록 성능이 떨어지고 있기 때문에, feature 전처리 X
- 동일한 모델을 사용하되, `GridSearchCV()`를 사용하여 `DecisionTreeClassifier`에 대한 최적의 parameter를 찾아본다.
- 실험 01에서 validation f1 score에 비해 test f1 score가 낮게 나온 이유가 training data에 overfitting 되었기 때문이 아닌가 하는 생각이 들었음.
- grid search로 validation score가 가장 좋은 hparam을 찾는다면, overfitting 문제를 해결하면서 test f1 score를 높일 수 있을 것 같음.
- 만약 이 실험으로도 test f1 score가 낮게 나온다면, 아마도 training data와 test data의 is_converted 비율이 극도로 달라서 생기는 문제가 아닐까 싶음.

**Data preprocessing**

In [None]:
# Load the train / test data
tr_data, tt_data = pp.load_data()

In [None]:
# Label-encode the train / test data
tr_data, tt_data = pp.label_encoding(tr_data, tt_data)

# Split the encoded training data into train / validation sets using the notebook-wide seed
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(tr_data, seed=hparams['seed'])

**Training**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search grid for the decision tree: depth and both sample-size limits each
# sweep the even values 2, 4, ..., 18; random_state is fixed to the notebook seed.
dtc_hparams = dict(
    max_depth=list(range(2, 20, 2)),
    min_samples_split=list(range(2, 20, 2)),
    min_samples_leaf=list(range(2, 20, 2)),
    random_state=[hparams['seed']],
)

In [None]:
# Search the decision-tree grid with 5-fold CV and refit the best candidate on
# the full training split.
# The search is scored with F1 because every experiment in this notebook is
# judged by F1. Without `scoring`, GridSearchCV falls back to the estimator's
# default score (accuracy for classifiers), so the "best" parameters would be
# selected by a different metric than the one being reported.
# NOTE(review): 'f1' assumes a binary target whose positive class is 1/True — confirm.
dtc = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=dtc_hparams,
    scoring='f1',
    refit=True,
    cv=5,
    verbose=4,
)

In [None]:
# Run the grid search on the training split; missing values are replaced with 0 first.
dtc.fit(x_tr.fillna(0), y_tr)

In [None]:
# Show the parameter combination the search selected as best.
dtc.best_params_

**Check validation score**

In [None]:
# Predict on the validation split through the fitted search object (it was
# built with refit=True), then pass labels and predictions to `get_clf_eval`.
y_val_pred = dtc.predict(x_val.fillna(0))
get_clf_eval(y_val, y_val_pred)

**Make a submission file**

In [None]:
# Build the test feature matrix: everything except the target and id columns.
x_tt = tt_data.drop(columns=['is_converted', 'id'])

In [None]:
# Predict on the test features (missing values filled with 0) through the fitted search object.
y_test_pred = dtc.predict(x_tt.fillna(0))
# Show the sum of the predictions.
# NOTE(review): this is the number of predicted positives only if labels are 0/1 or bool — confirm.
sum(y_test_pred)

In [None]:
# Hand the test predictions to the project's `make_submission` helper (from utils).
# NOTE(review): f1_val is a hard-coded literal, presumably copied from the
# validation output above. dir_name is the same '01_decision_tree' used by the
# other experiments — confirm earlier submissions are not overwritten.
make_submission(dir_name='01_decision_tree',
                f1_val=0.7536,
                y_pred=y_test_pred)

**Record**

In [None]:
# Log this experiment with the project's `record_experimental_results` helper.
# NOTE(review): test_f1_score is passed as a string literal, not a float —
# confirm that is what the helper expects.
record_experimental_results(model_name='decision_tree',
                            test_f1_score='0.3262032085561497',
                            description='GridSearchCV 사용. Label encoding 외 다른 전처리 X. 결측치는 0으로 채움. 시드 33 사용. 실험 01 대비 validation f1 score 감소')