In [None]:
# basic library
import numpy as np
import pandas as pd
from collections import Counter

# models
from sklearn.ensemble import RandomForestClassifier

# custom modules
from utils import set_seed, get_clf_eval, make_submission, record_experimental_results
import preprocessing as pp

# preprocessing
from imblearn.under_sampling import RandomUnderSampler

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# metrics
from sklearn.metrics import f1_score

### Global Setting

In [None]:
# Notebook-wide settings; 'seed' feeds set_seed() and the models' random_state.
hparams = dict(seed=33)

In [None]:
# Seed through the project helper before any training so runs are repeatable.
# (Which random number generators set_seed covers is defined in utils, not shown here.)
set_seed(hparams['seed'])

### 실험 01: `RandomForestClassifier()`

**Data preprocessing**

In [None]:
# Load the train/test tables and label-encode them with the project helpers.
# NOTE(review): presumably label_encoding encodes the categorical columns of both
# frames consistently -- confirm in preprocessing.py.
tr_data, tt_data = pp.load_data()
x_tr, x_tt = pp.label_encoding(tr_data, tt_data)

In [None]:
# Separate the target from the features. The whole right-hand side is evaluated
# before anything is rebound, so the label is read from the frame that still has it.
# NOTE: running this cell again without reloading the data raises KeyError,
# because 'is_converted' is then no longer a column of x_tr.
x_tr, y_tr = x_tr.drop(columns=['is_converted']), x_tr['is_converted']

**Model**

In [None]:
# Baseline RandomForestClassifier configuration with every constructor argument
# spelled out; the later rfc_params_0X variants each change one value.
rfc_params_01 = dict(
    n_estimators=30,  # number of decision trees in the forest
    criterion='gini',  # function used to measure the quality of a split
    max_depth=None,  # maximum depth of each tree (None: no explicit limit)
    min_samples_split=2,  # minimum number of samples required to split an internal node
    min_samples_leaf=1,  # minimum number of samples required at a leaf node
    min_weight_fraction_leaf=0.0,  # minimum weighted fraction of the total sample weight required at a leaf
    max_features=None,  # number of features considered when looking for the best split (None: all)
    max_leaf_nodes=None,  # cap on the number of leaves per tree (None: unlimited)
    min_impurity_decrease=0.0,  # a node is split only if impurity decreases by at least this value
    bootstrap=True,  # build each tree on a bootstrap sample (False: every tree uses the whole dataset)
    oob_score=True,  # estimate generalization with out-of-bag samples (only available when bootstrap=True)
    n_jobs=None,  # number of parallel jobs (-1 uses all CPU cores)
    random_state=hparams['seed'],  # seed for the randomness used while building the forest
    verbose=1,  # how much progress information is printed while fitting/predicting
    warm_start=False,  # True would reuse the previous fit and add trees; kept False for reproducibility
    class_weight="balanced_subsample",  # class weights inversely proportional to class frequency, recomputed per bootstrap sample
    ccp_alpha=0.00045,  # complexity parameter for minimal cost-complexity pruning
    max_samples=None,  # number of samples drawn for each tree when bootstrapping (None: as many as there are rows)
    # monotonic_cst=None,  # available in scikit-learn >= 1.4
)

In [None]:
# Variant 02: same as rfc_params_01 except for the number of trees.
rfc_params_02 = dict(
    n_estimators=100,  # raised from 30 (rfc_params_01) to 100
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=None,
    random_state=hparams['seed'],
    verbose=1,
    warm_start=False,
    class_weight="balanced_subsample",
    ccp_alpha=0.00045,
    max_samples=None,
)

In [None]:
# Variant 03: same as rfc_params_02 except for max_features.
rfc_params_03 = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',  # changed from None (rfc_params_02) to 'sqrt'
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=None,
    random_state=hparams['seed'],
    verbose=1,
    warm_start=False,
    class_weight="balanced_subsample",
    ccp_alpha=0.00045,
    max_samples=None,
)

In [None]:
# Variant 04: same as rfc_params_02 except for a slightly larger pruning alpha.
rfc_params_04 = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=None,
    random_state=hparams['seed'],
    verbose=1,
    warm_start=False,
    class_weight="balanced_subsample",
    ccp_alpha=0.0005,  # 0.00005 higher than in rfc_params_02
    max_samples=None,
)

In [None]:
# Variant 05: same as rfc_params_02 except for a slightly smaller pruning alpha.
rfc_params_05 = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=None,
    random_state=hparams['seed'],
    verbose=1,
    warm_start=False,
    class_weight="balanced_subsample",
    ccp_alpha=0.0004,  # 0.00005 lower than in rfc_params_02
    max_samples=None,
)

In [None]:
# Build the forest from the variant under test (rfc_params_05).
rfc = RandomForestClassifier(**rfc_params_05)

**Training**

In [None]:
# Fit on the training features with missing values replaced by 0.
# NOTE(review): 0 may coincide with a real label-encoded category -- confirm this
# imputation is intended.
rfc.fit(x_tr.fillna(0), y_tr)

In [None]:
# Rank the features by the fitted forest's importance scores, highest first.
fi = pd.Series(rfc.feature_importances_, index=x_tr.columns).sort_values(ascending=False)

# One horizontal bar per feature, drawn in ranked order.
sns.barplot(x=fi, y=fi.index)

**Make a submission file**

In [None]:
# Remove 'is_converted' and 'id' from the test frame before prediction.
# errors='ignore' keeps the cell re-runnable: without it, a second execution
# raises KeyError because the columns have already been dropped.
x_tt = x_tt.drop(columns=['is_converted', 'id'], errors='ignore')

In [None]:
# Predict on the test features, using the same fillna(0) imputation as in training.
y_test_pred = rfc.predict(x_tt.fillna(0))
# Sanity check: total of the predicted labels, i.e. the number of positive
# predictions when the labels are boolean / 0-1 (presumably the case for is_converted).
y_test_pred.sum()

In [None]:
# Identifier passed to both make_submission and record_experimental_results below.
model_name = 'randomForest_params05'

In [None]:
# Hand the test predictions to the project helper that produces the submission.
make_submission(
    dir_name='03_random_forest',
    y_pred=y_test_pred,
    model_name=model_name,
)

**Record**

In [None]:
# Log this run's score and a short description through the project helper.
# NOTE(review): the score is a hand-entered string literal -- confirm the helper
# expects a string and that the value matches the actual evaluation result.
record_experimental_results(
    model_name=model_name,
    test_f1_score='0.7178936055883933',
    description='randomForest_params02 실험에서 ccp_alpha 값을 0.00005 만큼 감소',
)

---

### 실험 02: Skew data transformation

In [None]:
# Reload and re-encode the data; this rebinds x_tr / x_tt, replacing the frames
# that experiment 01 modified.
tr_data, tt_data = pp.load_data()
x_tr, x_tt = pp.label_encoding(tr_data, tt_data)

In [None]:
# Apply log1p (log(1 + x)) to the skewed feature in both the train and test frames.
# NOTE: re-running this cell applies the transform a second time; reload the
# data first if it has to be executed again.
skewed_col = 'lead_desc_length'
x_tr[skewed_col] = np.log1p(x_tr[skewed_col])
x_tt[skewed_col] = np.log1p(x_tt[skewed_col])

In [None]:
# Separate the target from the features. The whole right-hand side is evaluated
# before anything is rebound, so the label is read from the frame that still has it.
x_tr, y_tr = x_tr.drop(columns=['is_converted']), x_tr['is_converted']

In [None]:
# Re-declares rfc_params_05 with the same values used in experiment 01, so the
# only difference in this experiment is the log transform of the input feature.
rfc_params_05 = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=None,
    random_state=hparams['seed'],
    verbose=1,
    warm_start=False,
    class_weight="balanced_subsample",
    ccp_alpha=0.0004,  # 0.00005 lower than in rfc_params_02
    max_samples=None,
)

In [None]:
# Build a fresh forest from rfc_params_05 for the log-transform experiment.
rfc = RandomForestClassifier(**rfc_params_05)

**Training**

In [None]:
# Fit on the transformed training features with missing values replaced by 0.
# NOTE(review): 0 may coincide with a real label-encoded category -- confirm this
# imputation is intended.
rfc.fit(x_tr.fillna(0), y_tr)

In [None]:
# Rank the features by the fitted forest's importance scores, highest first.
fi = pd.Series(rfc.feature_importances_, index=x_tr.columns).sort_values(ascending=False)

# One horizontal bar per feature, drawn in ranked order.
sns.barplot(x=fi, y=fi.index)

**Make a submission file**

In [None]:
# Remove 'is_converted' and 'id' from the test frame before prediction.
# errors='ignore' keeps the cell re-runnable: without it, a second execution
# raises KeyError because the columns have already been dropped.
x_tt = x_tt.drop(columns=['is_converted', 'id'], errors='ignore')

In [None]:
# Compare the distribution of the log-transformed 'lead_desc_length' between the
# two target classes.
# The classes are selected with masks built from y_tr rather than by writing the
# label back into x_tr: putting 'is_converted' into the feature frame would leak
# the target into any later (or re-run) fit and would make x_tr.columns disagree
# with the fitted model's feature importances.
converted_mask = y_tr == True
not_converted_mask = y_tr == False
ax = sns.kdeplot(x_tr.loc[converted_mask, 'lead_desc_length'], label='is_converted == True')
sns.kdeplot(x_tr.loc[not_converted_mask, 'lead_desc_length'], label='is_converted == False', ax=ax)
# Label the curves so the figure can be read on its own.
ax.set_title('lead_desc_length (log1p) by is_converted')
ax.legend();

In [None]:
# Predict on the test features, using the same fillna(0) imputation as in training.
y_test_pred = rfc.predict(x_tt.fillna(0))
# Sanity check: total of the predicted labels, i.e. the number of positive
# predictions when the labels are boolean / 0-1 (presumably the case for is_converted).
y_test_pred.sum()

In [None]:
# Identifier passed to both make_submission and record_experimental_results below.
model_name = 'randomForest_params05_logTransform'

In [None]:
# Hand the test predictions to the project helper that produces the submission.
make_submission(
    dir_name='03_random_forest',
    y_pred=y_test_pred,
    model_name=model_name,
)

**Record**

In [None]:
# Log this run's score and a short description through the project helper.
# NOTE(review): the score literal is identical to the one recorded for
# 'randomForest_params05' in experiment 01 -- confirm it is the measured result
# for this run and not a copy-paste leftover.
record_experimental_results(
    model_name=model_name,
    test_f1_score='0.7178936055883933',
    description='randomForest_params05 실험에서 lead_desc_length feature에 log transformation 적용',
)

---