In [None]:
# decision tree
# bagging
    # random forests
# boosting
    # xgboost
    # lightgbm

# common libraries
import os
import gc
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

import warnings; warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from collections import Counter

# Decision Tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree # for rule extraction
# Random Forests
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
# XGBoost
from xgboost import XGBClassifier, XGBRegressor
# LightGBM
from lightgbm import LGBMClassifier, LGBMRegressor

# data
# TODO: pass the path (or buffer) of the input CSV to read_csv; the call
# below has no argument yet, so nothing can be loaded until one is supplied.
# Every later cell reads its features and labels from `data`.
data = pd.read_csv()

In [None]:
# data preparation
# Split `data` into the label vector Y and the feature matrix X, then draw a
# 70/30 train/validation split that every model cell below reuses.
TARGET_COL = None  # TODO: name of the label column in `data`
SPLIT_SEED = 119   # same seed value the model cells in this notebook use

if TARGET_COL is None:
    raise ValueError('Set TARGET_COL to the name of the label column before running this cell.')

Y = data[TARGET_COL]
# The label must be removed from the features, otherwise the models are
# trained on the answer (target leakage).
# TODO: add any other non-feature columns (ids, timestamps, ...) to this list.
X = data.drop(columns=[TARGET_COL])

print('Y shape : ', Y.shape)
print('X shape : ', X.shape)

# splitting data
# Positional row indices are split (not the frames themselves) so the model
# cells can slice X and Y consistently with .iloc[train_idx] / .iloc[valid_idx].
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=.3, random_state=SPLIT_SEED)
print('# of train data : ', len(train_idx))
print('# of valid data : ', len(valid_idx))
# Class counts per split, to spot a label distribution that differs between them.
print('# of train data Y : ', Counter(Y.iloc[train_idx]))
print('# of valid data Y : ', Counter(Y.iloc[valid_idx]))

In [None]:
# -- decision tree --
# Sweep max_depth over 2..5 and compare train vs. validation scores.
# A validation F1 score that falls as depth grows hints at overfitting.

dt_depths = list(range(2, 5 + 1))
dt_valid_f1 = []  # validation F1 per depth, index-aligned with dt_depths

for depth in dt_depths:
    print('--- depth {} ---'.format(depth))

    model = DecisionTreeClassifier(max_depth=depth, criterion='gini')
    model.fit(X.iloc[train_idx], Y.iloc[train_idx])

    # Train Acc
    y_pre_train = model.predict(X.iloc[train_idx])
    cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
    print('Train Confusion Matrix')
    print(cm_train)
    print('Train Acc : {}'.format((cm_train[0, 0] + cm_train[1, 1]) / cm_train.sum()))
    print('Train F1 score : {}\n'.format(f1_score(Y.iloc[train_idx], y_pre_train)))

    # Valid Acc (kept in its own variables so train results are not overwritten)
    y_pre_valid = model.predict(X.iloc[valid_idx])
    cm_valid = confusion_matrix(Y.iloc[valid_idx], y_pre_valid)
    valid_f1 = f1_score(Y.iloc[valid_idx], y_pre_valid)
    print('Valid Confusion Matrix')
    print(cm_valid)
    print('Valid Acc : {}'.format((cm_valid[0, 0] + cm_valid[1, 1]) / cm_valid.sum()))
    print('Valid F1 score : {}\n'.format(valid_f1))

    dt_valid_f1.append(valid_f1)

# best model
# Pick the depth with the highest validation F1; np.argmax returns the first
# maximum, so ties resolve to the shallowest (simplest) tree.
best_depth = dt_depths[int(np.argmax(dt_valid_f1))]
print('Best depth by validation F1 : {}'.format(best_depth))
best_model = DecisionTreeClassifier(max_depth=best_depth, criterion='gini')
best_model.fit(X.iloc[train_idx], Y.iloc[train_idx])

# tree plot
# Plot the refit best model (not the last model left over from the sweep).
# feature_names is passed as a plain list, and class names are taken from the
# fitted estimator's classes_; replace them with readable labels if desired.
plt.rcParams['figure.figsize'] = [20,10]
tree.plot_tree(best_model, filled=True, feature_names=list(X.columns),
               class_names=[str(c) for c in best_model.classes_])
plt.show()

In [None]:
# -- random forests --
# Grid search over the number of trees and the maximum depth, logging the
# validation F1 of every trial so the best setting can be refit later.

# parameters
    # n_estimators : # of trees
    # max_depth
    # criterion : gini, entropy, log_loss
    # min_samples_split : can be raised above the default of 2
    # bootstrap
    # max_features : sqrt, log2, None ('auto' was removed in scikit-learn 1.3)
    # oob_score : out-of-bag score
    # class_weight : adjusts class weights when training on label-imbalanced data
    # random_state

estimators = [10, 30, 40, 50, 60]
depth = [4, 5, 10, 15]

# modeling
# Per-trial logs; all three lists stay index-aligned.
save_est = []
save_dep = []
f1_score_ = []

cnt = 0
for est in estimators:
    for dep in depth:
        print('--- cnt {} ---'.format(cnt))
        cnt+=1
        print('Number of Estimators : {}, Max Depth : {}'.format(est,dep))

        # random_state=119 matches the seed used when the best forest is refit,
        # so the refit reproduces the trial that won the search.
        # max_features='sqrt' is what 'auto' meant for classifiers.
        model = RandomForestClassifier(n_estimators=est,
                                      max_depth=dep,
                                      random_state=119,
                                      criterion='gini',
                                      max_features='sqrt',
                                      bootstrap=True,
                                      oob_score=True) # oob_score=True -> longer time for training
        model.fit(X.iloc[train_idx], Y.iloc[train_idx])

        # Train Acc
        y_pre_train = model.predict(X.iloc[train_idx])
        cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
        print('Train Confusion Matrix')
        print(cm_train)
        print('Train Acc : {}'.format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
        print('Train F1 score : {}'.format(f1_score(Y.iloc[train_idx], y_pre_train)))

        # Valid Acc
        y_pre_test = model.predict(X.iloc[valid_idx])
        cm_valid = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
        valid_f1 = f1_score(Y.iloc[valid_idx], y_pre_test)  # computed once, printed and logged
        print('Valid Confusion Matrix')
        print(cm_valid)
        print('Valid Acc : {}'.format((cm_valid[0,0] + cm_valid[1,1])/cm_valid.sum()))
        print('Valid F1 score : {}\n'.format(valid_f1))

        save_est.append(est)
        save_dep.append(dep)
        f1_score_.append(valid_f1)

In [None]:
# -- AdaBoost --
# Grid search over the number of estimators and the learning rate, logging
# the validation F1 of every trial.

# parameters
    # n_estimators : # of trees
    # learning_rate : trades off against n_estimators

estimators = [70, 90, 100]
learning_rate = [.01, .03, .05, .1, .5]

# modeling
# Per-trial logs; all three lists stay index-aligned.
save_est = []
save_lr = []
f1_score_ = []

cnt = 0
for est in estimators:
    for lr in learning_rate:
        print('>>> cnt {} <<<'.format(cnt))
        cnt+=1
        # The second value is the learning rate (AdaBoost has no depth setting here).
        print('Number of Estimators : {}, Learning Rate : {}'.format(est, lr))

        model = AdaBoostClassifier(n_estimators=est, learning_rate=lr, random_state=119)
        model.fit(X.iloc[train_idx], Y.iloc[train_idx])

        # Train Acc
        y_pre_train = model.predict(X.iloc[train_idx])
        cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
        print('Train Confusion Matrix')
        print(cm_train)
        print('Train Acc : {}'.format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
        print('Train F1 score : {}\n'.format(f1_score(Y.iloc[train_idx], y_pre_train)))

        # Valid Acc
        y_pre_test = model.predict(X.iloc[valid_idx])
        cm_valid = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
        valid_f1 = f1_score(Y.iloc[valid_idx], y_pre_test)  # computed once, printed and logged
        print('Valid Confusion Matrix')
        print(cm_valid)
        print('Valid Acc : {}'.format((cm_valid[0,0] + cm_valid[1,1])/cm_valid.sum()))
        print('Valid F1 score : {}'.format(valid_f1))
        print('-'*60)

        save_est.append(est)
        save_lr.append(lr)
        f1_score_.append(valid_f1)

In [None]:
# -- gradient boosting --
# Grid search over the number of estimators, learning rate and row subsample
# fraction, logging the validation F1 of every trial.

# parameters
    # n_estimators : # of trees
    # learning_rate : trades off against n_estimators
    # max_features : number of features sampled per split
    # subsample : fraction of rows sampled per tree (no bootstrap)
    # max_depth : limits the maximum tree depth

estimators = [10, 20, 50]
learning_rate = [.05, .1, .5]
subsample = [.5, .75, 1]

# modeling
# Per-trial logs; all four lists stay index-aligned.
save_est = []
save_lr = []
save_sub = []
f1_score_ = []

cnt = 0
for est in estimators:
    for lr in learning_rate:
        for sub in subsample:
            print('>>> cnt {} <<<'.format(cnt))
            cnt+=1
            print('Number of Estimators : {}, Learning Rate : {}, Subsample : {}'.format(est, lr, sub))

            # Fixed seed (same value the other model cells use) so trials with
            # subsample < 1 are reproducible between runs.
            model = GradientBoostingClassifier(n_estimators=est,
                                               learning_rate=lr,
                                               subsample=sub,
                                               random_state=119)
            model.fit(X.iloc[train_idx], Y.iloc[train_idx])

            # Train Acc
            y_pre_train = model.predict(X.iloc[train_idx])
            cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
            print('Train Confusion Matrix')
            print(cm_train)
            print('Train Acc : {}'.format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
            print('Train F1 score : {}\n'.format(f1_score(Y.iloc[train_idx], y_pre_train)))

            # Valid Acc
            y_pre_test = model.predict(X.iloc[valid_idx])
            cm_valid = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
            valid_f1 = f1_score(Y.iloc[valid_idx], y_pre_test)  # computed once, printed and logged
            print('Valid Confusion Matrix')
            print(cm_valid)
            print('Valid Acc : {}'.format((cm_valid[0,0] + cm_valid[1,1])/cm_valid.sum()))
            print('Valid F1 score : {}'.format(valid_f1))
            print('-'*60)

            save_est.append(est)
            save_lr.append(lr)
            save_sub.append(sub)
            f1_score_.append(valid_f1)

In [None]:
# -- xgboost --
# Grid search over tree count, learning rate, depth and L1 regularisation,
# logging the validation F1 of every trial.

# objective[default=reg:linear]
    # binary:logistic
    # multi:softmax
    # multi:softprob
# hyperparameters tuning
    # n_estimators, learning_rate, max_depth, reg_alpha

n_tree = [5, 10, 20]
l_rate = [.1, .3]
m_depth = [3, 5]
L1_norm = [.1, .3, .5]

# modeling
# Per-trial logs; all five lists stay index-aligned.
save_n = []
save_l = []
save_m = []
save_L1 = []
f1_score_ = []

cnt = 0
for n in n_tree:
    for l in l_rate:
        for m in m_depth:
            for L1 in L1_norm:
                print('--- cnt {} ---'.format(cnt))
                cnt+=1
                print('Number of Estimators : {}, Learning Rate : {}, '\
                      'Max Depth : {}, L1 Norm : {}'.format(n, l, m, L1))

                # objective depends on the task: the metrics in this cell
                # (2x2 confusion matrix, default binary F1) are binary, hence
                # 'binary:logistic'. Switch to a multi:* objective, and adapt
                # the metrics, for a multi-class label.
                model = XGBClassifier(n_estimators=n,
                                     learning_rate=l,
                                     max_depth=m,
                                     reg_alpha=L1,
                                     objective='binary:logistic',
                                     random_state=119)
                model.fit(X.iloc[train_idx], Y.iloc[train_idx])

                # Train Acc
                y_pre_train = model.predict(X.iloc[train_idx])
                cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
                print('Train Confusion Matrix')
                print(cm_train)
                print('Train Acc : {}'.format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
                print('Train F1 score : {}\n'.format(f1_score(Y.iloc[train_idx], y_pre_train)))

                # Valid Acc
                y_pre_test = model.predict(X.iloc[valid_idx])
                cm_valid = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
                valid_f1 = f1_score(Y.iloc[valid_idx], y_pre_test)  # computed once, printed and logged
                print('Valid Confusion Matrix')
                print(cm_valid)
                print('Valid Acc : {}'.format((cm_valid[0,0] + cm_valid[1,1])/cm_valid.sum()))
                print('Valid F1 score : {}'.format(valid_f1))
                print('-'*60)

                save_n.append(n)
                save_l.append(l)
                save_m.append(m)
                save_L1.append(L1)
                f1_score_.append(valid_f1)

                # saving model
                # joblib.dump(model, 'sample_data/XGBoost_model/Result_{}_{}_{}_{}_{}.pkl'.format(n, l, m, L1, round(f1_score_[-1], 4)))
                # gc.collect()

In [None]:
# -- lightgbm --
# Grid search over tree count, learning rate, depth and L1 regularisation.
# Each trial fits on the training rows, prints train/validation scores and
# logs its settings together with the validation F1.
n_tree = [5, 10, 20]
l_rate = [.1, .3]
m_depth = [3, 5]
L1_norm = [.1, .3, .5]

# modeling
# Per-trial logs; the five lists stay index-aligned.
save_n, save_l, save_m, save_L1, f1_score_ = [], [], [], [], []

# The row slices do not change between trials, so take them once.
lgbm_X_train, lgbm_Y_train = X.iloc[train_idx], Y.iloc[train_idx]
lgbm_X_valid, lgbm_Y_valid = X.iloc[valid_idx], Y.iloc[valid_idx]

# Flattened grid, in the same order four nested loops would visit it.
lgbm_grid = [(n_, l_, m_, L1_)
             for n_ in n_tree
             for l_ in l_rate
             for m_ in m_depth
             for L1_ in L1_norm]

cnt = 0
for n, l, m, L1 in lgbm_grid:
    print('>>> cnt {} <<<'.format(cnt))
    cnt += 1
    print('Number of Estimators : {}, Learning Rate : {}, '\
          'Max Depth : {}, L1 Norm : {}'.format(n, l, m, L1))

    model = LGBMClassifier(
        n_estimators=n,
        learning_rate=l,
        max_depth=m,
        reg_alpha=L1,
        objective='cross_entropy',  # needs to be set for the task at hand
        n_jobs=-1,
        random_state=119,
    )
    model.fit(lgbm_X_train, lgbm_Y_train)

    # Train Acc
    y_pre_train = model.predict(lgbm_X_train)
    cm_train = confusion_matrix(lgbm_Y_train, y_pre_train)
    train_correct = cm_train[0,0] + cm_train[1,1]
    print('Train Confusion Matrix')
    print(cm_train)
    print('Train Acc : {}'.format(train_correct/cm_train.sum()))
    print('Train F1 score : {}\n'.format(f1_score(lgbm_Y_train, y_pre_train)))

    # Valid Acc (cm_train is reused for the validation matrix)
    y_pre_test = model.predict(lgbm_X_valid)
    cm_train = confusion_matrix(lgbm_Y_valid, y_pre_test)
    valid_correct = cm_train[0,0] + cm_train[1,1]
    lgbm_valid_f1 = f1_score(lgbm_Y_valid, y_pre_test)
    print('Valid Confusion Matrix')
    print(cm_train)
    print('Valid Acc : {}'.format(valid_correct/cm_train.sum()))
    print('Valid F1 score : {}'.format(lgbm_valid_f1))
    print('-'*60)

    # Log this trial.
    for log, value in ((save_n, n), (save_l, l), (save_m, m), (save_L1, L1), (f1_score_, lgbm_valid_f1)):
        log.append(value)

    # saving model (disabled)
    # joblib.dump(model, 'sample_data/XGBoost_model/Result_{}_{}_{}_{}_{}.pkl'.format(n, l, m, L1, round(f1_score_[-1], 4)))
    # gc.collect()

In [None]:
# -- getting best model & plotting feature importances --
# Refit the best trial of ONE search cell, report its scores and plot the most
# important features.

# Model family to refit. It must be the search cell that was run most recently:
# every search cell overwrites f1_score_, and several of them share save_* list
# names, so mixing logs from different searches selects the wrong trial.
# One of: 'random_forest', 'adaboost', 'gradient_boosting', 'xgboost', 'lightgbm'
BEST_MODEL_KIND = 'lightgbm'
TOP_N_FEATURES = 10  # number of features shown in the importance plot


def _build_best_model(kind, scores):
    """Instantiate the `kind` classifier with the best trial's hyperparameters.

    Only the branch for `kind` is evaluated, so the save_* lists of the other
    searches are never indexed.

    Parameters
    ----------
    kind : str
        'random_forest', 'adaboost', 'gradient_boosting', 'xgboost' or 'lightgbm'.
    scores : sequence of float
        Validation F1 per trial (f1_score_ from the matching search cell).

    Returns
    -------
    An unfitted classifier configured like the highest-scoring trial.

    Raises
    ------
    ValueError
        If `kind` is unknown, or a hyperparameter log has a different length
        than `scores` (i.e. it was written by a different search cell).
    """
    best = int(np.argmax(scores))

    def pick(saved):
        # A length mismatch proves the log belongs to another search; equal
        # lengths do not prove the opposite, so still run the right cell last.
        if len(saved) != len(scores):
            raise ValueError('Hyperparameter log and f1_score_ have different lengths; '
                             're-run the search cell for {!r} last.'.format(kind))
        return saved[best]

    if kind == 'random_forest':
        return RandomForestClassifier(n_estimators=pick(save_est),
                                      max_depth=pick(save_dep),
                                      random_state=119,
                                      criterion='gini',
                                      max_features='sqrt',  # 'auto' was removed in scikit-learn 1.3
                                      bootstrap=True,
                                      oob_score=False)
    if kind == 'adaboost':
        return AdaBoostClassifier(n_estimators=pick(save_est),
                                  learning_rate=pick(save_lr),
                                  random_state=119)
    if kind == 'gradient_boosting':
        return GradientBoostingClassifier(n_estimators=pick(save_est),
                                          learning_rate=pick(save_lr),
                                          subsample=pick(save_sub),
                                          random_state=119)
    if kind == 'xgboost':
        return XGBClassifier(n_estimators=pick(save_n),
                             learning_rate=pick(save_l),
                             max_depth=pick(save_m),
                             reg_alpha=pick(save_L1),
                             objective='binary:logistic',  # binary task; change for multi-class
                             random_state=119)
    if kind == 'lightgbm':
        return LGBMClassifier(n_estimators=pick(save_n),
                              learning_rate=pick(save_l),
                              max_depth=pick(save_m),
                              reg_alpha=pick(save_L1),
                              objective='cross_entropy',  # needs to be set for the task at hand
                              n_jobs=-1,
                              random_state=119)
    raise ValueError('Unknown BEST_MODEL_KIND: {!r}'.format(kind))


# choosing best model
best_model = _build_best_model(BEST_MODEL_KIND, f1_score_)

# fitting best model
best_model.fit(X.iloc[train_idx], Y.iloc[train_idx])

# Train Acc
y_pre_train = best_model.predict(X.iloc[train_idx])
cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
print('Train Confusion Matrix')
print(cm_train)
print('Train Acc : {}'.format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
print('Train F1 score : {}'.format(f1_score(Y.iloc[train_idx], y_pre_train)))

# Valid Acc
y_pre_test = best_model.predict(X.iloc[valid_idx])
cm_valid = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
print('Valid Confusion Matrix')
print(cm_valid)
print('Valid Acc : {}'.format((cm_valid[0,0] + cm_valid[1,1])/cm_valid.sum()))
print('Valid F1 score : {}\n'.format(f1_score(Y.iloc[valid_idx], y_pre_test)))

# feature importance
# One (score, feature) row per column of X, highest score first.
feature_map = pd.DataFrame(sorted(zip(best_model.feature_importances_, X.columns), reverse=True), columns=['Score', 'Feature'])
print(feature_map)

# feature map
feature_map_top = feature_map.iloc[:TOP_N_FEATURES]
feature_map_20 = feature_map_top  # previous name, kept for any later cell that uses it
plt.figure(figsize=(20,10))
# errwidth is omitted: it is deprecated in recent seaborn, and with one row per
# feature there are no error bars for it to style.
sns.barplot(x='Score', y='Feature',
            data=feature_map_top.sort_values(by='Score', ascending=False)
            )
plt.tight_layout()
plt.show()