In [3]:
# !git clone --recursive https://github.com/Microsoft/LightGBM
# ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
from google.colab import drive
drive.mount('/content/drive')

!pip uninstall lightgbm
!pip install lightgbm

import os
import sys
import time
import random
import logging
import typing as tp
from pathlib import Path
from contextlib import contextmanager

from matplotlib import pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
!pip install catboost
!pip install category_encoders
import category_encoders as ce
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoost, Pool

%matplotlib inline

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Uninstalling lightgbm-3.1.1:
  Would remove:
    /usr/local/lib/python3.6/dist-packages/lightgbm-3.1.1.dist-info/*
    /usr/local/lib/python3.6/dist-packages/lightgbm/*
Proceed (y/n)? y
  Successfully uninstalled lightgbm-3.1.1
Collecting lightgbm
  Using cached https://files.pythonhosted.org/packages/70/cd/2b7783e8c250f8191b72e9a0010e0429a799d3305c27764d7bf113dfd078/lightgbm-3.1.1-py2.py3-none-manylinux1_x86_64.whl
Installing collected packages: lightgbm
Successfully installed lightgbm-3.1.1


Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/20/37/bc4e0ddc30c07a96482abf1de7ed1ca54e59bba2026a33bca6d2ef286e5b/catboost-0.24.4-cp36-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.8MB 97kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4


In [13]:
# Read the train, test and sample-submission CSVs from the same Drive folder.
path = '/content/drive/Shareddrives/dacon/Playground/'
train, test, smpl_sub = (
    pd.read_csv(f"{path}{csv_name}")
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
# Show the first rows transposed, so each column of `train` becomes a row.
train.head().transpose()

id,1,2,3,4,6
cat0,A,B,A,A,A
cat1,B,A,A,A,B
cat2,A,A,A,A,A
cat3,A,A,C,C,A
cat4,B,B,B,B,B
cat5,D,B,D,D,B
cat6,A,A,A,A,A
cat7,E,E,B,E,E
cat8,C,A,C,G,C
cat9,I,F,N,K,F


In [25]:
@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Time the body of a ``with`` block and report the elapsed seconds.

    Args:
        logger: Object exposing ``info(str)``. When falsy, the message is
            written with ``print`` instead.
        format_str: ``str.format`` template that receives the elapsed time in
            seconds as its single positional argument.
        prefix: Optional value, converted with ``str`` and prepended to
            ``format_str``. It becomes part of the template, so literal braces
            in it are interpreted by ``str.format``.
        suffix: Optional value, converted with ``str`` and appended to
            ``format_str`` (same caveat about braces).

    Yields:
        None. Control returns to the ``with`` body.
    """
    if prefix:
        format_str = str(prefix) + format_str
    if suffix:
        format_str = format_str + str(suffix)
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments that happen while the block runs.
    start = time.perf_counter()
    try:
        yield
    finally:
        # Report in ``finally`` so the elapsed time is still emitted when the
        # timed block raises; the exception then propagates unchanged.
        out_str = format_str.format(time.perf_counter() - start)
        if logger:
            logger.info(out_str)
        else:
            print(out_str)

In [9]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between two equally shaped inputs.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predicted values.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2))`` computed in floating point.
    """
    # Coerce to float arrays first: plain lists do not support `-`, and
    # unsigned-integer arrays would wrap around on negative differences.
    err = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(err ** 2))

In [10]:
class TreeModel:
    """Wrapper for LightGBM/XGBoost/CATBoost

    ``model_type`` selects the backend: ``"lgb"``, ``"xgb"`` or ``"cat"``.
    Any other value makes every method raise ``NotImplementedError``.
    After ``train`` the fitted backend object is kept in ``self.model`` and
    the backend-specific datasets in ``self.trn_data`` / ``self.val_data``.
    """
    def __init__(self, model_type: str):
        self.model_type = model_type
        self.trn_data = None
        self.val_data = None
        self.model = None

    def _unsupported(self) -> NotImplementedError:
        """Build the error raised for an unknown ``model_type``."""
        return NotImplementedError(
            f"Unsupported model_type: {self.model_type!r}")

    def train(self,
              params: dict,
              X_train: pd.DataFrame, y_train: np.ndarray,
              X_val: pd.DataFrame, y_val: np.ndarray,
              train_weight: tp.Optional[np.ndarray] = None,
              val_weight: tp.Optional[np.ndarray] = None,
              train_params: tp.Optional[dict] = None,
              cat_cols: tp.Optional[list] = None,
            ):
        """Fit the selected backend on the training split, evaluating on the validation split.

        Args:
            params: Model parameters handed to the backend
                (``lgb.train`` / ``xgb.train`` / ``CatBoost(params)``).
            X_train, y_train: Training features and target.
            X_val, y_val: Validation features and target.
            train_weight, val_weight: Optional per-row weights for the two splits.
            train_params: Extra keyword arguments forwarded to the backend's
                training call; ``None`` means no extra arguments.
            cat_cols: Passed as ``cat_features`` to the CatBoost pools; not
                used by the other two backends.

        Raises:
            NotImplementedError: If ``model_type`` is not a supported backend.
        """
        # The default is None, which cannot be unpacked with `**` below.
        train_params = dict(train_params) if train_params else {}

        if self.model_type == "lgb":
            self.trn_data = lgb.Dataset(X_train, label=y_train, weight=train_weight)
            self.val_data = lgb.Dataset(X_val, label=y_val, weight=val_weight)
            self.model = lgb.train(params=params,
                                   train_set=self.trn_data,
                                   valid_sets=[self.trn_data, self.val_data],
                                   **train_params)
        elif self.model_type == "xgb":
            self.trn_data = xgb.DMatrix(X_train, y_train, weight=train_weight)
            self.val_data = xgb.DMatrix(X_val, y_val, weight=val_weight)
            self.model = xgb.train(params=params,
                                   dtrain=self.trn_data,
                                   evals=[(self.trn_data, "train"), (self.val_data, "val")],
                                   **train_params)
        elif self.model_type == "cat":
            # Weights are forwarded here as well, so all three backends treat
            # `train_weight` / `val_weight` the same way.
            self.trn_data = Pool(
                X_train, label=y_train, cat_features=cat_cols, weight=train_weight)
            self.val_data = Pool(
                X_val, label=y_val, cat_features=cat_cols, weight=val_weight)
            self.model = CatBoost(params)
            self.model.fit(
                self.trn_data, eval_set=[self.val_data], use_best_model=True, **train_params)
        else:
            raise self._unsupported()

    def predict(self, X: pd.DataFrame):
        """Predict with the fitted model, limited to the best iteration where the backend records one.

        Raises:
            NotImplementedError: If ``model_type`` is not a supported backend.
        """
        if self.model_type == "lgb":
            return self.model.predict(
                X, num_iteration=self.model.best_iteration)  # type: ignore
        elif self.model_type == "xgb":
            X_DM = xgb.DMatrix(X)
            # `best_ntree_limit` only exists on older xgboost boosters trained
            # with early stopping; keep using it when present.
            best_ntree_limit = getattr(self.model, "best_ntree_limit", None)
            if best_ntree_limit is not None:
                return self.model.predict(
                    X_DM, ntree_limit=best_ntree_limit)  # type: ignore
            # Newer xgboost drops `ntree_limit`; `iteration_range` is the
            # replacement (the upper bound is exclusive, hence the +1).
            best_iteration = getattr(self.model, "best_iteration", None)
            if best_iteration is not None:
                return self.model.predict(
                    X_DM, iteration_range=(0, best_iteration + 1))  # type: ignore
            # No early-stopping information recorded: use the whole model.
            return self.model.predict(X_DM)
        elif self.model_type == "cat":
            return self.model.predict(X)
        else:
            raise self._unsupported()

    @property
    def feature_names_(self):
        """Feature names reported by the fitted backend.

        For XGBoost these are the keys of ``get_score(importance_type="gain")``,
        so they line up one-to-one with ``feature_importances_``.
        NOTE(review): per the xgboost docs ``get_score`` omits features that
        were never used in a split — confirm if a full list is needed.
        """
        if self.model_type == "lgb":
            return self.model.feature_name()
        elif self.model_type == "xgb":
            return list(self.model.get_score(importance_type="gain").keys())
        elif self.model_type == "cat":
            return self.model.feature_names_
        else:
            raise self._unsupported()

    @property
    def feature_importances_(self):
        """Feature importances reported by the fitted backend.

        LightGBM and XGBoost are queried with ``importance_type="gain"``;
        CatBoost returns its own ``feature_importances_`` attribute.
        """
        if self.model_type == "lgb":
            return self.model.feature_importance(importance_type="gain")
        elif self.model_type == "xgb":
            return list(self.model.get_score(importance_type="gain").values())
        elif self.model_type == "cat":
            return self.model.feature_importances_
        else:
            raise self._unsupported()

In [12]:
# Column names used throughout the notebook.
ID_COL = "id"
CAT_COLS = ["cat{}".format(idx) for idx in range(10)]    # cat0 .. cat9
CONT_COLS = ["cont{}".format(idx) for idx in range(14)]  # cont0 .. cont13
TGT_COL = "target"

# Cross-validation setup: number of folds and the seeds to repeat it with.
N_SPLITS = 10
RANDOM_SEED_LIST = [42]

In [14]:
# Start with an empty list of model input columns and id-only frames that
# later cells extend with feature columns.
use_feat_cols = list()
train_feat = train.loc[:, [ID_COL]].copy()
test_feat = test.loc[:, [ID_COL]].copy()

In [15]:
# Fit the ordinal encoder on the training categoricals, then apply the same
# fitted encoder to the test categoricals.
ord_enc = ce.OrdinalEncoder(cols=CAT_COLS)
train_cat_feat = ord_enc.fit_transform(train.loc[:, CAT_COLS])
test_cat_feat = ord_enc.transform(test.loc[:, CAT_COLS])

  elif pd.api.types.is_categorical(cols):


In [16]:
# Attach the encoded categorical columns to both feature frames and register
# them as model inputs.
train_feat = pd.concat((train_feat, train_cat_feat), axis="columns")
test_feat = pd.concat((test_feat, test_cat_feat), axis="columns")
use_feat_cols += list(train_cat_feat.columns)

In [17]:
# The continuous columns are taken from the raw frames as they are.
train_cont_feat, test_cont_feat = train[CONT_COLS], test[CONT_COLS]

In [18]:
# Attach the continuous columns to both feature frames and register them as
# model inputs.
train_feat = pd.concat((train_feat, train_cont_feat), axis="columns")
test_feat = pd.concat((test_feat, test_cont_feat), axis="columns")
use_feat_cols += list(CONT_COLS)

In [19]:
# Show the first rows of the assembled feature frame, transposed.
train_feat.head().transpose()

Unnamed: 0,0,1,2,3,4
id,1.0,2.0,3.0,4.0,6.0
cat0,1.0,2.0,1.0,1.0,1.0
cat1,1.0,2.0,2.0,2.0,1.0
cat2,1.0,1.0,1.0,1.0,1.0
cat3,1.0,1.0,2.0,2.0,1.0
cat4,1.0,1.0,1.0,1.0,1.0
cat5,1.0,2.0,1.0,1.0,2.0
cat6,1.0,1.0,1.0,1.0,1.0
cat7,1.0,1.0,2.0,1.0,1.0
cat8,1.0,2.0,1.0,3.0,1.0


In [20]:
def run_train_and_inference(
    X, X_test, y, use_model, model_params, train_params, seed_list, n_splits, cat_cols=None
):
    """Run seeded K-fold training and collect out-of-fold and test predictions.

    For every seed in ``seed_list`` a shuffled ``KFold`` with ``n_splits``
    folds is built. One ``TreeModel`` is trained per fold; its validation
    predictions fill the out-of-fold array and its test predictions are
    averaged over the folds. Results are then averaged over the seeds.

    Args:
        X: Training features (DataFrame); rows are selected by position.
        X_test: Test features (DataFrame).
        y: Training target, array-like aligned with ``X`` by position.
        use_model: Backend key passed to ``TreeModel`` (``"cat"`` receives the
            seed as ``random_state``, anything else as ``seed``).
        model_params: Model parameters; copied, so the caller's dict is left
            untouched.
        train_params: Extra keyword arguments forwarded to ``TreeModel.train``.
        seed_list: Seeds to repeat the cross-validation with.
        n_splits: Number of folds.
        cat_cols: Optional categorical columns forwarded to ``TreeModel.train``.

    Returns:
        Tuple ``(oof_pred_arr, test_pred_arr, score_df, feature_importances)``:
        seed-averaged out-of-fold predictions, seed- and fold-averaged test
        predictions, a DataFrame with columns ``seed`` / ``fold`` /
        ``rmse score`` (per fold, per-seed ``"oof"`` and a final ``"avg"``
        row), and the per-fold feature importances stacked in one DataFrame.
    """
    # Work on a private copy: the seed is written into the params for every
    # entry of `seed_list` and must not leak into the caller's dict.
    model_params = dict(model_params)
    # KFold yields positional indices, so index the target as a plain array.
    y = np.asarray(y)

    oof_pred_arr = np.zeros(len(X))
    test_pred_arr = np.zeros(len(X_test))
    # Collected per fold and concatenated once at the end (DataFrame.append
    # in a loop is quadratic and no longer exists in pandas 2).
    fi_frames = []
    score_list = []

    for seed in seed_list:
        if use_model == "cat":
            model_params['random_state'] = seed
        else:
            model_params["seed"] = seed
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        tmp_oof_pred = np.zeros(len(X))
        tmp_test_pred = np.zeros(len(X_test))

        for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
            print("*" * 100)
            print(f"Seed: {seed} - Fold: {fold}")
            # `.iloc`, not `.loc`: the fold indices are positions, which only
            # coincide with labels when X has a default RangeIndex.
            X_trn = X.iloc[trn_idx].reset_index(drop=True)
            X_val = X.iloc[val_idx].reset_index(drop=True)
            y_trn = y[trn_idx]
            y_val = y[val_idx]

            model = TreeModel(model_type=use_model)
            with timer(prefix="Model training"):
                model.train(
                    params=model_params, X_train=X_trn, y_train=y_trn,
                    X_val=X_val, y_val=y_val, train_params=train_params, cat_cols=cat_cols
                )
            with timer(prefix="Get Feature Importance"):
                fi_tmp = pd.DataFrame()
                fi_tmp["feature"] = model.feature_names_
                fi_tmp["importance"] = model.feature_importances_
                fi_tmp["fold"] = fold
                fi_tmp["seed"] = seed
                fi_frames.append(fi_tmp)

            with timer(prefix="Predict Valid"):
                val_pred = model.predict(X_val)
                # The notebook's own rmse helper replaces
                # mean_squared_error(..., squared=False), whose `squared`
                # argument was removed from recent scikit-learn releases.
                score = rmse(y_val, val_pred)
                print(f"score: {score:.5f}")
                score_list.append([seed, fold, score])
                tmp_oof_pred[val_idx] = val_pred
                tmp_test_pred += model.predict(X_test)

        oof_score = rmse(y, tmp_oof_pred)
        print(f"oof score: {oof_score:.5f}")
        score_list.append([seed, "oof", oof_score])

        oof_pred_arr += tmp_oof_pred
        # Each fold's model predicted the full test set; average over folds.
        test_pred_arr += tmp_test_pred / n_splits

    oof_pred_arr /= len(seed_list)
    test_pred_arr /= len(seed_list)

    oof_score = rmse(y, oof_pred_arr)
    score_list.append(["avg", "oof", oof_score])
    score_df = pd.DataFrame(
        score_list, columns=["seed", "fold", "rmse score"])

    feature_importances = pd.concat(fi_frames) if fi_frames else pd.DataFrame()

    return oof_pred_arr, test_pred_arr, score_df, feature_importances

In [21]:
# Model inputs: the registered feature columns for train and test, plus the
# training target as an array.
X, X_test = train_feat[use_feat_cols], test_feat[use_feat_cols]
y = train[TGT_COL].values

print(f"train_feat: {X.shape}, test_feat: {X_test.shape}")

train_feat: (300000, 24), test_feat: (200000, 24)


In [22]:
# Copies of the model inputs in which the categorical columns hold the
# original values from the CSVs rather than the ordinal codes
# (presumably meant for the CatBoost model — confirm against its call site).
X_cat = X.copy()
X_cat[CAT_COLS] = train[CAT_COLS]
X_test_cat = X_test.copy()
# Overwrite only the categorical columns. Rebinding X_test_cat to
# test[CAT_COLS] would throw away the copy and drop every continuous
# feature, leaving the test matrix with different columns than X_cat.
X_test_cat[CAT_COLS] = test[CAT_COLS]

In [30]:
# Per-backend model parameters, keyed by the `use_model` / `model_type` string.
MODEL_PARAMS = {
    "lgb": {
        "objective": "root_mean_squared_error",
        "boosting": "gbdt",
        "max_depth": 8,
        "learning_rate": 0.005,
        "colsample_bytree": 0.2,
        "subsample": 0.8,
        "subsample_freq": 6,
        "reg_alpha": 20,
        "min_data_in_leaf": 200,
        "n_jobs": 2,
        "seed": RANDOM_SEED_LIST[0],
        # "device": "gpu",
        # "gpu_device_id": 0
    },
    "xgb": {
        "objective": "reg:squarederror",
        "max_depth": 8,
        "learning_rate": 0.003,
        "colsample_bytree": 0.2,
        "subsample": 0.8,
        "reg_alpha" : 6,
        "min_child_weight": 200,
        "n_jobs": 2,
        "seed": RANDOM_SEED_LIST[0]
    },
    "cat": {
        'loss_function': 'RMSE',
        "max_depth": 4,
        'learning_rate': 0.03,
        # CatBoost documents the 'Poisson' bootstrap as GPU-only, and this
        # config trains on CPU (no task_type set; border_count 512 is above
        # the documented GPU limit). 'Bernoulli' is the CPU-supported
        # bootstrap that also honours `subsample`.
        "bootstrap_type": 'Bernoulli',
        "subsample": 0.8,
        "border_count": 512,
        "l2_leaf_reg": 200,
        'random_state': RANDOM_SEED_LIST[0],
        "thread_count": 2,
        'num_boost_round': 50000,
    }
}
# Per-backend keyword arguments forwarded to the training call
# (TreeModel.train unpacks them into lgb.train / xgb.train / CatBoost.fit).
TRAIN_PARAMS = {
    "lgb": {
        "num_boost_round": 50000,
        "early_stopping_rounds": 200,
        "verbose_eval": 200,
    },
    "xgb": {
        "num_boost_round": 50000,
        "early_stopping_rounds": 200,
        "verbose_eval":  200,
    },
    "cat": {
        'early_stopping_rounds': 200,
        'verbose_eval': 200,
    }
}

In [26]:
# Cross-validated LightGBM run over every seed in RANDOM_SEED_LIST.
oof_pred_lgb, test_pred_lgb, score_lgb, feat_imps_lgb = run_train_and_inference(
    X=X,
    X_test=X_test,
    y=y,
    use_model="lgb",
    model_params=MODEL_PARAMS["lgb"],
    train_params=TRAIN_PARAMS["lgb"],
    seed_list=RANDOM_SEED_LIST,
    n_splits=N_SPLITS,
)

****************************************************************************************************
Seed: 42 - Fold: 0
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3634
[LightGBM] [Info] Number of data points in the train set: 270000, number of used features: 24
[LightGBM] [Info] Start training from score 7.456253
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.872475	valid_1's rmse: 0.876481
[400]	training's rmse: 0.862942	valid_1's rmse: 0.867354
[600]	training's rmse: 0.856622	valid_1's rmse: 0.861418
[800]	training's rmse: 0.852254	valid_1's rmse: 0.857437
[1000]	training's rmse: 0.849097	valid_1's rmse: 0.854685
[1200]	training's rmse: 0.846832	valid_1's rmse: 0.852776
[1400]	training's rmse: 0.844987	valid_1's rmse: 0.851346
[1600]	training's rmse: 0.84349	valid_1's rmse: 0.850222
[1800]	training's rmse: 0.84222	valid_1's rmse: 0.849361
[2000]	training's rmse: 0.841109	valid_1's rmse: 0.848637
[2200

In [27]:
# Display the LightGBM score table returned by run_train_and_inference
# (one row per fold plus the "oof" summary rows).
score_lgb

Unnamed: 0,seed,fold,rmse score
0,42,0,0.844935
1,42,1,0.838465
2,42,2,0.844663
3,42,3,0.842638
4,42,4,0.840566
5,42,5,0.843849
6,42,6,0.841348
7,42,7,0.841212
8,42,8,0.842389
9,42,9,0.838686


In [28]:
# Keep only the out-of-fold summary rows of the LightGBM score table.
score_lgb[score_lgb["fold"] == "oof"]

Unnamed: 0,seed,fold,rmse score
10,42,oof,0.841878
11,avg,oof,0.841878


In [None]:
# Cross-validated XGBoost run over every seed in RANDOM_SEED_LIST.
oof_pred_xgb, test_pred_xgb, score_xgb, feat_imps_xgb = run_train_and_inference(
    X=X,
    X_test=X_test,
    y=y,
    use_model="xgb",
    model_params=MODEL_PARAMS["xgb"],
    train_params=TRAIN_PARAMS["xgb"],
    seed_list=RANDOM_SEED_LIST,
    n_splits=N_SPLITS,
)

****************************************************************************************************
Seed: 42 - Fold: 0
[0]	train-rmse:6.99185	val-rmse:6.99238
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[200]	train-rmse:3.90343	val-rmse:3.9043
[400]	train-rmse:2.26111	val-rmse:2.2626
[600]	train-rmse:1.43599	val-rmse:1.43839
[800]	train-rmse:1.06726	val-rmse:1.07073
[1000]	train-rmse:0.925471	val-rmse:0.929951
[1200]	train-rmse:0.875766	val-rmse:0.881266
[1400]	train-rmse:0.857904	val-rmse:0.864366
[1600]	train-rmse:0.85058	val-rmse:0.85795
[1800]	train-rmse:0.846647	val-rmse:0.854929
[2000]	train-rmse:0.843979	val-rmse:0.853187
[2200]	train-rmse:0.841814	val-rmse:0.851929
[2400]	train-rmse:0.839899	val-rmse:0.850924
[2600]	train-rmse:0.838202	val-rmse:0.850107
[2800]	train-rmse:0.836677	val-rmse:0.849434
[3000]	train-rmse:0.835215	val-rmse:0.848862
[3200]	train-rmse:0.833859	val-rmse:0.8

In [None]:
# Write the submission frame to CSV (the DataFrame index is written as well).
# NOTE(review): `submission` is not assigned in any cell visible here (only
# `smpl_sub` is loaded) — confirm it is built before this cell runs.
submission.to_csv("new_bl_mine.csv", index = True)
# Copy the CSV into Drive; the destination path is relative to the current
# working directory — presumably /content on Colab, TODO confirm.
!cp new_bl_mine.csv "drive/My Drive/"