# Setup

## Install packages

In [1]:
%%bash
pip uninstall lightgbm -y
pip install lightgbm --install-option=--gpu
pip install catboost

Found existing installation: lightgbm 2.2.3
Uninstalling lightgbm-2.2.3:
  Successfully uninstalled lightgbm-2.2.3
Collecting lightgbm
  Downloading lightgbm-3.3.2.tar.gz (1.5 MB)
Skipping wheel build for lightgbm, due to binaries being disabled for it.
Installing collected packages: lightgbm
    Running setup.py install for lightgbm: started
    Running setup.py install for lightgbm: finished with status 'done'
Successfully installed lightgbm-3.3.2
Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
Installing collected packages: catboost
Successfully installed catboost-1.0.4


  cmdoptions.check_install_build_global(options)


In [2]:
import lightgbm as lgb
import catboost as cat
print(f"LightGBM version: {lgb.__version__}")
print(f"Catboost version: {cat.__version__}")

LightGBM version: 3.3.2
Catboost version: 1.0.4


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

## Data and Helpers

In [4]:
%%bash
cp /content/drive/MyDrive/Colab\ Notebooks/projects/bulldozers/data/*.feather /content
cp /content/drive/MyDrive/pyism/{utils,hiwamari,mrmr}.py /content

In [5]:
xtrn = pd.read_feather("xtrn.feather")
xval = pd.read_feather("xval.feather")
ytrn = pd.read_feather("ytrn.feather").iloc[:,0]
yval = pd.read_feather("yval.feather").iloc[:,0]

In [6]:
xtrn.shape, ytrn.shape

((401125, 99), (401125,))

In [7]:
xval.shape, yval.shape

((11573, 99), (11573,))

## Metric

In [8]:
# yh - yhat: predicted values
# root-mean-squared error
scores = dict()
def rmse(y, yh):
    return np.sqrt(np.mean((y - yh)**2))
 
def evaluate(model, name, x=xval, y=yval):
    yh = model.predict(x.values)
    s = rmse(y.values, yh)
    scores[name] = s
    return s

# Modeling

## Bagging

In [9]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

### `ExtraTrees`

In [10]:
extra_trees = ExtraTreesRegressor(n_estimators=100, max_features=.5, 
                                  min_samples_leaf=3, min_samples_split=5,
                                  n_jobs=-1)

In [11]:
extra_trees.fit(xtrn.values, ytrn.values)

ExtraTreesRegressor(max_features=0.5, min_samples_leaf=3, min_samples_split=5,
                    n_jobs=-1)

In [12]:
evaluate(extra_trees, 'extra_trees')

0.2227654047413686

### `RandomForest`



In [13]:
random_forest = RandomForestRegressor(n_estimators=100, max_features=0.5, max_samples=0.5, 
                                      min_samples_leaf=3, min_samples_split=5, n_jobs=-1)

In [14]:
random_forest.fit(xtrn.values, ytrn.values)

RandomForestRegressor(max_features=0.5, max_samples=0.5, min_samples_leaf=3,
                      min_samples_split=5, n_jobs=-1)

In [15]:
evaluate(random_forest,'random_forest')

0.22623578231355285

## Boosting

### Sklearn's Gradient Boost

In [16]:
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor

#### Vanila

In [17]:
vanila_params = dict(
    loss='squared_error', learning_rate=0.05, n_estimators=500, 
    subsample=.7, max_features="log2",
    min_samples_leaf=9, max_depth=16, 
    verbose=1, validation_fraction=0.1, n_iter_no_change=20,
)

In [18]:
vanila = GradientBoostingRegressor(**vanila_params, random_state=0)
vanila.fit(xtrn.values, ytrn.values)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.4441           0.0365            5.03m
         2           0.4120           0.0319            4.95m
         3           0.3824           0.0298            4.83m
         4           0.3552           0.0265            4.83m
         5           0.3306           0.0246            4.83m
         6           0.3093           0.0211            4.79m
         7           0.2886           0.0200            4.84m
         8           0.2694           0.0192            4.82m
         9           0.2519           0.0174            4.78m
        10           0.2363           0.0153            4.76m
        20           0.1346           0.0064            4.64m
        30           0.0914           0.0028            4.50m
        40           0.0692           0.0017            4.47m
        50           0.0584           0.0010            4.37m
        60           0.0515           0.0006            4.26m
       

GradientBoostingRegressor(learning_rate=0.05, max_depth=16, max_features='log2',
                          min_samples_leaf=9, n_estimators=500,
                          n_iter_no_change=20, random_state=0, subsample=0.7,
                          verbose=1)

In [19]:
evaluate(vanila, 'vanila')

0.22125514838021532

#### Histogram-based 

In [20]:
histo_params = vanila_params.copy()
[histo_params.pop(k) for k in ['n_estimators', 'subsample', 'max_features']]
histo_params.update(
    dict(
        max_iter=2000,
        learning_rate=.1,
        min_samples_leaf=15, 
        l2_regularization=3.75,
        verbose=500
    )
)

In [21]:
histo = HistGradientBoostingRegressor(**histo_params, random_state=0)
histo.fit(xtrn.values,ytrn.values)

Binning 0.286 GB of training data: 1.135 s
Binning 0.032 GB of validation data: 0.056 s
Fitting gradient boosted rounds:
[1/2000] 1 tree, 31 leaves, max depth = 7, train loss: 0.20284, val loss: 0.20309, in 0.120s
[2/2000] 1 tree, 31 leaves, max depth = 7, train loss: 0.17220, val loss: 0.17229, in 0.121s
[3/2000] 1 tree, 31 leaves, max depth = 7, train loss: 0.14725, val loss: 0.14724, in 0.124s
[4/2000] 1 tree, 31 leaves, max depth = 7, train loss: 0.12678, val loss: 0.12667, in 0.119s
[5/2000] 1 tree, 31 leaves, max depth = 7, train loss: 0.11017, val loss: 0.11002, in 0.119s
[6/2000] 1 tree, 31 leaves, max depth = 7, train loss: 0.09653, val loss: 0.09631, in 0.123s
[7/2000] 1 tree, 31 leaves, max depth = 7, train loss: 0.08545, val loss: 0.08520, in 0.119s
[8/2000] 1 tree, 31 leaves, max depth = 7, train loss: 0.07629, val loss: 0.07600, in 0.137s
[9/2000] 1 tree, 31 leaves, max depth = 7, train loss: 0.06879, val loss: 0.06847, in 0.114s
[10/2000] 1 tree, 31 leaves, max depth = 7

HistGradientBoostingRegressor(l2_regularization=3.75, max_depth=16,
                              max_iter=2000, min_samples_leaf=15,
                              n_iter_no_change=20, random_state=0, verbose=500)

In [22]:
evaluate(histo, 'histo')

0.22268901301713492

### LightGBM

#### GBDT - Gradient Boosting Decision Tree

In [23]:
gbdt_params = dict(
    objective='regression', metric='rmse', 
    boosting='gbdt', n_estimators=5000, learning_rate=.1, 
    max_depth=20, min_data_in_leaf=35, feature_fraction=.7,
    early_stopping_rounds=500, device_type='gpu',
    lambda_l1=3, lambda_l2=2,
)

In [24]:
gbdt = lgb.LGBMRegressor(**gbdt_params, n_jobs=-1)
gbdt.fit(xtrn.values, ytrn.values,
         eval_set=(xval.values, yval.values),
         eval_metric='rmse', callbacks=[lgb.log_evaluation(500)])

[500]	valid_0's rmse: 0.220559
[1000]	valid_0's rmse: 0.218114
[1500]	valid_0's rmse: 0.217086
[2000]	valid_0's rmse: 0.21728


LGBMRegressor(boosting='gbdt', device_type='gpu', early_stopping_rounds=500,
              feature_fraction=0.7, lambda_l1=3, lambda_l2=2, max_depth=20,
              metric='rmse', min_data_in_leaf=35, n_estimators=5000,
              objective='regression')

In [25]:
evaluate(gbdt, 'gbdt')

0.21698090573203388

#### GOSS - Gradient One-Side Sampling

In [26]:
gbdt

LGBMRegressor(boosting='gbdt', device_type='gpu', early_stopping_rounds=500,
              feature_fraction=0.7, lambda_l1=3, lambda_l2=2, max_depth=20,
              metric='rmse', min_data_in_leaf=35, n_estimators=5000,
              objective='regression')

In [27]:
goss_params = gbdt_params.copy()
goss_params.update(
    dict(
        lambda_l1=5, lambda_l2=3,
        boosting='goss', 
        top_rate=.85, other_rate=.085
    )
)

In [28]:
goss = lgb.LGBMRegressor(**goss_params, n_jobs=-1)
goss.fit(xtrn.values, ytrn,
         eval_set=(xval.values, yval),
         eval_metric='rmse', callbacks=[lgb.log_evaluation(500)])


[500]	valid_0's rmse: 0.221294
[1000]	valid_0's rmse: 0.218807
[1500]	valid_0's rmse: 0.218129
[2000]	valid_0's rmse: 0.217251
[2500]	valid_0's rmse: 0.218193


LGBMRegressor(boosting='goss', device_type='gpu', early_stopping_rounds=500,
              feature_fraction=0.7, lambda_l1=5, lambda_l2=3, max_depth=20,
              metric='rmse', min_data_in_leaf=35, n_estimators=5000,
              objective='regression', other_rate=0.085, top_rate=0.85)

In [29]:
evaluate(goss, 'goss')

0.21718737072615607

#### DART - Dropouts meet Multiple Additive Regression Trees

In [30]:
np.arange(0, 1500, 200) * .2

array([  0.,  40.,  80., 120., 160., 200., 240., 280.])

In [31]:
dart_params = dict(
    n_estimators=1500,
    bagging=.7, bagging_freq=5, 
    boosting='dart', skip_drop=.8, drop_rate=.2,
    learning_rate=.15, device_type='gpu'
)

In [32]:
dart = lgb.LGBMRegressor(**dart_params)
dart.fit(xtrn.values, ytrn,
         eval_set=(xval.values, yval),
         eval_metric='rmse', callbacks=[lgb.log_evaluation(100)])


[100]	valid_0's rmse: 0.524319	valid_0's l2: 0.27491
[200]	valid_0's rmse: 0.242334	valid_0's l2: 0.0587258
[300]	valid_0's rmse: 0.24293	valid_0's l2: 0.0590149
[400]	valid_0's rmse: 0.22073	valid_0's l2: 0.0487219
[500]	valid_0's rmse: 0.246653	valid_0's l2: 0.0608375
[600]	valid_0's rmse: 0.232447	valid_0's l2: 0.0540318
[700]	valid_0's rmse: 0.225686	valid_0's l2: 0.0509341
[800]	valid_0's rmse: 0.263567	valid_0's l2: 0.0694677
[900]	valid_0's rmse: 0.219482	valid_0's l2: 0.0481724
[1000]	valid_0's rmse: 0.218368	valid_0's l2: 0.0476844
[1100]	valid_0's rmse: 0.219795	valid_0's l2: 0.0483097
[1200]	valid_0's rmse: 0.218403	valid_0's l2: 0.0477
[1300]	valid_0's rmse: 0.246175	valid_0's l2: 0.0606023
[1400]	valid_0's rmse: 0.218829	valid_0's l2: 0.0478863
[1500]	valid_0's rmse: 0.219894	valid_0's l2: 0.0483534


LGBMRegressor(bagging=0.7, bagging_freq=5, boosting='dart', device_type='gpu',
              drop_rate=0.2, learning_rate=0.15, n_estimators=1500,
              skip_drop=0.8)

In [33]:
evaluate(dart, 'dart')

0.21989399270551177

### Catboost

#### Plain Boosting

In [34]:
plain_params = dict(
    boosting_type='Plain', loss_function='RMSE',
    eval_metric='RMSE', n_estimators=1700, learning_rate=.0955,
    bootstrap_type='Bernoulli', subsample=.8, 
    sampling_frequency='PerTree', max_depth=10,
    early_stopping_rounds=500, l2_leaf_reg=4.,
    grow_policy='SymmetricTree', # rsm=.8,
    task_type='GPU'
)

In [35]:
plain = cat.CatBoostRegressor(**plain_params)
plain.fit(xtrn.values, ytrn.values,
          eval_set=[(xval.values,yval.values)],
          verbose=500)

0:	learn: 0.6415431	test: 0.6825166	best: 0.6825166 (0)	total: 11ms	remaining: 18.7s
500:	learn: 0.1945893	test: 0.2220736	best: 0.2220736 (500)	total: 5.12s	remaining: 12.2s
1000:	learn: 0.1809045	test: 0.2210435	best: 0.2206357 (822)	total: 10.3s	remaining: 7.19s
bestTest = 0.2206356644
bestIteration = 822
Shrink model to first 823 iterations.


<catboost.core.CatBoostRegressor at 0x7fd3a9805b50>

In [36]:
evaluate(plain, 'plain')

0.22063449773087107

#### Ordered Boosting

In [37]:
ordered_params = plain_params.copy()
ordered_params.pop('subsample')
ordered_params.update(
    dict(
        n_estimators=3000, learning_rate=.09125,
        boosting_type='Ordered',
        bootstrap_type='Bayesian',  bagging_temperature=1,
    )
)

In [38]:
ordered = cat.CatBoostRegressor(**ordered_params)
ordered.fit(xtrn.values, ytrn,
          eval_set=[(xval.values,yval)],
          verbose=500)

0:	learn: 0.6436996	test: 0.6850129	best: 0.6850129 (0)	total: 112ms	remaining: 5m 36s
500:	learn: 0.2036530	test: 0.2237036	best: 0.2235208 (484)	total: 47.7s	remaining: 3m 57s
1000:	learn: 0.1971936	test: 0.2216782	best: 0.2216782 (1000)	total: 1m 31s	remaining: 3m 3s
1500:	learn: 0.1947656	test: 0.2211298	best: 0.2211273 (1499)	total: 2m 13s	remaining: 2m 12s
2000:	learn: 0.1931106	test: 0.2205737	best: 0.2205116 (1940)	total: 2m 55s	remaining: 1m 27s
2500:	learn: 0.1917359	test: 0.2201448	best: 0.2201335 (2499)	total: 3m 38s	remaining: 43.5s
2999:	learn: 0.1906247	test: 0.2199335	best: 0.2197736 (2873)	total: 4m 20s	remaining: 0us
bestTest = 0.2197735559
bestIteration = 2873
Shrink model to first 2874 iterations.


<catboost.core.CatBoostRegressor at 0x7fd366942490>

In [39]:
evaluate(ordered, 'ordered')

0.21977943878522724

## Review 

In [40]:
scores

{'dart': 0.21989399270551177,
 'extra_trees': 0.2227654047413686,
 'gbdt': 0.21698090573203388,
 'goss': 0.21718737072615607,
 'histo': 0.22268901301713492,
 'ordered': 0.21977943878522724,
 'plain': 0.22063449773087107,
 'random_forest': 0.22623578231355285,
 'vanila': 0.22125514838021532}

In [41]:
pd.DataFrame(dict(model=scores.keys(), performance=scores.values())).sort_values(by=['performance'])

Unnamed: 0,model,performance
4,gbdt,0.216981
5,goss,0.217187
8,ordered,0.219779
6,dart,0.219894
7,plain,0.220634
2,vanila,0.221255
3,histo,0.222689
0,extra_trees,0.222765
1,random_forest,0.226236


In [None]:
pd.DataFrame(dict(model=scores.keys(), performance=scores.values())).sort_values(by=['performance'])

Unnamed: 0,model,performance
8,ordered,0.225806
4,gbdt,0.225815
5,goss,0.226093
6,dart,0.226623
3,histo,0.229107
7,plain,0.22948
1,random_forest,0.235887
0,extra_trees,0.241843
2,vanila,0.24203


In [None]:
ordered.save_model('base-ordered.bin', format='cbm')

In [None]:
scores.keys()

dict_keys(['extra_trees', 'random_forest', 'vanila', 'histo', 'gbdt', 'goss', 'dart', 'plain', 'ordered'])

In [None]:
import joblib as jb
models = [extra_trees, random_forest, vanila, histo, gbdt, goss, dart, plain, ordered]
for name, model in zip(scores.keys(), models):
    jb.dump(model, name+".joblib")


In [None]:
!cp /content/*.joblib /content/drive/MyDrive/Colab\ Notebooks/projects/bulldozers/models