## Gradient Boosting

#### Implemetations
* [XGBoost](https://github.com/dmlc/xgboost)
* [LightGBM](https://github.com/Microsoft/LightGBM)
* [CatBoost](https://github.com/catboost/catboost)

In [2]:
import pandas as pd
import numpy as np
np.random.seed(2017)

from sklearn.datasets import make_regression, make_classification
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, f1_score

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import xgboost as xgb
import lightgbm as lgb

import warnings
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', r"compiletime version 3.5 of module '_catboost' does not match runtime version 3.6")

    import catboost as ctb

# Regression

Generating data 10000 rows i 100 cols (features).

In [3]:
X, y = make_regression(n_samples=10000, n_features=100, random_state=2017)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## DecisionTreeRegressor and RandomForestRegressor
for comparison

In [4]:
model = DecisionTreeRegressor(random_state=2017)

%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

CPU times: user 748 ms, sys: 0 ns, total: 748 ms
Wall time: 747 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.73 ms


0.41278604022033405

In [5]:
model = RandomForestRegressor(random_state=2017)

%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

CPU times: user 4.39 s, sys: 4 ms, total: 4.4 s
Wall time: 4.39 s
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 9.35 ms


0.74578295159570329

## XGBRegressor

In [6]:
model = xgb.XGBRegressor(seed=2017)
model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=2017,
       silent=True, subsample=1)

In [7]:
%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

CPU times: user 3.66 s, sys: 0 ns, total: 3.66 s
Wall time: 3.66 s
CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 14.9 ms


0.93346519538224304

## LGBMRegressor

In [8]:
model = lgb.LGBMRegressor(n_estimators=100, random_state=2017) #xgb domyslnie ma 100
model

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
       max_bin=255, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=2017,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [9]:
%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

CPU times: user 2.5 s, sys: 28 ms, total: 2.53 s
Wall time: 2.53 s
CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 34.9 ms


0.96352677264544873

## CatBoostRegressor

In [10]:
model = ctb.CatBoostRegressor()

%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)

r2_score(y_test, y_pred)

CPU times: user 1min 19s, sys: 896 ms, total: 1min 20s
Wall time: 12.8 s
CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 35.6 ms


0.98782613618634685

# Classification

Generating data 10000 rows i 100 cols (features).

In [11]:
X, y = make_classification(n_samples=10000, n_features=100, random_state=2017)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## DecisionTreeClassifier i RandomForestClassifier
Just checking

In [12]:
model = DecisionTreeClassifier(random_state=2017)

%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

CPU times: user 1.26 s, sys: 0 ns, total: 1.26 s
Wall time: 1.26 s
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.17 ms


0.92010832769126605

In [13]:
model = RandomForestClassifier(random_state=2017)

%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

CPU times: user 736 ms, sys: 4 ms, total: 740 ms
Wall time: 740 ms
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 6.33 ms


0.92617449664429541

## XGBClassifier

In [14]:
model = xgb.XGBClassifier(seed=2017)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=2017,
       silent=True, subsample=1)

In [15]:
%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

CPU times: user 3.69 s, sys: 4 ms, total: 3.69 s
Wall time: 3.69 s
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 11.5 ms


0.9429530201342281

## LGBMClassifier

In [None]:
model = lgb.LGBMClassifier(n_estimators=100, random_state=2017) #xgb by defalut has 100
model

In [17]:
%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

CPU times: user 2.14 s, sys: 12 ms, total: 2.15 s
Wall time: 2.15 s
CPU times: user 28 ms, sys: 0 ns, total: 28 ms
Wall time: 26.3 ms


0.94375210508588747

## CatBoostClassifier

In [18]:
model = ctb.CatBoostClassifier()

%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)

f1_score(y_test, y_pred)

CPU times: user 1min 32s, sys: 3.65 s, total: 1min 36s
Wall time: 17.2 s
CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 39.6 ms


0.94283792871553462

## Hand tunning hyperparameters

In [31]:
df = pd.read_hdf('../input/train.adult.h5')

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


%matplotlib inline

In [33]:
df = df.fillna(-1)
cat_feats = df.select_dtypes(include=[np.object]).columns

for cat_feat in cat_feats:
    df['{0}_cat'.format(cat_feat)] = pd.factorize( df[cat_feat] )[0]

df['Sex_cat'] = df['Sex'].map(lambda x: int(x=True))
    
train = df.select_dtypes(include=[np.int8, np.int16, np.int64])
train.info()
feats = train.columns.values
feats = list(feats)
feats.remove('Target_cat')
feats.remove('Education_cat')
feats = np.array(feats)

def train_and_predict(model, X_train, y_train, X_test, y_test, success_metric=accuracy_score):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return success_metric(y_test, y_pred)

models = [
    xgb.XGBClassifier(seed=2017),
    lgb.LGBMClassifier(n_estimators=100, random_state=2017),
    ctb.CatBoostClassifier()
]

X = train[feats]
y = train['Target_cat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 15 columns):
Age                   32561 non-null int8
fnlwgt                32561 non-null int64
Education-Num         32561 non-null int64
Capital Gain          32561 non-null int16
Capital Loss          32561 non-null int16
Hours per week        32561 non-null int8
Workclass_cat         32561 non-null int64
Education_cat         32561 non-null int64
Martial Status_cat    32561 non-null int64
Occupation_cat        32561 non-null int64
Relationship_cat      32561 non-null int64
Race_cat              32561 non-null int64
Country_cat           32561 non-null int64
Target_cat            32561 non-null int64
Sex_cat               32561 non-null int64
dtypes: int16(2), int64(11), int8(2)
memory usage: 3.2 MB


In [34]:
scores = []
for model in models:
    score = train_and_predict(model, X_train, y_train, X_test, y_test)
    scores.append((score, model))
scores

[(0.86375268707134811,
  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
         colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
         max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
         n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
         reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=2017,
         silent=True, subsample=1)),
 (0.87501279557784828,
  LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
          max_bin=255, max_depth=-1, min_child_samples=20,
          min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
          n_jobs=-1, num_leaves=31, objective=None, random_state=2017,
          reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
          subsample_for_bin=200000, subsample_freq=1)),
 (0.87347732623605279, <catboost.core.CatBoostClassifier at 0x7f8a2d7cb978>)]