# Feature Selection

In [None]:
%%bash
# Install the third-party packages this notebook needs that are not assumed
# to be present in the runtime: catboost (models) and Boruta (feature selection).
# NOTE(review): versions are unpinned, so results may drift between runs — consider pinning.
#
# Disabled: rebuild lightgbm with GPU support (kept for reference only).
# pip uninstall lightgbm -y
# pip install lightgbm --install-option=--gpu
pip install catboost Boruta 


In [2]:
%%bash
# Copy every input the notebook reads into /content so later cells can use
# bare file names.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# the mount step is not part of this notebook; confirm before running.

# data: all feather files of the bulldozers project (train/validation splits)
cp /content/drive/MyDrive/Colab\ Notebooks/projects/bulldozers/data/*.feather /content
# pre-trained models: the joblib-serialised model loaded below as the baseline
cp /content/drive/MyDrive/Colab\ Notebooks/projects/bulldozers/models/ordered.joblib /content
# helper files: utils.py, hiwamari.py and mrmr.py, imported as local modules
cp /content/drive/MyDrive/pyism/{utils,hiwamari,mrmr}.py /content

In [7]:
import catboost as cat
import lightgbm as lgb
import numpy as np
import pandas as pd
import joblib as jb
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
import utils


In [4]:
# Feature matrices for the training and validation splits.
xtrn, xval = pd.read_feather("xtrn.feather"), pd.read_feather("xval.feather")

# Targets: the first column of each target file, transformed with log1p so
# that an RMSE computed on these values is an RMSLE on the raw target.
ytrn, yval = (
    np.log1p(frame.iloc[:, 0])
    for frame in (pd.read_feather("ytrn.feather"), pd.read_feather("yval.feather"))
)

In [10]:
# Load the pre-trained baseline model that the feature-importance section
# below inspects.
# Alternative kept for reference: load from CatBoost's native 'cbm' format.
# base_ordr = cat.CatBoostRegressor().load_model('base-ordered.bin', format='cbm')
# SECURITY: joblib.load unpickles the file and can therefore execute arbitrary
# code — only load artefacts from a trusted source.
base_ordr = jb.load('ordered.joblib') 
base_ordr

<catboost.core.CatBoostRegressor at 0x7fd5d98492d0>

In [11]:
# Validation score per experiment name; evaluate() writes into this dict.
scores = {}


def rmse(y, yh):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth values (list, NumPy array, pandas Series, ...).
    yh : array-like
        Predicted values, same shape as ``y``. A scalar is also accepted
        and compared against every element of ``y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y - yh) ** 2))`` computed in floating point.

    Raises
    ------
    ValueError
        If both inputs are arrays and their shapes differ.
    """
    # Coerce to float arrays: plain lists become subtractable, unsigned or
    # narrow integer dtypes can no longer wrap around, and pandas objects are
    # compared positionally instead of being re-aligned on their index.
    y = np.asarray(y, dtype=float)
    yh = np.asarray(yh, dtype=float)
    # Refuse silent broadcasting, e.g. (n,) against (n, 1) would yield an
    # (n, n) difference matrix and a meaningless score.
    if y.ndim and yh.ndim and y.shape != yh.shape:
        raise ValueError(
            f"shape mismatch: y has shape {y.shape}, yh has shape {yh.shape}"
        )
    return np.sqrt(np.mean((y - yh) ** 2))
 
def evaluate(model, name, x=xval, y=yval):
    """Score ``model`` on a hold-out set and remember the result.

    The model predicts on ``x.values``; the RMSE against ``y.values`` is
    stored in the module-level ``scores`` dict under ``name`` and returned.

    Note: the defaults ``xval`` / ``yval`` are bound when this cell runs,
    so re-run the cell after reloading the validation data.
    """
    predictions = model.predict(x.values)
    scores[name] = rmse(y.values, predictions)
    return scores[name]

In [12]:
# Hyper-parameters shared by every CatBoost model retrained in this notebook,
# so the feature subsets are compared under one identical configuration.
params = {
    # objective and the metric reported on the eval set
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    # boosting budget and step size
    'n_estimators': 3000,
    'learning_rate': 0.09125,
    # tree construction
    'sampling_frequency': 'PerTree',
    'max_depth': 10,
    # stop when the eval metric has not improved for this many iterations
    'early_stopping_rounds': 500,
    # L2 regularisation on leaf values
    'l2_leaf_reg': 4.0,
    'grow_policy': 'SymmetricTree',
    # requires a GPU runtime
    'task_type': 'GPU',
    'boosting_type': 'Ordered',
    # Bayesian bootstrap and its temperature
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 1,
}

## Feature Importance

In [13]:
# Ask the pre-trained baseline for its per-feature importance using
# CatBoost's 'PredictionValuesChange' importance type.
imp = base_ordr.get_feature_importance(type='PredictionValuesChange')
# NOTE(review): utils.feature_importances is a project helper not shown here.
# Judging by the 'norm_cummulative' column in its output, the .98 argument is
# presumably a cumulative-importance cut-off — confirm in utils.py.
fi = utils.feature_importances(base_ordr, xtrn.columns, imp, .98)
fi

Unnamed: 0_level_0,importance,rank,norm_cummulative
features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YearMade,26.199362,1,0.261994
fiBaseModel,16.983571,2,0.431829
sale_year,14.265356,3,0.574483
fiProductClassDesc,12.617978,4,0.700663
fiSecondaryDesc,8.168089,5,0.782344
ProductSize,3.60125,6,0.818356
ModelID,2.994664,7,0.848303
Enclosure_EROPS w AC,1.953379,8,0.867837
Drive_System_nan,1.367562,9,0.881512
Enclosure_OROPS,0.973166,10,0.891244


In [14]:
# The index of the importance table holds the retained feature names; keep
# them as a NumPy array so they can be used directly for column selection.
fi_cols = fi.index.to_numpy()
# (number of retained features, number of features available)
print((fi_cols.size, xtrn.shape[1]))
fi_cols

(41, 139)


array(['YearMade', 'fiBaseModel', 'sale_year', 'fiProductClassDesc',
       'fiSecondaryDesc', 'ProductSize', 'ModelID',
       'Enclosure_EROPS w AC', 'Drive_System_nan', 'Enclosure_OROPS',
       'fiModelDescriptor', 'state', 'Tire_Size', 'fiModelSeries',
       'YearMade_outlier', 'sale_dayofyear',
       'Drive_System_Four Wheel Drive', 'Ripper_nan',
       'Ripper_None or Unspecified', 'auctioneerID_freq',
       'ProductGroupDesc_Wheel Loader', 'Hydraulics', 'Coupler_nan',
       'MachineID', 'sale_month', 'MachineHoursCurrentMeter',
       'Scarifier_nan', 'auctioneerID', 'Drive_System_Two Wheel Drive',
       'Pushblock_nan', 'sale_weekofyear', 'UsageBand', 'Blade_Type',
       'Tip_Control_nan', 'Forks_nan', 'datasource_freq',
       'ProductGroupDesc_Track Excavators', 'Enclosure_EROPS',
       'Track_Type_nan', 'MachineID_freq', 'sale_dayofweek'], dtype=object)

In [15]:
# Retrain the shared configuration using only the importance-selected
# columns. The validation split is passed as eval_set so its RMSE is logged
# and drives early stopping.
ordr_fi = cat.CatBoostRegressor(**params)
ordr_fi.fit(
    X=xtrn[fi_cols].values,
    y=ytrn,
    eval_set=[(xval[fi_cols].values, yval)],
    verbose=500,  # print progress every 500 iterations
)

0:	learn: 0.6482158	test: 0.6938546	best: 0.6938546 (0)	total: 122ms	remaining: 6m 4s
500:	learn: 0.2092339	test: 0.2358437	best: 0.2358437 (499)	total: 44.6s	remaining: 3m 42s
1000:	learn: 0.2016242	test: 0.2305493	best: 0.2305289 (997)	total: 1m 26s	remaining: 2m 52s
1500:	learn: 0.1983634	test: 0.2286326	best: 0.2286323 (1499)	total: 2m 7s	remaining: 2m 6s
2000:	learn: 0.1961341	test: 0.2277518	best: 0.2277518 (1999)	total: 2m 48s	remaining: 1m 24s
2500:	learn: 0.1945978	test: 0.2269399	best: 0.2269399 (2500)	total: 3m 30s	remaining: 42s
2999:	learn: 0.1936889	test: 0.2265974	best: 0.2265974 (2999)	total: 4m 10s	remaining: 0us
bestTest = 0.226597426
bestIteration = 2999


<catboost.core.CatBoostRegressor at 0x7fd5d9803090>

In [16]:
# Validation RMSE of the importance-selected model, recorded under 'ordr_fi'.
evaluate(model=ordr_fi, name='ordr_fi', x=xval[fi_cols])

0.22659699073737885

## Hiwamari

In [17]:
from hiwamari import hiwamari

In [18]:
%%time
hiwa_cols = hiwamari(estimator=DecisionTreeRegressor(max_depth=16),
                          X=xtrn, y=ytrn, max_samples=.2,
                          n_iters=25, early_stopping_rounds=5,
                          scale_factor=0.5, noise=None) 
print(hiwa_cols.shape)

  0%|          | 0/25 [00:00<?, ?it/s]

(43,)
CPU times: user 1min 24s, sys: 2.13 s, total: 1min 26s
Wall time: 1min 26s


In [19]:
# Display the column names selected by hiwamari.
hiwa_cols

array(['MachineID', 'ModelID', 'auctioneerID', 'YearMade',
       'MachineHoursCurrentMeter', 'fiBaseModel', 'fiSecondaryDesc',
       'fiModelSeries', 'fiModelDescriptor', 'ProductSize',
       'fiProductClassDesc', 'state', 'ProductGroupDesc_Wheel Loader',
       'ProductGroupDesc_Skid Steer Loaders',
       'ProductGroupDesc_Track Type Tractors',
       'Drive_System_Four Wheel Drive', 'Drive_System_Two Wheel Drive',
       'Drive_System_No', 'Enclosure_EROPS w AC', 'Enclosure_OROPS',
       'Stick_Extended', 'Transmission',
       'Blade_Extension_None or Unspecified', 'Hydraulics',
       'Ripper_None or Unspecified', 'Ripper_Multi Shank',
       'Scarifier_nan', 'Tire_Size', 'Track_Type_Steel',
       'Pattern_Changer_None or Unspecified', 'Blade_Type',
       'YearMade_outlier', 'MachineID_freq', 'ModelID_freq',
       'datasource_freq', 'auctioneerID_freq', 'sale_day', 'sale_year',
       'sale_dayofweek', 'sale_dayofyear', 'sale_days_in_month',
       'sale_weekofyear', 'Machi

In [20]:
# Retrain the shared configuration on the hiwamari-selected columns, with
# the validation split as eval_set for logging and early stopping.
ordr_hiwa = cat.CatBoostRegressor(**params)
ordr_hiwa.fit(
    X=xtrn[hiwa_cols].values,
    y=ytrn,
    eval_set=[(xval[hiwa_cols].values, yval)],
    verbose=500,  # print progress every 500 iterations
)

0:	learn: 0.6481156	test: 0.6942527	best: 0.6942527 (0)	total: 117ms	remaining: 5m 50s
500:	learn: 0.2088162	test: 0.2360975	best: 0.2360975 (500)	total: 47.7s	remaining: 3m 58s
1000:	learn: 0.2010552	test: 0.2306556	best: 0.2306550 (998)	total: 1m 30s	remaining: 3m 1s
1500:	learn: 0.1977786	test: 0.2288538	best: 0.2288534 (1497)	total: 2m 10s	remaining: 2m 10s
2000:	learn: 0.1958438	test: 0.2276244	best: 0.2276237 (1998)	total: 2m 49s	remaining: 1m 24s
2500:	learn: 0.1942496	test: 0.2269697	best: 0.2269593 (2498)	total: 3m 29s	remaining: 41.8s
2999:	learn: 0.1929904	test: 0.2263169	best: 0.2263169 (2999)	total: 4m 10s	remaining: 0us
bestTest = 0.2263169001
bestIteration = 2999


<catboost.core.CatBoostRegressor at 0x7fd5c9387790>

In [21]:
# Validation RMSE of the hiwamari-selected model, recorded under 'ordr_hiwa'.
evaluate(model=ordr_hiwa, name='ordr_hiwa', x=xval[hiwa_cols])

0.22631681833828202

## MRMR

In [22]:
from mrmr import mrmr

In [29]:
%%time
mrmr_cols = mrmr(xtrn, ytrn, K=43, objective='regression', method='rfcq')
print(mrmr_cols.shape)

  0%|          | 0/42 [00:00<?, ?it/s]

(43,)
CPU times: user 49.5 s, sys: 326 ms, total: 49.8 s
Wall time: 49.5 s


In [30]:
# Display the column names selected by mrmr.
mrmr_cols

array(['fiBaseModel', 'YearMade', 'sale_year', 'fiProductClassDesc',
       'fiSecondaryDesc', 'ModelID', 'sale_dayofyear', 'ProductSize',
       'Enclosure_EROPS w AC', 'YearMade_outlier',
       'Drive_System_Two Wheel Drive', 'Drive_System_Four Wheel Drive',
       'sale_day', 'Grouser_Tracks_None or Unspecified', 'fiModelSeries',
       'MachineHoursCurrentMeter', 'Ride_Control_None or Unspecified',
       'Enclosure_OROPS', 'state', 'auctioneerID', 'MachineID',
       'Ripper_None or Unspecified', 'Tire_Size', 'fiModelDescriptor',
       'sale_weekofyear', 'Hydraulics_Flow_Standard', 'Ripper_Yes',
       'ModelID_freq', 'Blade_Type', 'Hydraulics', 'auctioneerID_freq',
       'ProductGroupDesc_Skid Steer Loaders', 'MachineID_freq',
       'ProductGroupDesc_Track Type Tractors', 'sale_week',
       'datasource_freq', 'ProductGroupDesc_Track Excavators',
       'Transmission', 'Drive_System_nan', 'Ripper_nan',
       'Track_Type_Steel', 'ProductGroupDesc_Motor Graders',
       'sale_

In [31]:
# Retrain the shared configuration on the mrmr-selected columns, with the
# validation split as eval_set for logging and early stopping.
ordr_mrmr = cat.CatBoostRegressor(**params)
ordr_mrmr.fit(
    X=xtrn[mrmr_cols].values,
    y=ytrn,
    eval_set=[(xval[mrmr_cols].values, yval)],
    verbose=500,  # print progress every 500 iterations
)



0:	learn: 0.6484915	test: 0.6932722	best: 0.6932722 (0)	total: 114ms	remaining: 5m 41s
500:	learn: 0.2089731	test: 0.2352269	best: 0.2352257 (497)	total: 47.8s	remaining: 3m 58s
1000:	learn: 0.2022832	test: 0.2302760	best: 0.2302760 (1000)	total: 1m 23s	remaining: 2m 47s
1500:	learn: 0.1982341	test: 0.2283050	best: 0.2282936 (1490)	total: 2m 3s	remaining: 2m 3s
2000:	learn: 0.1960394	test: 0.2274127	best: 0.2274113 (1988)	total: 2m 42s	remaining: 1m 20s
2500:	learn: 0.1947719	test: 0.2267411	best: 0.2267411 (2500)	total: 3m 18s	remaining: 39.7s
2999:	learn: 0.1936409	test: 0.2262643	best: 0.2262601 (2964)	total: 3m 59s	remaining: 0us
bestTest = 0.2262600796
bestIteration = 2964
Shrink model to first 2965 iterations.


<catboost.core.CatBoostRegressor at 0x7fd5d7fbaa90>

In [32]:
# Validation RMSE of the mrmr-selected model, recorded under 'ordr_mrmr'.
evaluate(model=ordr_mrmr, name='ordr_mrmr', x=xval[mrmr_cols])

0.22625961864526628

In [33]:
# Validation RMSE of every model evaluated so far, keyed by experiment name.
scores

{'ordr_fi': 0.22659699073737885,
 'ordr_hiwa': 0.22631681833828202,
 'ordr_mrmr': 0.22625961864526628}

In [None]:
# NOTE(review): this cell repeats the previous `scores` display. It has no
# execution count and its saved output shows different numbers, so it
# appears to be left over from another run — consider deleting it or
# re-running the notebook top to bottom.
scores

{'ordr_fi': 0.2268937069984216,
 'ordr_hiwa': 0.22706927124686374,
 'ordr_mrmr': 0.2257056887037802}

## Boruta

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor

# Boruta feature selection around a shallow random forest.
#
# Changes from the first attempt, whose recorded run was interrupted during
# the first iteration:
#   * n_jobs=-1 lets the forest use every CPU core. Boruta refits the forest
#     on the real plus shadow features at every iteration, so the
#     single-threaded default is a likely cause of the slow run.
#   * random_state makes the shadow-feature shuffling, and hence the
#     selection, repeatable.
#
# perc=75 compares each feature against the 75th percentile of the shadow
# importances instead of their maximum, which is less strict. two_step=False
# keeps the original Boruta multiple-testing correction (Bonferroni only).
boruta_selector = BorutaPy(
    RandomForestRegressor(max_depth=4, n_jobs=-1),
    n_estimators=100,
    max_iter=10,
    perc=75,
    verbose=20,
    two_step=False,
    random_state=42,
)

# BorutaPy expects plain NumPy arrays rather than pandas objects.
boruta_selector.fit(xtrn.values, ytrn.values)

Iteration: 	1 / 10
Confirmed: 	0
Tentative: 	138
Rejected: 	0


KeyboardInterrupt: ignored