### LightGBM + ADSTuner
* two models, one for casual and the other one for registered
* added feature engineering
* added year, removed temp
* removing day gave the best results (the range of days in the train set doesn't match that of the test set)


In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

import seaborn as sns

# to use ADSTuner
from ads.hpo.search_cv import ADSTuner
from ads.hpo.stopping_criterion import *
from ads.hpo.distributions import *

from sklearn.metrics import make_scorer

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders 
from utils import show_tuner_results, show_categoricals

# set seaborn look&feel
# (applies seaborn's default theme to the plots drawn from here on)
sns.set()

import logging

In [None]:
# --- notebook-wide settings and raw training data ---

# how many folds ADSTuner uses for K-fold cross-validation
FOLDS = 5

# tuning time limit, expressed in seconds (two hours)
TIME_BUDGET = 2 * 60 * 60

# input CSV files
FILE_TRAIN, FILE_TEST = "train.csv", "test.csv"

# raw training set, exactly as read from disk
data_orig = pd.read_csv(FILE_TRAIN)

In [None]:
# build the extended frame: raw data plus the features added by
# add_features (defined in utils.py)
data_extended = add_features(data_orig)

# show the last five rows as a quick sanity check
data_extended.tail(5)

In [None]:
# take a closer look at the columns with low cardinality
# to decide which ones we want to treat as categoricals

# in utils.py
# THR = 100
# NOTE(review): the second argument is presumably the cardinality threshold
# (cf. THR above) -- confirm against show_categoricals in utils.py
show_categoricals(data_extended, 100)

In [None]:
# ok, we will treat as categorical: holiday, hour, season, weather, workingday, year

In [None]:
# keep the complete column list around for the summary printed further down
all_columns = data_extended.columns

# columns left out of the model:
# temp goes away because atemp and temp are strongly correlated (0.98),
# so only one of the two is kept
del_columns = ['datetime', 'temp']

# working frame without the ignored columns
data_used = data_extended.drop(columns=del_columns)

# fit the encoders for the categorical columns, then encode the frame
# (original note: windspeed needs a special treatment)
le_list = train_encoders(data_used)
data_used = apply_encoders(data_used, le_list)

In [None]:
# column roles
cat_cols = ['season', 'holiday', 'workingday', 'weather', 'hour', 'year']
num_cols = ['atemp', 'humidity', 'windspeed']
target_columns = ['casual', 'registered', 'count']

# model inputs, in alphabetical order
features = cat_cols + num_cols
features.sort()

# positions of the categorical columns inside `features`
# (the model is given positional indexes for its categorical features)
cat_columns_idxs = [
    position for position, name in enumerate(features) if name in cat_cols
]

# short summary of how the columns are split
for _label, _cols in (
    ('All columns:', all_columns),
    ('Ignored columns:', del_columns),
    ('Categorical columns:', cat_cols),
    ('Numerical columns:', num_cols),
):
    print(_label, len(_cols))
print(f'All targets: {len(target_columns)}')
print('All the features', len(features))

### ADSTuner session: first model, target = registered

In [None]:
TARGET = 'registered'

#
# Here we define the strategy: the space of hyper-parameters we want to explore.
# The distribution entries are what the tuner searches over;
# `categorical_feature` is given as a plain value holding the positions of
# the categorical columns inside `features`.
#
# NOTE: `use_best_model` has been removed from this dict. It is a CatBoost
# option; LightGBM does not define it, so it has no place in the parameters
# handed to LGBMRegressor.
#
params = {
    'n_estimators': CategoricalDistribution([1000, 2000, 3000, 4000, 5000]),
    'learning_rate': LogUniformDistribution(low=1e-5, high=1e-2),
    'max_depth': IntUniformDistribution(5, 10),
    'categorical_feature': cat_columns_idxs,
}

alg_reg = lgb.LGBMRegressor()

# scorer for ADSTuner built on rmsle (imported from utils.py);
# greater_is_better=False marks it as a loss, so the reported score is negated
scorer = make_scorer(rmsle, greater_is_better=False)

# for the list of available scorers: sorted(sklearn.metrics.SCORERS.keys())
tuner = ADSTuner(alg_reg, cv=FOLDS, strategy=params, scoring=scorer, study_name="study1")

x_train = data_used[features]
y_train = data_used[TARGET]

# search until the time budget (TIME_BUDGET, in seconds) is used up
tuner.tune(x_train, y_train, exit_criterion=[TimeBudget(TIME_BUDGET)])

In [None]:
# get the tuner status to see whether the tuning run has completed
tuner.get_status()

In [None]:
# report the outcome of the tuning session (helper imported from utils.py)
show_tuner_results(tuner)

In [None]:
# keep only the trials that ran to completion and order them by the
# 'value' column (the chosen metric), highest value first so the best
# trial is on top
result_df = (
    tuner.trials
    .loc[lambda trials: trials['state'] == 'COMPLETE']
    .sort_values(by='value', ascending=False)
)

result_df.head()

In [None]:
%%time
# train the model with chosen parameters
model1 = lgb.LGBMRegressor(**tuner.best_params)

model1.fit(x_train, y_train, categorical_feature=cat_columns_idxs)

In [None]:
import pickle

# persist the best hyper-parameters found by the tuner
# (note: the file holds the parameter dict, not the fitted model)
tuner.best_params

with open("model1.pkl", "wb") as params_file:
    pickle.dump(tuner.best_params, params_file)