### LightGBM + ADSTuner
* added feature engineering
* added year, removed temp
* I got the best results after removing day (the range of days in the train set doesn't match the one in the test set)


In [None]:
import pandas as pd
import numpy as np

import lightgbm as lgb

# to use ADSTuner
from ads.hpo.search_cv import ADSTuner
from ads.hpo.stopping_criterion import *
from ads.hpo.distributions import *

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import make_scorer

import seaborn as sns
import matplotlib.pyplot as plt

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders 
from utils import show_tuner_results, show_categoricals

# apply seaborn's default theme so that the plots drawn below share the same look & feel
sns.set()

In [None]:
# notebook-wide settings

# input files
FILE_TRAIN, FILE_TEST = "train.csv", "test.csv"

# name given to the ADSTuner study
STUDY_NAME = "Bike sharing11"

# K of the K-fold cross validation run by ADSTuner
FOLDS = 7

# time budget for the tuning session, in seconds (60 minutes)
TIME_BUDGET = 60 * 60

In [None]:
# load the train dataset from the CSV file named in FILE_TRAIN
data_orig = pd.read_csv(FILE_TRAIN)

#
# add the engineered features (add_features is defined in utils.py)
#
data_extended = add_features(data_orig)

# have a look at the last rows of the extended dataset
data_extended.tail()

In [None]:
# helper from utils.py; presumably reports the columns having fewer than `thr`
# distinct values, i.e. the candidates to be treated as categorical -- confirm in utils.py
show_categoricals(data_extended, thr=100)

In [None]:
# ok, we will treat as categorical: holiday, hour, season, weather, windspeed, workingday, year

In [None]:
# Split the columns of the extended dataset into ignored / target /
# categorical / numerical groups and build the final list of features.
all_columns = data_extended.columns

# cols to be ignored
# atemp and temp are strongly correlated (0.98): we keep only one, temp is dropped
del_columns = ['datetime', 'casual', 'registered', 'temp']

TARGET = "count"
cat_cols = ['season', 'holiday', 'workingday', 'weather', 'windspeed', 'hour', 'year']

# numerical columns = everything that is not target, ignored or categorical.
# They are collected in dataframe column order rather than through a set
# difference, whose iteration order can change from one interpreter run to
# the next, so that num_cols is reproducible.
excluded_cols = {TARGET, *del_columns, *cat_cols}
num_cols = [col for col in all_columns if col not in excluded_cols]

# sorted, so the order of the columns given to the model is always the same
features = sorted(cat_cols + num_cols)

print('All columns:', len(all_columns))
print('Ignored columns:', len(del_columns))
print('Target:', len([TARGET]))
print('Categorical columns:', len(cat_cols))
print('Numerical columns:', len(num_cols))
print('All the features', len(features))

In [None]:
# working copy of the dataset without the columns listed in del_columns
data_used = data_extended.drop(columns=del_columns)

In [None]:
# encoding of the categorical columns (helpers are in utils.py)
# NOTE: windspeed needs a special treatment
# the encoders are trained on the extended dataset...
le_list = train_encoders(data_extended)

# ...and then applied to the dataset actually used for training
data_used = apply_encoders(data_used, le_list)

# positions, inside `features`, of the categorical columns:
# they are what is passed to LightGBM as categorical_feature
cat_col_names = set(cat_cols)
cat_columns_idxs = [
    position for position, name in enumerate(features) if name in cat_col_names
]

### ADSTuner session

In [None]:
#
# Here we define the strategy, the space for hyper-parameters we want to explore
#
# NOTE: only LightGBM options belong here. `use_best_model` is a CatBoost
# option that LightGBM does not know, so it is not part of the strategy.
params = {
    'n_estimators': CategoricalDistribution([1000, 2000, 3000, 4000, 5000]),
    'learning_rate': LogUniformDistribution(low=1e-4, high=1e-2),
    'max_depth': IntUniformDistribution(5, 10),
    # fixed value (not tuned): positions of the categorical columns in `features`
    'categorical_feature': cat_columns_idxs,
}

alg_reg = lgb.LGBMRegressor()

# define the scorer function for ADSTuner (rmsle is defined in utils.py)
# greater_is_better=False: rmsle is an error, the lower the better
scorer = make_scorer(rmsle, greater_is_better=False)

# to list the available scorers: sorted(sklearn.metrics.SCORERS.keys())
tuner = ADSTuner(alg_reg, cv=FOLDS, strategy=params, scoring=scorer, study_name=STUDY_NAME)

# training data: the feature columns and the target
x_train = data_used[features]
y_train = data_used[TARGET]

# run the search, stopping when the time budget (in seconds) is exhausted
tuner.tune(x_train, y_train, exit_criterion=[TimeBudget(TIME_BUDGET)])

### Analyze trials

In [None]:
# get the status to see if the tuning session is completed
print(f'The tuner status is: {tuner.get_status()}')

# remaining time as reported by the tuner, rounded to one decimal
print(f'Remaining time is: {round(tuner.time_remaining, 1)} sec.')

In [None]:
# keep only the trials that ran to completion...
trials_df = tuner.trials
completed_mask = trials_df['state'] == 'COMPLETE'

# ...and order them by the metric (column `value`), highest value first
result_df = trials_df.loc[completed_mask].sort_values(by='value', ascending=False)

# top 10
result_df.head(10)

In [None]:
# helper from utils.py; presumably prints a summary of the tuning session -- confirm in utils.py
show_tuner_results(tuner)

In [None]:
# ADSTuner built-in plot; presumably shows how the best score evolved over the trials -- see the ADS docs
tuner.plot_best_scores()

### train the model with the best params

In [None]:
%%time

# build a new regressor configured with the best parameters reported by the tuner
model = lgb.LGBMRegressor(**tuner.best_params)

# fit on the same training data used for tuning; categorical columns are given by position
model.fit(x_train, y_train, categorical_feature=cat_columns_idxs)

### Prediction and submission to Kaggle

In [None]:
# load the test dataset from the CSV file named in FILE_TEST
test_orig = pd.read_csv(FILE_TEST)

In [None]:
# same preparation applied to the train set: first the engineered features,
# then the encoders that were trained on the train data
test_orig = apply_encoders(add_features(test_orig), le_list)

# the columns to score on: the same features, in the same order, used for training
x_test = test_orig[features]

In [None]:
# scoring: predictions of the trained model on the test features

score_test = model.predict(x_test)

### prepare submission

In [None]:
# the sample submission file is used as the template for our submission
df_sub = pd.read_csv("sampleSubmission.csv")

In [None]:
# predictions rounded to zero decimals
rounded_counts = np.round(score_test, 0)

# a count cannot be negative: any prediction below zero is replaced with 0
df_sub["count"] = np.where(rounded_counts < 0, 0, rounded_counts)

In [None]:
# name of the submission file: <prefix>.csv
FILE_SUB_PREFIX = "sub-demo-001"
FILE_SUB = f"{FILE_SUB_PREFIX}.csv"

# write the submission without the dataframe index
df_sub.to_csv(path_or_buf=FILE_SUB, index=False)

### Submission

In [None]:
# submit the file named in FILE_SUB to the competition through the Kaggle CLI (shell escape)
!kaggle competitions submit -c "bike-sharing-demand" -f $FILE_SUB -m "sub demo 001, adstuner"

In [None]:
# record the LightGBM version used for this run
print(lgb.__version__)