### LightGBM + ADSTuner: do the ensemble
* two models: one for casual and the other for registered
* added feature engineering
* added year, removed temp
* removing day gave the best results (the range of days in the train set doesn't match the test set)

In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

import seaborn as sns

# to use ADSTuner
from ads.hpo.search_cv import ADSTuner
from ads.hpo.stopping_criterion import *
from ads.hpo.distributions import *

from sklearn.metrics import make_scorer

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders
from utils import show_tuner_results, show_categoricals

# apply seaborn's default look & feel to any figure drawn later in the notebook
sns.set()

import logging

In [2]:
# Notebook-wide constants, followed by loading the training set.

# figure size constant
FIGSIZE = (9, 6)

# K for the K-fold cross-validation in ADSTuner
FOLDS = 5

# time budget, expressed in seconds (30 minutes)
TIME_BUDGET = 30 * 60

# input files
FILE_TRAIN, FILE_TEST = "train.csv", "test.csv"

# raw training data, as read from disk
data_orig = pd.read_csv(FILE_TRAIN)

In [3]:
#
# add features
# add_features comes from utils.py; judging by the output below, the returned
# frame carries the extra columns `hour` and `year` next to the original ones
#
data_extended = add_features(data_orig)

# have a look at the last rows to check the engineered columns
data_extended.tail()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,year
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,19,2012
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,20,2012
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.91,61,15.0013,4,164,168,21,2012
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,22,2012
10885,2012-12-19 23:00:00,4,0,1,1,13.12,16.665,66,8.9981,4,84,88,23,2012


In [4]:
# ok, we will treat as categorical: holiday, hour, season, weather, workingday, year
# (windspeed is kept as a numerical feature, see num_cols)

In [5]:
# keep the complete column list of the extended frame
all_columns = data_extended.columns

# columns left out of the model:
# atemp and temp are strongly correlated (0.98), so only atemp is kept;
# datetime is dropped as well
del_columns = ["datetime", "temp"]

data_used = data_extended.drop(columns=del_columns)

# fit the categorical encoders on the training frame, then apply them to it
# NOTE(review): an earlier comment said windspeed needs a special treatment;
# nothing in this cell handles it - confirm against utils.py
le_list = train_encoders(data_used)
data_used = apply_encoders(data_used, le_list)

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [6]:
# feature groups and targets
cat_cols = ["season", "holiday", "workingday", "weather", "hour", "year"]
num_cols = ["atemp", "humidity", "windspeed"]
target_columns = ["casual", "registered", "count"]

# model inputs, in alphabetical order
features = sorted(cat_cols + num_cols)

# positions of the categorical columns inside `features`;
# these indexes are what gets passed as categorical_feature when fitting
cat_columns_idxs = [pos for pos, name in enumerate(features) if name in cat_cols]

# quick summary of how the columns are split
print("All columns:", len(all_columns))
print("Ignored columns:", len(del_columns))
print("Categorical columns:", len(cat_cols))
print("Numerical columns:", len(num_cols))
print(f"All targets: {len(target_columns)}")
print("All the features", len(features))

All columns: 14
Ignored columns: 2
Categorical columns: 6
Numerical columns: 3
All targets: 3
All the features 9


In [7]:
# Load the hyper-parameters of the two models; each file holds one pickled dict.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
import pickle

# params1 is used for model1
with open("model1.pkl", "rb") as params1_file:
    params1 = pickle.load(params1_file)

print(params1)

# params2 is used for model2
with open("model2.pkl", "rb") as params2_file:
    params2 = pickle.load(params2_file)

print(params2)

{'learning_rate': 0.0037767852662056434, 'max_depth': 8, 'n_estimators': 3000}
{'learning_rate': 0.0052202070749113195, 'max_depth': 10, 'n_estimators': 3000}


In [8]:
%%time
# train model1
TARGET = "registered"

x_train = data_used[features]
y_train = data_used[TARGET]

# train the model with chosen parameters
model1 = lgb.LGBMRegressor(**params1)

model1.fit(x_train, y_train, categorical_feature=cat_columns_idxs)

CPU times: user 11.6 s, sys: 39.2 ms, total: 11.7 s
Wall time: 2.93 s


LGBMRegressor(learning_rate=0.0037767852662056434, max_depth=8,
              n_estimators=3000)

In [9]:
%%time
# train model2
TARGET = "casual"

x_train = data_used[features]
y_train = data_used[TARGET]

# train the model with chosen parameters
model2 = lgb.LGBMRegressor(**params2)

model2.fit(x_train, y_train, categorical_feature=cat_columns_idxs)

CPU times: user 11.1 s, sys: 51.7 ms, total: 11.1 s
Wall time: 2.79 s


LGBMRegressor(learning_rate=0.0052202070749113195, max_depth=10,
              n_estimators=3000)

In [10]:
# Prepare the test set for scoring: the predictions of model1 (registered)
# and model2 (casual) are combined afterwards.
# Same steps as for training: load -> add engineered features -> apply the
# encoders that were fitted on the training data.
test_orig = apply_encoders(add_features(pd.read_csv(FILE_TEST)), le_list)

# scoring matrix: exactly the feature columns, in the order used for training
x_test = test_orig[features]

Coding: season 
Coding: weather 
Coding: year 


In [11]:
# scoring: one prediction vector per model on the same test matrix

# model1 was trained on the "registered" target
score_test1 = model1.predict(x_test)

# model2 was trained on the "casual" target; it must be model2 here,
# otherwise the registered prediction is counted twice and casual is ignored
score_test2 = model2.predict(x_test)

In [12]:
# predicted total count = element-wise sum of the two prediction vectors
score_test = score_test1 + score_test2

In [13]:
# start from the sample submission file; only its "count" column is overwritten
df_sub = pd.read_csv("sampleSubmission.csv")

# remove decimals first, then replace any negative prediction with 0
rounded = np.round(score_test, 0)
df_sub["count"] = np.where(rounded < 0, 0, rounded)

In [14]:
# name of the submission file: "<prefix>.csv"
FILE_SUB_PREFIX = "sub-test14"
FILE_SUB = f"{FILE_SUB_PREFIX}.csv"

# write the submission without the DataFrame index
df_sub.to_csv(FILE_SUB, index=False)

In [15]:
# submit the file through the kaggle CLI; IPython expands $FILE_SUB from the Python variable
!kaggle competitions submit -c "bike-sharing-demand" -f $FILE_SUB -m "adstuner, two models"

100%|████████████████████████████████████████| 163k/163k [00:01<00:00, 93.8kB/s]
Successfully submitted to Bike Sharing Demand