### LightGBM single-model
* added feature engineering
* added year, removed temp
* removing `day` gave the best results (the range of days in the train set doesn't match the range in the test set)

In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import make_scorer

import seaborn as sns
import matplotlib.pyplot as plt

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders 
from utils import show_tuner_results, show_categoricals

# set seaborn look&feel: apply seaborn's default theme to the plots
# produced from here on
sns.set()

In [2]:
# globals: file names of the train and test datasets
# (this cell only defines the names; nothing is loaded here)

FILE_TRAIN = "train.csv"
FILE_TEST = "test.csv"

In [3]:
# load the raw train dataset into a DataFrame
data_orig = pd.read_csv(FILE_TRAIN)

#
# add engineered features (add_features is defined in utils.py)
# NOTE(review): the output below shows `hour` and `year` columns, presumably
# derived from `datetime` by add_features -- confirm in utils.py
#
data_extended = add_features(data_orig)

# have a look at the last rows (last expression, so it is shown as the cell output)
data_extended.tail()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,year
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,19,2012
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,20,2012
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.91,61,15.0013,4,164,168,21,2012
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,22,2012
10885,2012-12-19 23:00:00,4,0,1,1,13.12,16.665,66,8.9981,4,84,88,23,2012


In [None]:
# The following columns will be treated as categorical: holiday, hour, season, weather, windspeed, workingday, year

In [4]:
# every column available after feature engineering
all_columns = data_extended.columns

# cols to be ignored
# atemp and temp are strongly correlated (0.98) we're taking only one (atemp is kept)
# NOTE(review): in the rows displayed above casual + registered equals count,
# so these two presumably leak the target -- confirm on the full dataset
del_columns = ['datetime', 'casual', 'registered', 'temp']

TARGET = "count"
cat_cols = ['season', 'holiday','workingday', 'weather', 'windspeed', 'hour', 'year']

# fail early (with a readable message) if a configured name is not a real column
configured_cols = set(del_columns) | set(cat_cols) | {TARGET}
missing_cols = configured_cols - set(all_columns)
assert not missing_cols, f"columns not found in data_extended: {sorted(missing_cols)}"

# numerical columns: whatever is neither target, ignored nor categorical.
# Built by filtering all_columns (instead of set arithmetic) so that the
# order is deterministic and follows the DataFrame column order.
num_cols = [col for col in all_columns if col not in configured_cols]

# final feature list, sorted by name so the column order is stable
features = sorted(cat_cols + num_cols)

print('All columns:', len(all_columns))
print('Ignored columns:', len(del_columns))
print('Target:', len([TARGET]))
print('Categorical columns:', len(cat_cols))
print('Numerical columns:', len(num_cols))
print('All the features', len(features))

All columns: 14
Ignored columns: 4
Target: 1
Categorical columns: 7
Numerical columns: 2
All the features 9


In [5]:
# drop ignored columns (drop returns a new frame; data_extended is left untouched)
data_used = data_extended.drop(del_columns, axis=1)

In [6]:
# let's code categorical
# NOTE(review): the original note says windspeed needs a special treatment, but
# the log printed by this cell only lists season, weather and year -- confirm
# in utils.py how windspeed is handled
le_list = train_encoders(data_extended)

# coding
# Rebuild the input from data_extended instead of re-assigning data_used from
# itself: this keeps the cell idempotent, so re-running it never applies the
# encoders to already-encoded values.
data_used = apply_encoders(data_extended.drop(del_columns, axis=1), le_list)

# positional indexes of the categorical columns inside `features`
# (handed to LightGBM as categorical_feature when fitting the model)
cat_columns_idxs = [i for i, col in enumerate(features) if col in cat_cols]

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [9]:
%%time

# hyper-parameters for the regressor
# NOTE(review): presumably the outcome of an earlier tuning run -- confirm
best_params = {
    'learning_rate': 0.0073250798169353214,
    'max_depth': 9,
    'n_estimators': 2000,
}

# training matrix (feature columns only) and target vector
x_train, y_train = data_used[features], data_used[TARGET]

# build the model from the chosen hyper-parameters
model = lgb.LGBMRegressor(**best_params)

# fit on the whole train set, flagging the categorical columns by position;
# kept as the last expression so the fitted estimator is displayed
model.fit(
    x_train,
    y_train,
    categorical_feature=cat_columns_idxs,
)

CPU times: user 8.31 s, sys: 27.8 ms, total: 8.34 s
Wall time: 2.09 s


LGBMRegressor(learning_rate=0.0073250798169353214, max_depth=9,
              n_estimators=2000)

array([14122,  1412, 13029, 11826,  2322,  3732,  4027,  5509,  4021],
      dtype=int32)