In [1]:
import pandas as pd
import numpy as np

# the GBM used
import xgboost as xgb

from sklearn.model_selection import KFold

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders

import optuna

In [3]:
# Globals, train-set load and feature bookkeeping.

FILE_TRAIN = "train.csv"
FILE_TEST = "test.csv"

# load train dataset
data_orig = pd.read_csv(FILE_TRAIN)

# add the engineered features (add_features comes from utils.py)
data_extended = add_features(data_orig)

all_columns = data_extended.columns

# cols to be ignored
# atemp and temp are strongly correlated (0.98) we're taking only one
del_columns = ["datetime", "casual", "registered", "temp"]

TARGET = "count"
# NOTE(review): windspeed is listed as categorical although it holds float
# readings; in this cell the split only affects the printed counts, because
# `features` is the sorted union of both lists. Confirm before reusing cat_cols.
cat_cols = ["season", "holiday", "workingday", "weather", "windspeed", "hour", "year"]

# Numerical columns = everything that is not the target, ignored or categorical.
# Derived in DataFrame column order rather than through set arithmetic, so the
# list comes out in the same order on every kernel restart.
non_numerical = {TARGET, *del_columns, *cat_cols}
num_cols = [col for col in all_columns if col not in non_numerical]

# Alphabetical order gives a stable column layout for the feature matrix.
features = sorted(cat_cols + num_cols)

print("All columns:", len(all_columns))
print("Ignored columns:", len(del_columns))
print("Target:", len([TARGET]))
print("Categorical columns:", len(cat_cols))
print("Numerical columns:", len(num_cols))
print("All the features", len(features))

# Working frame: the extended data without the ignored columns.
data_used = data_extended.drop(columns=del_columns)

All columns: 14
Ignored columns: 4
Target: 1
Categorical columns: 7
Numerical columns: 2
All the features 9


In [5]:
# Display the last five rows of the working frame
data_used.tail(n=5)

Unnamed: 0,season,holiday,workingday,weather,atemp,humidity,windspeed,count,hour,year
10881,4,0,1,1,19.695,50,26.0027,336,19,2012
10882,4,0,1,1,17.425,57,15.0013,241,20,2012
10883,4,0,1,1,15.91,61,15.0013,168,21,2012
10884,4,0,1,1,17.425,61,6.0032,129,22,2012
10885,4,0,1,1,16.665,66,8.9981,88,23,2012


In [10]:
# Encode the categorical columns.
#
# The input is rebuilt from data_extended so this cell never feeds on its own
# output: re-running it always fits and applies the encoders on the original
# (un-encoded) values instead of on an already encoded data_used.
data_to_encode = data_extended.drop(del_columns, axis=1)

# fit the encoders (train_encoders comes from utils.py)
le_list = train_encoders(data_to_encode)

# apply the fitted encoders (apply_encoders comes from utils.py)
data_used = apply_encoders(data_to_encode, le_list)

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [11]:
# Feature matrix (columns in `features` order) and target vector as NumPy arrays
x = data_used.loc[:, features].to_numpy()
y = data_used.loc[:, TARGET].to_numpy()

In [12]:
# Create an empty regressor, then load the saved model file into it
model = xgb.XGBRegressor()
model.load_model("xgboost.txt")

In [19]:
# Predict with the loaded model on the feature matrix x
y_pred = model.predict(x)

In [20]:
# Replace every prediction that is not >= 0 with 0; all others are kept as-is
y_pred = np.where(~(y_pred >= 0), 0, y_pred)

In [23]:
# Store the predictions, rounded to whole numbers, next to the observed values
data_used["pred"] = y_pred.round(decimals=0).astype(int)

In [28]:
# Display the first 20 rows to compare `count` with `pred`
data_used.head(n=20)

Unnamed: 0,season,holiday,workingday,weather,atemp,humidity,windspeed,count,hour,year,pred
0,0,0,0,0,14.395,81,0.0,16,0,0,28
1,0,0,0,0,13.635,80,0.0,40,1,0,30
2,0,0,0,0,13.635,80,0.0,32,2,0,24
3,0,0,0,0,14.395,75,0.0,13,3,0,10
4,0,0,0,0,14.395,75,0.0,1,4,0,1
5,0,0,0,1,12.88,75,6.0032,1,5,0,0
6,0,0,0,0,13.635,80,0.0,2,6,0,2
7,0,0,0,0,12.88,86,0.0,3,7,0,0
8,0,0,0,0,14.395,75,0.0,8,8,0,6
9,0,0,0,0,17.425,76,0.0,14,9,0,50


In [26]:
# Number of distinct values in the atemp column
data_used.loc[:, "atemp"].nunique()

60

In [27]:
# Number of distinct values in the windspeed column
data_used.loc[:, "windspeed"].nunique()

28