In [31]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import catboost as cat
import xgboost as xgb

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import make_scorer

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders
from utils import show_tuner_results, show_categoricals

In [32]:
# Input locations (relative to the notebook's working directory).
FILE_TRAIN = "train.csv"
FILE_TEST = "test.csv"

# Raw train/test frames, plus the submission template whose "count" column
# is overwritten with predictions further down.
data_orig = pd.read_csv(FILE_TRAIN)
test_orig = pd.read_csv(FILE_TEST)
df_sub = pd.read_csv("sampleSubmission.csv")

# Training frame enriched by add_features (see utils.py).
data_extended = add_features(data_orig)

TARGET = "count"
all_columns = data_extended.columns

# Columns removed from the training frame before modelling.
del_columns = ["datetime", "casual", "registered", "temp"]
# Columns treated as categorical features.
cat_cols = ["season", "holiday", "workingday", "weather", "windspeed", "hour", "year"]
# Numeric features: whatever is left after removing target, dropped and categorical columns.
num_cols = list(set(all_columns).difference([TARGET], del_columns, cat_cols))

# Sorted so the feature order does not depend on set iteration order.
features = sorted(cat_cols + num_cols)

In [33]:
# Build the test features from the raw CSV instead of from `test_orig`.
# `test_orig` is overwritten at the end of this cell, so deriving it from itself
# would make a second run of the cell feed already-transformed data back into
# add_features / apply_encoders. Re-reading keeps the cell idempotent and gives
# the same result on a fresh "Restart & Run All".
test_extended = add_features(pd.read_csv(FILE_TEST))

# Training frame without the columns listed in del_columns.
data_used = data_extended.drop(columns=del_columns)

# The encoders are fitted on the training frame only; the resulting list is
# then applied to both train and test.
le_list = train_encoders(data_used)

data_used = apply_encoders(data_used, le_list)
test_orig = apply_encoders(test_extended, le_list)

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 
Coding: season 
Coding: weather 
Coding: year 


In [34]:
# Target vector and feature matrix taken from the encoded training frame.
y_train = data_used[TARGET]
x_train = data_used.loc[:, features]

In [35]:
# Test feature matrix, with the same columns and column order as x_train.
x_test = test_orig.loc[:, features]

In [26]:
# LightGBM submission: load the booster from its saved model file.
model_file = "lgboost.txt"

model = lgb.Booster(model_file=model_file)

In [27]:
# Raw (unrounded) predictions of the currently loaded model on the test features.
score_test = model.predict(x_test)

In [28]:
# Turn the raw predictions into valid rental counts.
# Clip BEFORE rounding: rounding first turns small negatives such as -0.4 into
# -0.0, which is not < 0, so it would slip past a "negative" filter and be
# written to the CSV as "-0.0".
count_pred = np.clip(score_test, 0, None)

# Round to the nearest whole number and store as integers, so the submission
# holds counts like 12 rather than 12.0.
df_sub["count"] = np.round(count_pred).astype(int)

In [29]:
# File name for this LightGBM submission.
FILE_SUB_PREFIX = "lgb-opt-003"
FILE_SUB = f"{FILE_SUB_PREFIX}.csv"

# Write the submission without the DataFrame index column.
df_sub.to_csv(FILE_SUB, index=False)

In [30]:
!kaggle competitions submit -c "bike-sharing-demand" -f $FILE_SUB -m "lgb opt 003"

100%|████████████████████████████████████████| 162k/162k [00:02<00:00, 77.6kB/s]
Successfully submitted to Bike Sharing Demand

### CatBoost

In [36]:
# CatBoost submission: restore the saved model into a fresh regressor.
# `model` is bound to the same estimator object as `regr`.
regr = cat.CatBoostRegressor()
regr.load_model("catboost.cbm")
model = regr

In [37]:
# Raw (unrounded) predictions of the currently loaded model on the test features.
score_test = model.predict(x_test)

In [38]:
# Turn the raw predictions into valid rental counts.
# Clip BEFORE rounding: rounding first turns small negatives such as -0.4 into
# -0.0, which is not < 0, so it would slip past a "negative" filter and be
# written to the CSV as "-0.0".
count_pred = np.clip(score_test, 0, None)

# Round to the nearest whole number and store as integers, so the submission
# holds counts like 12 rather than 12.0.
df_sub["count"] = np.round(count_pred).astype(int)

In [39]:
# File name for this CatBoost submission.
FILE_SUB_PREFIX = "cat-opt-002"
FILE_SUB = f"{FILE_SUB_PREFIX}.csv"

# Write the submission without the DataFrame index column.
df_sub.to_csv(FILE_SUB, index=False)

In [40]:
!kaggle competitions submit -c "bike-sharing-demand" -f $FILE_SUB -m "cat opt 002"

100%|████████████████████████████████████████| 162k/162k [00:02<00:00, 70.0kB/s]
Successfully submitted to Bike Sharing Demand

### XGBoost

In [None]:
%%time

# NOTE(review): training is disabled in this cell. Every statement below is
# commented out, so running the cell only executes the %%time magic.
# The XGBoost model used for prediction is loaded from "xgboost.txt" in the
# following cell instead. Uncomment the three statements below to retrain
# with these hyper-parameters.

# best_params = {'learning_rate': 0.0004394857413078558, 'max_depth': 9, 'n_estimators': 5000}

# model = xgb.XGBRegressor(**best_params)

# model.fit(x_train, y_train)

In [16]:
# XGBoost submission: create an empty regressor and restore its state from "xgboost.txt".
model = xgb.XGBRegressor()
model.load_model(fname="xgboost.txt")

In [17]:
# Raw (unrounded) predictions of the currently loaded model on the test features.
score_test = model.predict(x_test)

In [18]:
# Turn the raw predictions into valid rental counts.
# Clip BEFORE rounding: rounding first turns small negatives such as -0.4 into
# -0.0, which is not < 0, so it would slip past a "negative" filter and be
# written to the CSV as "-0.0".
count_pred = np.clip(score_test, 0, None)

# Round to the nearest whole number and store as integers, so the submission
# holds counts like 12 rather than 12.0.
df_sub["count"] = np.round(count_pred).astype(int)

In [19]:
# File name for this XGBoost submission.
FILE_SUB_PREFIX = "xgb-opt-006"
FILE_SUB = f"{FILE_SUB_PREFIX}.csv"

# Write the submission without the DataFrame index column.
df_sub.to_csv(FILE_SUB, index=False)

In [20]:
!kaggle competitions submit -c "bike-sharing-demand" -f $FILE_SUB -m "xgb opt 006"

100%|████████████████████████████████████████| 162k/162k [00:02<00:00, 74.4kB/s]
Successfully submitted to Bike Sharing Demand