### Catboost3
* added feature engineering
* added year, removed temp
* removing `day` gave the best result so far (0.4998): the range of days in the train set doesn't match that of the test set

In [1]:
import pandas as pd
import numpy as np
import catboost as cat

from sklearn.model_selection import KFold

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Notebook-wide configuration, gathered in a single cell.

# Path of the training CSV read in the loading cell.
FILE_TRAIN = "train.csv"

# Seed passed to the K-fold splitter so the shuffled folds are reproducible.
SEED = 42

# Figure size tuple (no plotting cell is visible in this notebook).
FIGSIZE = 9, 6

In [3]:
# Load the raw training data.
data_orig = pd.read_csv(FILE_TRAIN)

# Feature engineering: parse the timestamp column once and derive the
# calendar features from it. Two features are added: hour and year.
#  - month is not added: it is effectively a duplicate of season.
#  - day is not added either (it was removed on purpose).
parsed_datetime = pd.to_datetime(data_orig['datetime'])
data_orig = data_orig.assign(
    datetime=parsed_datetime,
    hour=parsed_datetime.dt.hour,
    year=parsed_datetime.dt.year,
)

In [4]:
# Name of the column to predict.
TARGET = "count"

# Columns to ignore.
# atemp and temp are strongly correlated (0.98), so only one of them (atemp) is kept.
del_columns = ['datetime', 'casual', 'registered', 'temp']

# Columns handled as categorical features.
cat_cols = ['season', 'holiday','workingday', 'weather', 'hour', 'year']

all_columns = data_orig.columns

# Numerical columns: whatever is left once the target, the ignored columns
# and the categorical columns are taken out.
num_cols = list(set(all_columns).difference([TARGET], del_columns, cat_cols))

# Sorted so that the feature order does not depend on set iteration order.
features = sorted(cat_cols + num_cols)

# Summary of how the columns were partitioned (label, group of columns).
report = [
    ('Tutte le colonne:', all_columns),
    ('Colonne ignorate:', del_columns),
    ('target:', [TARGET]),
    ('Colonne cat:', cat_cols),
    ('Colonne num:', num_cols),
    ('Num. features', features),
]
for label, group in report:
    print(label, len(group))

Tutte le colonne: 14
Colonne ignorate: 4
target: 1
Colonne cat: 6
Colonne num: 3
Num. features 9


In [5]:
# Drop the ignored columns; the target column stays alongside the features.
data_used = data_orig.drop(columns=del_columns)

In [6]:
# CatBoost is given the categorical features as column indexes, so collect
# the position of every categorical column within `features`.
cat_columns_idxs = []
for position, column_name in enumerate(features):
    if column_name in cat_cols:
        cat_columns_idxs.append(position)

In [7]:
%%time

FOLDS = 5

skf = KFold(n_splits = FOLDS, shuffle=True, random_state = SEED)

params = {'iterations':6000,
          'learning_rate':0.005,
          'depth':10,
          'task_type':"GPU",
          'use_best_model': True
         }

best_models = []

i = 1
for train_idx, valid_idx in skf.split(data_used):
    print()
    print('Processing fold:', i)
    
    data_train = data_used.iloc[train_idx]
    data_valid = data_used.iloc[valid_idx]
    
    x_train = data_train[features]
    y_train = data_train[TARGET]

    x_valid = data_valid[features]
    y_valid = data_valid[TARGET]
    
    model = cat.CatBoostRegressor(**params)

    model.fit(x_train, y_train, cat_columns_idxs, verbose=500, early_stopping_rounds=50, eval_set=(x_valid, y_valid))
    
    best_models.append(model)
    
    i  += 1


Processing fold: 1
0:	learn: 180.4119412	test: 181.0975720	best: 181.0975720 (0)	total: 49.3ms	remaining: 4m 55s
500:	learn: 70.8799440	test: 69.9929683	best: 69.9929683 (500)	total: 32.4s	remaining: 5m 55s
1000:	learn: 54.7896277	test: 53.9784907	best: 53.9784907 (1000)	total: 1m 8s	remaining: 5m 40s
1500:	learn: 50.4958009	test: 50.2592978	best: 50.2592978 (1500)	total: 1m 42s	remaining: 5m 7s
2000:	learn: 48.7285965	test: 48.8938314	best: 48.8938314 (2000)	total: 2m 15s	remaining: 4m 31s
2500:	learn: 47.5487227	test: 48.0098585	best: 48.0098585 (2500)	total: 2m 50s	remaining: 3m 58s
3000:	learn: 46.8291199	test: 47.4904514	best: 47.4904514 (3000)	total: 3m 23s	remaining: 3m 23s
3500:	learn: 46.0732750	test: 46.9403990	best: 46.9403990 (3500)	total: 3m 54s	remaining: 2m 47s
4000:	learn: 45.3761293	test: 46.4503077	best: 46.4503077 (4000)	total: 4m 25s	remaining: 2m 12s
4500:	learn: 44.8811527	test: 46.1060282	best: 46.1060282 (4500)	total: 4m 56s	remaining: 1m 38s
5000:	learn: 44.40

### Submission

In [8]:
# Load the test set, i.e. the rows the submission is predicted for.
FILE_TEST = "test.csv"
test_orig = pd.read_csv(FILE_TEST)

In [9]:
# Apply to the test set the same feature engineering used on the training
# set: parse the timestamp once, then derive hour and year (no month, no day).
test_datetime = pd.to_datetime(test_orig['datetime'])
test_orig = test_orig.assign(
    datetime=test_datetime,
    hour=test_datetime.dt.hour,
    year=test_datetime.dt.year,
)

In [10]:
# Data on which to do the scoring: same feature columns, same order as in training.
x_test = test_orig[features]

# Fail loudly rather than silently producing an all-zero submission when the
# training cell has not populated best_models.
if not best_models:
    raise RuntimeError("best_models is empty: run the training cell first")

# Average over the models that actually exist. Dividing by FOLDS would
# silently mis-scale the predictions if the number of trained models and
# FOLDS ever got out of sync (interrupted training, FOLDS edited elsewhere).
n_models = len(best_models)

# Running mean of the per-model predictions, one value per test row.
avg_score = np.zeros((x_test.shape[0],))

for i, model in enumerate(best_models):
    print()
    print('Predictions from model', i)

    # Each model contributes an equal share to the ensemble average.
    avg_score += model.predict(x_test) / float(n_models)


Predictions from model 0

Predictions from model 1

Predictions from model 2

Predictions from model 3

Predictions from model 4


In [11]:
# Load the sample submission; its "count" column is overwritten with the predictions below.
FILE_SAMPLE_SUB = "sampleSubmission.csv"
df_sub = pd.read_csv(FILE_SAMPLE_SUB)

In [12]:
# Replace the "count" column of the submission frame with the averaged predictions.
df_sub = df_sub.assign(count=avg_score)

In [13]:
# Floor the predictions at zero: every negative predicted count is replaced with 0.
is_negative = df_sub["count"] < 0
df_sub["count"] = df_sub["count"].mask(is_negative, 0)

In [14]:
# Name of the submission file for this run.
FILE_SUB = "submission10.csv"

# Write the submission without the DataFrame index.
df_sub.to_csv(path_or_buf=FILE_SUB, index=False)

In [15]:
# Submit the file written above through the Kaggle CLI ($FILE_SUB is expanded from the Python variable).
!kaggle competitions submit -c "bike-sharing-demand" -f $FILE_SUB -m "sub10, kfold cv, no day"

100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 112kB/s]
Successfully submitted to Bike Sharing Demand

## Compute RMSLE on the validation folds, to get an idea of the score before submission

In [16]:
def get_rmsle(y_pred, y_actual):
    """Compute the Root Mean Squared Logarithmic Error (RMSLE).

    RMSLE = sqrt(mean((log(1 + y_pred) - log(1 + y_actual)) ** 2))

    Negative predictions are floored at 0 before taking the logarithm.
    Without the floor, any prediction <= -1 makes the logarithm NaN or -inf
    and poisons the whole score. The floor also matches the submission step,
    which replaces negative predicted counts with 0.

    Parameters
    ----------
    y_pred : array-like
        Predicted values; compared element by element (by position) with
        `y_actual`.
    y_actual : array-like
        Observed values, expected to be >= 0.

    Returns
    -------
    numpy.float64
        The RMSLE over all elements.
    """
    # Floor at zero so log1p is always defined for the predictions.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    y_actual = np.asarray(y_actual, dtype=float)

    # log1p(x) == log(1 + x), and is more accurate for small x.
    diff = np.log1p(y_pred) - np.log1p(y_actual)
    return np.sqrt(np.mean(np.square(diff)))

In [23]:
# Rebuild the splits with the same n_splits, shuffle and seed as the training
# cell, so that fold k here lines up with best_models[k].
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# Running mean of the per-fold RMSLE.
avg_rmsle = 0.

for fold_idx, (_, valid_idx) in enumerate(skf.split(data_used)):
    print()
    print('Processing fold:', fold_idx + 1)

    # Only the held-out part of the fold is needed for scoring.
    data_valid = data_used.iloc[valid_idx]

    # Model stored at this fold's position by the training cell.
    model = best_models[fold_idx]

    rmsle = get_rmsle(model.predict(data_valid[features]), data_valid[TARGET])

    print('RMSLE for split is:', round(rmsle, 4))

    avg_rmsle += rmsle / float(FOLDS)

print()
print('Avg rmsle is:', round(avg_rmsle, 4))


Processing fold: 1
RMSLE for split is: 0.4776

Processing fold: 2
RMSLE for split is: 0.4729

Processing fold: 3
RMSLE for split is: 0.4184

Processing fold: 4
RMSLE for split is: 0.4463

Processing fold: 5
RMSLE for split is: 0.4165

Avg rmsle is: 0.4463


  
