### Catboost3
* added feature engineering
* added year, removed temp

In [1]:
import pandas as pd
import numpy as np
import catboost as cat

from sklearn.model_selection import KFold

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Notebook-wide settings, collected in one cell so they are easy to tune.

# Path of the training CSV read by the data-loading cell.
FILE_TRAIN = "train.csv"

# Seed passed to the K-fold splitter so the fold assignment is repeatable.
SEED = 42

# Presumably the default (width, height) for plots; not referenced by the
# cells visible in this notebook.
FIGSIZE = (9, 6)

In [3]:
# Load the raw training data, parse the timestamp column and derive three
# calendar features (hour, day, year) from it.
# Month is deliberately not extracted: it is effectively a duplicate of
# `season`, so it was removed.
data_orig = (
    pd.read_csv(FILE_TRAIN)
    .assign(datetime=lambda df: pd.to_datetime(df['datetime']))
    .assign(
        hour=lambda df: df['datetime'].dt.hour,
        day=lambda df: df['datetime'].dt.day,
        year=lambda df: df['datetime'].dt.year,
    )
)

In [4]:
# Partition the columns of the training frame into four groups:
# ignored columns, the target, categorical features and numeric features.
all_columns = data_orig.columns

TARGET = "count"

# Columns that are not used as model inputs.
# `atemp` and `temp` are strongly correlated (0.98), so only `atemp` is kept.
del_columns = ['datetime', 'casual', 'registered', 'temp']

# Features handed to CatBoost as categorical.
# NOTE(review): `day` is treated as a category; if the test set contains
# day-of-month values that never occur in the training set, those categories
# are unseen by the model -- confirm against the data.
cat_cols = ['season', 'holiday','workingday', 'weather', 'hour', 'day', 'year']

# Everything that is neither target, ignored nor categorical is numeric.
num_cols = list(set(all_columns).difference([TARGET], del_columns, cat_cols))

# Sorted so the feature order (and the categorical indexes derived from it)
# does not depend on set iteration order.
features = sorted([*cat_cols, *num_cols])

# Quick sanity report of the group sizes.
for label, size in (
    ('Tutte le colonne:', len(all_columns)),
    ('Colonne ignorate:', len(del_columns)),
    ('target:', len([TARGET])),
    ('Colonne cat:', len(cat_cols)),
    ('Colonne num:', len(num_cols)),
    ('Num. features', len(features)),
):
    print(label, size)

Tutte le colonne: 15
Colonne ignorate: 4
target: 1
Colonne cat: 7
Colonne num: 3
Num. features 10


In [5]:
# Working copy of the training data without the ignored columns.
data_used = data_orig.drop(columns=del_columns)

In [6]:
# CatBoost receives the categorical features as column positions, so map each
# categorical column name to its position inside `features`.
cat_lookup = frozenset(cat_cols)
cat_columns_idxs = [pos for pos, name in enumerate(features) if name in cat_lookup]

In [7]:
%%time

FOLDS = 5

skf = KFold(n_splits = FOLDS, shuffle=True, random_state = SEED)

params = {'iterations':6000,
          'learning_rate':0.005,
          'depth':10,
          'task_type':"GPU",
          'use_best_model': True
         }

best_models = []

i = 1
for train_idx, valid_idx in skf.split(data_used):
    print()
    print('Processing fold:', i)
    
    data_train = data_used.iloc[train_idx]
    data_valid = data_used.iloc[valid_idx]
    
    x_train = data_train[features]
    y_train = data_train[TARGET]

    x_valid = data_valid[features]
    y_valid = data_valid[TARGET]
    
    model = cat.CatBoostRegressor(**params)

    model.fit(x_train, y_train, cat_columns_idxs, verbose=500, early_stopping_rounds=50, eval_set=(x_valid, y_valid))
    
    best_models.append(model)
    
    i  += 1


Processing fold: 1
0:	learn: 180.3682142	test: 181.0423853	best: 181.0423853 (0)	total: 96.6ms	remaining: 9m 39s
500:	learn: 64.1827354	test: 63.8051729	best: 63.8051729 (500)	total: 40.3s	remaining: 7m 22s
1000:	learn: 51.7588619	test: 51.5987779	best: 51.5987779 (1000)	total: 1m 20s	remaining: 6m 42s
1500:	learn: 48.3793718	test: 48.6926466	best: 48.6926466 (1500)	total: 1m 58s	remaining: 5m 55s
2000:	learn: 46.7089941	test: 47.4000229	best: 47.4000229 (2000)	total: 2m 35s	remaining: 5m 10s
2500:	learn: 45.6635858	test: 46.6261483	best: 46.6261483 (2500)	total: 3m 14s	remaining: 4m 32s
3000:	learn: 44.9076759	test: 46.0805255	best: 46.0805255 (3000)	total: 3m 52s	remaining: 3m 51s
3500:	learn: 44.4132936	test: 45.7332391	best: 45.7332039 (3498)	total: 4m 26s	remaining: 3m 10s
4000:	learn: 43.8125758	test: 45.3330548	best: 45.3330548 (4000)	total: 5m	remaining: 2m 30s
4500:	learn: 43.2834594	test: 45.0061087	best: 45.0061087 (4499)	total: 5m 41s	remaining: 1m 53s
5000:	learn: 42.7702

### Submission

In [8]:
# Load the test set, i.e. the rows the trained models are scored on.
FILE_TEST = "test.csv"
test_orig = pd.read_csv(FILE_TEST)

In [9]:
# Apply to the test set the same feature engineering used for training:
# parse the timestamp, then derive hour, day and year from it
# (month is not extracted here either).
test_orig = (
    test_orig
    .assign(datetime=lambda df: pd.to_datetime(df['datetime']))
    .assign(
        hour=lambda df: df['datetime'].dt.hour,
        day=lambda df: df['datetime'].dt.day,
        year=lambda df: df['datetime'].dt.year,
    )
)

In [16]:
# Ensemble prediction: score the test set with every fold model and average
# the results.
x_test = test_orig[features]

# Average over the models that actually exist instead of the FOLDS setting,
# so the result is a true mean even if the two ever get out of sync
# (e.g. FOLDS edited, or training interrupted, after some models were stored).
n_models = len(best_models)
if n_models == 0:
    # without this guard an empty list would silently yield all-zero predictions
    raise ValueError("best_models is empty: run the training cell before predicting")

avg_score = np.zeros((x_test.shape[0],))

for i, model in enumerate(best_models):
    print()
    print('Predictions from model', i)

    score_test = model.predict(x_test)

    # accumulate each model's equal share of the final average
    avg_score += score_test / n_models


Predictions from model 0

Predictions from model 1

Predictions from model 2

Predictions from model 3

Predictions from model 4


In [17]:
# Use the sample submission file as the template for the output frame.
FILE_SAMPLE_SUB = "sampleSubmission.csv"
df_sub = pd.read_csv(FILE_SAMPLE_SUB)

In [18]:
# Overwrite the template's "count" column with the averaged predictions.
# Values are matched by position, so this relies on the sample submission rows
# being in the same order as the rows of the test set (not checked here).
df_sub = df_sub.assign(count=avg_score)

In [19]:
# Replace any negative prediction with zero.
df_sub["count"] = df_sub["count"].clip(lower=0)

In [21]:
# Write the submission to disk; the row index is not included in the file.
FILE_SUB = "submission8.csv"

df_sub.to_csv(path_or_buf=FILE_SUB, index=False)

In [22]:
# Upload the file named by FILE_SUB to the "bike-sharing-demand" competition
# through the Kaggle command-line tool, with a short message describing the run.
!kaggle competitions submit -c "bike-sharing-demand" -f $FILE_SUB -m "sub8, kfold cv"

100%|████████████████████████████████████████| 240k/240k [00:02<00:00, 98.4kB/s]
Successfully submitted to Bike Sharing Demand