In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import skew, kurtosis, linregress
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
dates = pd.read_csv('dates.csv')
stores = pd.read_csv('stores.csv')
holidays = pd.read_csv('holidays.csv')
sample_sub = pd.read_csv('SampleSubmission.csv')

In [20]:
train['is_test'] = False
test['is_test'] = True
all_data = pd.concat([train, test])
all_data

Unnamed: 0,date,store_id,category_id,target,onpromotion,nbr_of_transactions,is_test
0,365,store_1,category_24,0.0,0,0.0,False
1,365,store_1,category_21,0.0,0,0.0,False
2,365,store_1,category_32,0.0,0,0.0,False
3,365,store_1,category_18,0.0,0,0.0,False
4,365,store_1,category_26,0.0,0,0.0,False
...,...,...,...,...,...,...,...
99787,1682,store_9,category_23,,0,,True
99788,1682,store_9,category_20,,1,,True
99789,1682,store_9,category_15,,7,,True
99790,1682,store_9,category_29,,8,,True


In [21]:
dates

Unnamed: 0,date,year,month,dayofmonth,dayofweek,dayofyear,weekofyear,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,year_weekofyear
0,365,1,1,1,2,1,1,1,True,False,True,False,True,False,101
1,366,1,1,2,3,2,1,1,False,False,False,False,False,False,101
2,367,1,1,3,4,3,1,1,False,False,False,False,False,False,101
3,368,1,1,4,5,4,1,1,False,False,False,False,False,False,101
4,369,1,1,5,6,5,1,1,False,False,False,False,False,False,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,1680,4,8,11,4,223,32,3,False,False,False,False,False,False,432
1316,1681,4,8,12,5,224,32,3,False,False,False,False,False,False,432
1317,1682,4,8,13,6,225,32,3,False,False,False,False,False,False,432
1318,1683,4,8,14,0,226,33,3,False,False,False,False,False,False,433


In [23]:
all_data = pd.merge(all_data, dates, on='date')

In [26]:
holidays.rename(columns={
    'type': 'holiday_type'
}, inplace=True)

In [30]:
all_data = pd.merge(all_data, holidays, on='date')

In [33]:
stores.rename(columns={
    'city': 'store_city',
    'type': 'store_type',
    'cluster': 'store_cluster',
}, inplace=True)

In [36]:
all_data = pd.merge(all_data, stores, on='store_id')

# Training

In [39]:
cat_cols = [
    'store_id',
    'category_id',
]

In [42]:
cols_to_drop = ['is_test']
copy_df = all_data[all_data['is_test'] == False].drop(columns=cols_to_drop)

x = copy_df.drop(columns=['target'])
y = copy_df['target']

x_train, x_valid, y_train, y_valid = train_test_split(x, y, random_state=42)

In [43]:
from catboost import CatBoostRegressor, Pool

cb_params = {
    'iterations': 10000,
    'early_stopping_rounds': 500,
    'learning_rate': 1e-2,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    # 'depth': 6,
    # 'l2_leaf_reg': 4,
    # 'bagging_temperature': 2,
    # 'random_strength': 1,
    # 'border_count': 255,
    # 'od_type': 'IncToDec',
    'verbose': 100,
    # 'subsample': 1.0,
    'random_seed': 42,
    'task_type': 'CPU',
}

In [45]:
train_pool = Pool(x_train, y_train, cat_features=cat_cols)
valid_pool = Pool(x_valid, y_valid, cat_features=cat_cols)

In [46]:
cb_model = CatBoostRegressor(**cb_params)
cb_model.fit(train_pool, eval_set=[train_pool, valid_pool])

0:	learn: 1299.5704145	test: 1299.5703527	test1: 1349.4380186	best: 1349.4380186 (0)	total: 231ms	remaining: 38m 32s
100:	learn: 830.7032026	test: 830.1997480	test1: 906.3100151	best: 906.3100151 (100)	total: 10.8s	remaining: 17m 41s
200:	learn: 663.4835719	test: 661.8217436	test1: 757.1119103	best: 757.1119103 (200)	total: 22.3s	remaining: 18m 9s
300:	learn: 590.3112096	test: 588.6419630	test1: 695.3800390	best: 695.3800390 (300)	total: 33.6s	remaining: 18m 4s
400:	learn: 568.8468801	test: 567.1838319	test1: 678.2885428	best: 678.2885428 (400)	total: 44.7s	remaining: 17m 50s
500:	learn: 555.0936035	test: 553.5234559	test1: 667.7872564	best: 667.7872564 (500)	total: 56.4s	remaining: 17m 49s
600:	learn: 543.7546743	test: 542.2606718	test1: 659.5130448	best: 659.5130448 (600)	total: 1m 9s	remaining: 18m 4s
700:	learn: 533.5435914	test: 531.6697239	test1: 652.4847219	best: 652.4847219 (700)	total: 1m 22s	remaining: 18m 11s
800:	learn: 524.5421607	test: 522.3612259	test1: 646.4965559	best:

KeyboardInterrupt: 