In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import gc
import catboost as cb
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold
import xgboost as xgb
from copy import deepcopy

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train['num_orders'] = np.log1p(train['num_orders'])
sample = pd.read_csv("sample_submission.csv")
train['train_or_test'] = 'train'
test['train_or_test'] = 'test'


In [3]:
psa = train.append(test).reset_index(drop=True)[train.columns]
psa = psa.sort_values(['center_id', 'meal_id', 'week']).reset_index(drop=True)
psa['checkout_price'] = np.log1p(psa['checkout_price'])
psa['base_price'] = np.log1p(psa['base_price'])
psa['diff_price'] = (psa['base_price'] - psa['checkout_price']) / psa['base_price']
psa['d1'] = (psa['diff_price'] < 0).astype(int) #dummy
psa['diff_period'] = (psa['checkout_price'].shift(1) - psa['checkout_price']).fillna(1)/ psa['checkout_price'].shift(1).fillna(1)

In [4]:
for _, r in psa.groupby(['center_id', 'meal_id'])['week'].first().reset_index().iterrows():
    psa.loc[(psa['center_id']==r['center_id']) & (psa['meal_id']==r['meal_id']) & (psa['week']==r['week']), 'diff_period'] = psa[(psa['center_id']==r['center_id']) & (psa['meal_id']==r['meal_id']) & (psa['week']!=r['week'])]['diff_period'].mean()

In [5]:
psa['diff_period'] = psa['diff_period'].fillna(0)
psa['price_increase'] = (psa['diff_period'] < 0).astype(int)

In [6]:
def rolling_mean(df, gpby_cols, target_col, windows, min_periods=2, 
                             shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmean', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).mean().values +\
            np.random.normal(scale=1.6, size=(len(df),))
    return df

def rolling_med(df, gpby_cols, target_col, windows, min_periods=2, 
                            shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmed', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).median().values +\
            np.random.normal(scale=1.6, size=(len(df),))
    return df


def exp_weighted_mean(df, gpby_cols, target_col, alpha=[0.9], shift=[1]):
    gpby = df.groupby(gpby_cols)
    for a in alpha:
        for s in shift:
            df['_'.join([target_col, 'lag', str(s), 'ewm', str(a)])] = \
                gpby[target_col].shift(s).ewm(alpha=a).mean().values
    return df





In [7]:
def llag(df, gpby_cols, target_col, lags):
    gpby = df.groupby(gpby_cols)
    for i in lags:
        df['_'.join([target_col, 'lag', str(i)])] = \
                gpby[target_col].shift(i).values + np.random.normal(scale=1.6, size=(len(df),))
    return df

In [8]:
psa = llag(psa, gpby_cols=['center_id','meal_id'], target_col='num_orders', 
                               lags=[10,11, 12])

psa = rolling_mean(psa, gpby_cols=['center_id','meal_id'], 
                                 target_col='num_orders', windows=[26,52,104], 
                                 min_periods=3, win_type='triang')

psa = rolling_med(psa, gpby_cols=['center_id','meal_id'], 
                                 target_col='num_orders', windows=[26,52,104], 
                                 min_periods=3, win_type=None)

psa = exp_weighted_mean(psa, gpby_cols=['center_id','meal_id'], 
                               target_col='num_orders', 
                               alpha=[0.5], 
                               shift=[10,11,12,13,14,15])

In [9]:
psa = psa[['id','week','center_id','meal_id','checkout_price','base_price','emailer_for_promotion','homepage_featured','num_orders','city_code','region_code','center_type','op_area','category','cuisine','train_or_test','diff_price','d1','diff_period','price_increase','num_orders_lag_10','num_orders_lag_11','num_orders_lag_12','num_orders_lag_10_ewm_0.5','num_orders_lag_11_ewm_0.5','num_orders_lag_12_ewm_0.5','num_orders_lag_13_ewm_0.5','num_orders_lag_14_ewm_0.5','num_orders_lag_15_ewm_0.5']]


In [10]:
trn = psa.loc[psa['train_or_test']=='train', :]
test = psa.loc[psa['train_or_test']=='test', :]

### CatBoost 

In [11]:
avoid_columns1 = ['id', 'num_orders', 'train_or_test', 'checkout_price', 'base_price', 'city_code', 'region_code', 'center_type', '']

In [12]:
features1 = [col for col in test.columns if col not in avoid_columns1]

In [13]:
features1


['week',
 'center_id',
 'meal_id',
 'emailer_for_promotion',
 'homepage_featured',
 'op_area',
 'category',
 'cuisine',
 'diff_price',
 'd1',
 'diff_period',
 'price_increase',
 'num_orders_lag_10',
 'num_orders_lag_11',
 'num_orders_lag_12',
 'num_orders_lag_10_ewm_0.5',
 'num_orders_lag_11_ewm_0.5',
 'num_orders_lag_12_ewm_0.5',
 'num_orders_lag_13_ewm_0.5',
 'num_orders_lag_14_ewm_0.5',
 'num_orders_lag_15_ewm_0.5']

In [14]:
ind1 = [1, 2, 3, 4, 6, 7, 9, 11]

In [15]:
CatBoost1  = cb.CatBoostRegressor(iterations=625, learning_rate=0.06, depth=8, l2_leaf_reg=17, loss_function='RMSE', random_seed=2018)

In [16]:
CatBoost1.fit(X=trn[features1], y=trn['num_orders'], cat_features=ind1, verbose=True)

0:	learn: 1.1600310	total: 339ms	remaining: 3m 31s
1:	learn: 1.1162154	total: 495ms	remaining: 2m 34s
2:	learn: 1.0743105	total: 680ms	remaining: 2m 20s
3:	learn: 1.0367711	total: 884ms	remaining: 2m 17s
4:	learn: 1.0013641	total: 1.06s	remaining: 2m 11s
5:	learn: 0.9694795	total: 1.25s	remaining: 2m 8s
6:	learn: 0.9392827	total: 1.45s	remaining: 2m 7s
7:	learn: 0.9112050	total: 1.63s	remaining: 2m 5s
8:	learn: 0.8849546	total: 1.83s	remaining: 2m 5s
9:	learn: 0.8611727	total: 2.04s	remaining: 2m 5s
10:	learn: 0.8397361	total: 2.23s	remaining: 2m 4s
11:	learn: 0.8185643	total: 2.4s	remaining: 2m 2s
12:	learn: 0.7995766	total: 2.57s	remaining: 2m
13:	learn: 0.7824783	total: 2.73s	remaining: 1m 59s
14:	learn: 0.7665379	total: 2.92s	remaining: 1m 58s
15:	learn: 0.7516778	total: 3.1s	remaining: 1m 58s
16:	learn: 0.7380342	total: 3.29s	remaining: 1m 57s
17:	learn: 0.7254369	total: 3.47s	remaining: 1m 56s
18:	learn: 0.7134286	total: 3.65s	remaining: 1m 56s
19:	learn: 0.7017875	total: 3.8s	re

158:	learn: 0.5144823	total: 27.2s	remaining: 1m 19s
159:	learn: 0.5143356	total: 27.4s	remaining: 1m 19s
160:	learn: 0.5140673	total: 27.6s	remaining: 1m 19s
161:	learn: 0.5139210	total: 27.8s	remaining: 1m 19s
162:	learn: 0.5137644	total: 27.9s	remaining: 1m 19s
163:	learn: 0.5136458	total: 28.1s	remaining: 1m 19s
164:	learn: 0.5134541	total: 28.2s	remaining: 1m 18s
165:	learn: 0.5131573	total: 28.4s	remaining: 1m 18s
166:	learn: 0.5129976	total: 28.6s	remaining: 1m 18s
167:	learn: 0.5126898	total: 28.7s	remaining: 1m 18s
168:	learn: 0.5125340	total: 28.9s	remaining: 1m 18s
169:	learn: 0.5121575	total: 29.1s	remaining: 1m 17s
170:	learn: 0.5119290	total: 29.3s	remaining: 1m 17s
171:	learn: 0.5118316	total: 29.4s	remaining: 1m 17s
172:	learn: 0.5116916	total: 29.6s	remaining: 1m 17s
173:	learn: 0.5115277	total: 29.8s	remaining: 1m 17s
174:	learn: 0.5113189	total: 29.9s	remaining: 1m 16s
175:	learn: 0.5111891	total: 30.1s	remaining: 1m 16s
176:	learn: 0.5109665	total: 30.2s	remaining: 

317:	learn: 0.4940204	total: 57.4s	remaining: 55.4s
318:	learn: 0.4939435	total: 57.5s	remaining: 55.2s
319:	learn: 0.4938447	total: 57.7s	remaining: 55s
320:	learn: 0.4937866	total: 58s	remaining: 54.9s
321:	learn: 0.4937072	total: 58.2s	remaining: 54.8s
322:	learn: 0.4936206	total: 58.4s	remaining: 54.6s
323:	learn: 0.4935578	total: 58.6s	remaining: 54.4s
324:	learn: 0.4934611	total: 58.8s	remaining: 54.3s
325:	learn: 0.4933038	total: 58.9s	remaining: 54.1s
326:	learn: 0.4932455	total: 59.1s	remaining: 53.8s
327:	learn: 0.4931645	total: 59.3s	remaining: 53.7s
328:	learn: 0.4930716	total: 59.5s	remaining: 53.5s
329:	learn: 0.4928863	total: 59.7s	remaining: 53.4s
330:	learn: 0.4928272	total: 59.9s	remaining: 53.2s
331:	learn: 0.4927483	total: 1m	remaining: 53s
332:	learn: 0.4926786	total: 1m	remaining: 52.8s
333:	learn: 0.4925912	total: 1m	remaining: 52.6s
334:	learn: 0.4925098	total: 1m	remaining: 52.4s
335:	learn: 0.4924188	total: 1m	remaining: 52.2s
336:	learn: 0.4923416	total: 1m	r

475:	learn: 0.4828837	total: 1m 25s	remaining: 26.9s
476:	learn: 0.4828139	total: 1m 26s	remaining: 26.7s
477:	learn: 0.4827786	total: 1m 26s	remaining: 26.5s
478:	learn: 0.4827276	total: 1m 26s	remaining: 26.3s
479:	learn: 0.4826640	total: 1m 26s	remaining: 26.2s
480:	learn: 0.4825693	total: 1m 26s	remaining: 26s
481:	learn: 0.4825128	total: 1m 26s	remaining: 25.8s
482:	learn: 0.4824769	total: 1m 27s	remaining: 25.6s
483:	learn: 0.4824418	total: 1m 27s	remaining: 25.4s
484:	learn: 0.4823786	total: 1m 27s	remaining: 25.2s
485:	learn: 0.4823119	total: 1m 27s	remaining: 25.1s
486:	learn: 0.4822721	total: 1m 27s	remaining: 24.9s
487:	learn: 0.4821942	total: 1m 27s	remaining: 24.7s
488:	learn: 0.4821543	total: 1m 28s	remaining: 24.5s
489:	learn: 0.4821204	total: 1m 28s	remaining: 24.3s
490:	learn: 0.4820395	total: 1m 28s	remaining: 24.2s
491:	learn: 0.4819905	total: 1m 28s	remaining: 24s
492:	learn: 0.4819178	total: 1m 28s	remaining: 23.8s
493:	learn: 0.4818728	total: 1m 29s	remaining: 23.

<catboost.core.CatBoostRegressor at 0x140d9848b20>

In [17]:
pred = CatBoost1.predict(test[features1])

In [18]:
pred = (np.exp(pred) - 1)

In [19]:
sub = pd.DataFrame(columns=['id', 'num_orders'])
sub['id'] = test['id']
sub['num_orders'] = pred
sub.to_csv('iii2.csv', index=False)