In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from itertools import product

In [2]:
df_train=pd.read_csv('sales_train.csv')
df_test=pd.read_csv('test.csv')

In [3]:
filt=(df_train['item_price']<25000)
df_train=df_train[filt]

In [4]:
filt=(df_train['item_price']>=0)
df_train=df_train[filt]

In [5]:
filt=(df_train['item_cnt_day']<1000)
df_train=df_train[filt]

In [6]:
sales=df_train
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

#get aggregated values for (shop_id, item_id, month)
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})

#fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
#join aggregated data to the grid
all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)
all_data['target']=np.clip(all_data['target'], 0, 20)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [16]:
np.mean(df_train["item_price"].values)

0.03474933529229418

In [12]:
x_min=min(df_train['item_price'].values)
x_max=max(df_train['item_price'].values)
df_train['item_price']=df_train['item_price'].apply(lambda t:((t-x_min)/(x_max-x_min)))

In [19]:
item_price_median = df_train.groupby('item_id')['item_price'].median()
all_data['item_price'] = all_data['item_id'].map(item_price_median)
all_data['item_price'].fillna(0.03475, inplace=True)

In [20]:
df_test['item_price'] = df_test['item_id'].map(item_price_median)
df_test['item_price'].fillna(0.03475, inplace=True)

In [22]:
encoding=all_data.groupby('item_id').size()
encoding=encoding/len(all_data)
all_data['F_enc_Iid']=df_train.item_id.map(encoding)

In [29]:
df_test['F_enc_Iid'] = df_test['item_id'].map(item_price_median)
df_test['F_enc_Iid'].fillna(0.0001, inplace=True)

In [32]:
df_test['date_block_num']=34

In [33]:
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_price,F_enc_Iid
139237,0,19,0,0.0,0.001117,0.000113
141477,0,27,0,0.0,0.05592,6.7e-05
144950,0,28,0,0.0,0.009958,4.6e-05
142643,0,29,0,0.0,0.05794,0.000108
138929,0,32,0,6.0,0.008838,8.6e-05


In [34]:
df_test.head()

Unnamed: 0,ID,shop_id,item_id,item_price,F_enc_Iid,date_block_num
0,0,5,5037,0.079961,0.079961,34
1,1,5,5320,0.03475,0.0001,34
2,2,5,5233,0.023978,0.023978,34
3,3,5,5232,0.023978,0.023978,34
4,4,5,5268,0.03475,0.0001,34


In [35]:
X = all_data[['shop_id','item_id','date_block_num','item_price','F_enc_Iid']]
y =all_data['target']

In [36]:
from catboost import CatBoostRegressor
model = CatBoostRegressor(depth=7)
model.fit(X,y)

Learning rate set to 0.204234
0:	learn: 1.2100348	total: 1.12s	remaining: 18m 42s
1:	learn: 1.2029502	total: 2s	remaining: 16m 35s
2:	learn: 1.1978709	total: 2.84s	remaining: 15m 42s
3:	learn: 1.1949747	total: 3.65s	remaining: 15m 8s
4:	learn: 1.1920377	total: 4.41s	remaining: 14m 37s
5:	learn: 1.1901370	total: 5.24s	remaining: 14m 27s
6:	learn: 1.1881826	total: 6.08s	remaining: 14m 22s
7:	learn: 1.1856911	total: 6.97s	remaining: 14m 24s
8:	learn: 1.1836209	total: 7.75s	remaining: 14m 13s
9:	learn: 1.1822534	total: 8.7s	remaining: 14m 21s
10:	learn: 1.1797108	total: 9.51s	remaining: 14m 14s
11:	learn: 1.1784447	total: 10.4s	remaining: 14m 12s
12:	learn: 1.1760053	total: 11.2s	remaining: 14m 9s
13:	learn: 1.1753021	total: 12s	remaining: 14m 5s
14:	learn: 1.1746719	total: 12.8s	remaining: 13m 57s
15:	learn: 1.1737754	total: 13.5s	remaining: 13m 52s
16:	learn: 1.1725687	total: 14.3s	remaining: 13m 48s
17:	learn: 1.1710774	total: 15.1s	remaining: 13m 46s
18:	learn: 1.1703522	total: 16s	rem

<catboost.core.CatBoostRegressor at 0x27408dcff60>

In [37]:
Xx = df_test[['shop_id','item_id','date_block_num','item_price','F_enc_Iid']]
Xx.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_price,F_enc_Iid
0,5,5037,34,0.079961,0.079961
1,5,5320,34,0.03475,0.0001
2,5,5233,34,0.023978,0.023978
3,5,5232,34,0.023978,0.023978
4,5,5268,34,0.03475,0.0001


In [38]:
X.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_price,F_enc_Iid
139237,0,19,0,0.001117,0.000113
141477,0,27,0,0.05592,6.7e-05
144950,0,28,0,0.009958,4.6e-05
142643,0,29,0,0.05794,0.000108
138929,0,32,0,0.008838,8.6e-05


In [39]:
preds = model.predict(Xx)

In [40]:
output= pd.DataFrame({'ID': df_test.ID,'item_cnt_month': preds})

In [41]:
output.to_csv('sub7.csv', index=False)

In [None]:
from catboost import CatBoostRegressor
model = CatBoostRegressor(depth=16)
model.fit(X,y)

Learning rate set to 0.204234
0:	learn: 1.2060735	total: 7.41s	remaining: 2h 3m 26s
