In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from itertools import product

In [2]:
df_train=pd.read_csv('sales_train.csv')
df_test=pd.read_csv('test.csv')

In [3]:
filt=(df_train['item_price']<25000)
df_train=df_train[filt]

In [4]:
filt=(df_train['item_price']>=0)
df_train=df_train[filt]

In [5]:
filt=(df_train['item_cnt_day']<1000)
df_train=df_train[filt]

In [6]:
df_train['date']= pd.to_datetime(df_train['date'],format='%d.%m.%Y')
df_train.set_index('date',inplace=True)


In [7]:
sales=df_train
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

#get aggregated values for (shop_id, item_id, month)
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})

#fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
#join aggregated data to the grid
all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [8]:
all_data['target']=np.clip(all_data['target'], 0, 21)

In [14]:
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target
139237,0,19,0,0.0
141477,0,27,0,0.0
144950,0,28,0,0.0
142643,0,29,0,0.0
138929,0,32,0,6.0


In [13]:
df_test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num
0,0,5,5037,34
1,1,5,5320,34
2,2,5,5233,34
3,3,5,5232,34
4,4,5,5268,34


In [11]:
df_test['date_block_num']=34

In [12]:
df_test=df_test[['ID','shop_id','item_id','date_block_num']]


In [15]:
X = all_data[['shop_id','item_id','date_block_num']]
y =all_data['target']

In [16]:
from catboost import CatBoostRegressor
model = CatBoostRegressor(depth=6)
model.fit(X,y)

Learning rate set to 0.204234
0:	learn: 1.2335382	total: 1.08s	remaining: 17m 58s
1:	learn: 1.2298760	total: 1.89s	remaining: 15m 43s
2:	learn: 1.2275984	total: 2.75s	remaining: 15m 15s
3:	learn: 1.2260656	total: 3.55s	remaining: 14m 43s
4:	learn: 1.2241153	total: 4.34s	remaining: 14m 24s
5:	learn: 1.2231950	total: 5.04s	remaining: 13m 54s
6:	learn: 1.2224168	total: 5.83s	remaining: 13m 46s
7:	learn: 1.2215217	total: 6.59s	remaining: 13m 37s
8:	learn: 1.2209151	total: 7.45s	remaining: 13m 39s
9:	learn: 1.2204969	total: 8.15s	remaining: 13m 27s
10:	learn: 1.2197814	total: 8.82s	remaining: 13m 13s
11:	learn: 1.2192903	total: 9.49s	remaining: 13m 1s
12:	learn: 1.2188255	total: 10.1s	remaining: 12m 47s
13:	learn: 1.2184825	total: 10.8s	remaining: 12m 39s
14:	learn: 1.2180403	total: 11.5s	remaining: 12m 34s
15:	learn: 1.2175538	total: 12.6s	remaining: 12m 54s
16:	learn: 1.2171663	total: 13.7s	remaining: 13m 10s
17:	learn: 1.2165758	total: 14.4s	remaining: 13m 5s
18:	learn: 1.2161493	total: 

<catboost.core.CatBoostRegressor at 0x223a7fe6ba8>

In [17]:
Xx = df_test[['shop_id','item_id','date_block_num']]
X.head()

Unnamed: 0,shop_id,item_id,date_block_num
139237,0,19,0
141477,0,27,0
144950,0,28,0
142643,0,29,0
138929,0,32,0


In [18]:
Xx.head()

Unnamed: 0,shop_id,item_id,date_block_num
0,5,5037,34
1,5,5320,34
2,5,5233,34
3,5,5232,34
4,5,5268,34


In [19]:
preds = model.predict(Xx)


In [20]:
output= pd.DataFrame({'ID': df_test.ID,'item_cnt_month': preds})

In [21]:
output.to_csv('sub4.csv', index=False)

In [23]:
output['item_cnt_month'].value_counts().sort_values()

 0.068278      4
-0.013847      4
 0.035228      4
-0.032149      4
 0.085259      4
-0.042485      4
 0.037895      4
 0.262605      4
 0.051612      4
 0.451214      4
-0.036066      4
-0.017537      4
 0.005823      4
 0.133072      4
 0.039918      4
-0.058425      4
 0.064095      4
 0.054749      4
 0.028718      4
-0.023644      4
 0.066154      4
 0.052677      4
 0.142407      4
-0.005790      4
-0.009492      4
 0.565428      4
 0.014772      4
 0.088372      4
 0.045781      4
 0.116108      4
            ... 
 0.278810    123
 0.412721    123
 0.276835    123
 0.300368    123
 0.242704    123
 0.336522    123
 0.579620    123
 0.273720    123
 0.290416    123
 0.204986    123
 0.333759    123
 1.054017    123
 0.228268    123
 0.233737    123
 0.491269    123
 0.222526    123
 0.371991    123
 0.338289    123
 0.233081    123
 0.288180    123
 0.227894    123
 0.270913    123
 0.271666    123
 0.534612    123
 0.269818    123
 0.336039    123
 0.270096    123
 0.349646    1