In [1]:
import numpy as np
import pandas as pd
from sklearn import *
import nltk, datetime
import timeit

train = pd.read_csv('../data/sales_train_v2.csv')
test = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/sample_submission.csv')
items = pd.read_csv('../data/items.csv')
item_cats = pd.read_csv('../data/item_categories.csv')
shops = pd.read_csv('../data/shops.csv')
print('train:', train.shape, 'test:', test.shape)

# Data present on train dataset and not in the test dataset
[c for c in train.columns if c not in test.columns]



train: (2935849, 6) test: (214200, 3)


['date', 'date_block_num', 'item_price', 'item_cnt_day']

# All data formats

In [2]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [3]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [4]:
item_cats.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [5]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [6]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [7]:
submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [8]:
# Treat only ids as training data
items_id = items.drop(labels=['item_name'], axis = 1)
item_cats_id = item_cats.drop(labels=['item_category_name'], axis = 1)
shops_id = shops.drop(labels=['shop_name'], axis = 1)

In [9]:
# Formatting the date feature into a monthly data feature 
train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

# Grouping all daily item counts into monthly item counts
train = train.drop(['date','item_price'], axis=1)
train = train.groupby([c for c in train.columns if c not in ['item_cnt_day']], as_index=False)[['item_cnt_day']].sum()
train = train.rename(columns={'item_cnt_day':'item_cnt_month'})

In [10]:
# Monthly Mean
shop_item_monthly_mean = train[['shop_id','item_id','item_cnt_month']].groupby(['shop_id','item_id'], as_index=False).mean()
shop_item_monthly_mean = shop_item_monthly_mean.rename(columns={'item_cnt_month':'item_cnt_month_mean'})

#Last Month (Oct 2015)
shop_item_prev_month = train[train['date_block_num']==33][['shop_id','item_id','item_cnt_month']]
shop_item_prev_month = shop_item_prev_month.rename(columns={'item_cnt_month':'item_cnt_prev_month'})

In [11]:
# Add Mean Feature
train = pd.merge(train, shop_item_monthly_mean, how='left', on=['shop_id','item_id'])

#Add Previous Month Feature
train = pd.merge(train, shop_item_prev_month, how='left', on=['shop_id','item_id']).fillna(0.)

#Items features
train = pd.merge(train, items_id, how='left', on='item_id')

#Item Category features
train = pd.merge(train, item_cats_id, how='left', on='item_category_id')

#Shops features
train = pd.merge(train, shops_id, how='left', on='shop_id')

train.head()

Unnamed: 0,date_block_num,shop_id,item_id,month,year,item_cnt_month,item_cnt_month_mean,item_cnt_prev_month,item_category_id
0,0,0,32,1,2013,6.0,8.0,0.0,40
1,0,0,33,1,2013,3.0,3.0,0.0,37
2,0,0,35,1,2013,1.0,7.5,0.0,40
3,0,0,43,1,2013,1.0,1.0,0.0,40
4,0,0,51,1,2013,2.0,2.5,0.0,57


In [12]:
# Submitting test for the following month of training data
test['month'] = 11
test['year'] = 2015
test['date_block_num'] = 34

#Add Mean Feature
test = pd.merge(test, shop_item_monthly_mean, how='left', on=['shop_id','item_id']).fillna(0.)

#Add Previous Month Feature
test = pd.merge(test, shop_item_prev_month, how='left', on=['shop_id','item_id']).fillna(0.)

#Items features
test = pd.merge(test, items_id, how='left', on='item_id')

#Item Category features
test = pd.merge(test, item_cats_id, how='left', on='item_category_id')

#Shops features
test = pd.merge(test, shops_id, how='left', on='shop_id')

test['item_cnt_month'] = 0.
test.head()

Unnamed: 0,ID,shop_id,item_id,month,year,date_block_num,item_cnt_month_mean,item_cnt_prev_month,item_category_id,item_cnt_month
0,0,5,5037,11,2015,34,1.444444,0.0,19,0.0
1,1,5,5320,11,2015,34,0.0,0.0,55,0.0
2,2,5,5233,11,2015,34,2.0,1.0,19,0.0
3,3,5,5232,11,2015,34,1.0,0.0,23,0.0
4,4,5,5268,11,2015,34,0.0,0.0,20,0.0


In [13]:
col = [c for c in train.columns if c not in ['item_cnt_month']]

#Validation Hold Out Month
x_train = train[train['date_block_num']<=33]
y_train = np.log1p(x_train['item_cnt_month'].clip(0.,20.))
x_train = x_train[col]

x_test = train[train['date_block_num']==33]
y_test = np.log1p(x_test['item_cnt_month'].clip(0.,20.))
x_test = x_test[col]

In [14]:
regressor = ensemble.RandomForestRegressor(verbose=1, n_estimators=20, n_jobs=-1, warm_start = True)
start = timeit.default_timer()

regressor.fit(x_train, y_train)

stop = timeit.default_timer()
print ("duration: " + str(stop - start))

duration: 51.37965755070679


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   51.1s finished


In [15]:
start = timeit.default_timer()

print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test.clip(0.,20.), regressor.predict(x_test).clip(0.,20.))))
stop = timeit.default_timer()
print ("duration: " + str(stop - start))

RMSE: 0.011289108165540067
duration: 0.12677618266004487


[Parallel(n_jobs=8)]: Done  25 out of  25 | elapsed:    0.0s finished
