In [1]:
%store -r __importRegression

In [2]:
__importRegression



In [3]:
from itertools import product
import gc

In [4]:
sales = pd.read_csv('../input/train.csv')
#test = pd.read_csv('../input/test.csv')
#items = pd.read_csv("../input/items.csv")
#categories = pd.read_csv("../input/item_categories.csv")

In [5]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

#get aggregated values for (shop_id, item_id, month)
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})

#fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]

#join aggregated data to the grid
all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)

#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

In [6]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [7]:
all_data = downcast_dtypes(all_data)
del grid, gb 
gc.collect();

In [8]:
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target
139255,0,19,0,0.0
141495,0,27,0,0.0
144968,0,28,0,0.0
142661,0,29,0,0.0
138947,0,32,0,6.0


## Train / Validation split

In [9]:
# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts 
dates = all_data['date_block_num']

last_block = dates.max()
print('Test `date_block_num` is %d' % last_block)

Test `date_block_num` is 33


In [10]:
to_drop_cols = ['date_block_num']

In [11]:
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'n_jobs': -1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

lgb = LGBMRegressor(**lgb_params)

In [12]:
# Moving window validation scheme.
# On each iteration, use last month for validation
validation_months = [33, 32, 31, 30, 29]

for last_month in validation_months:
    # Split train and validation data
    dates_train = dates[dates <  last_month]
    dates_test  = dates[dates == last_month]

    X_train = all_data.loc[dates <  last_month].drop(to_drop_cols, axis=1)
    X_test =  all_data.loc[dates == last_month].drop(to_drop_cols, axis=1)

    y_train = X_train.target.values
    y_test =  X_test.target.values
    
    lgb.fit(X_train, y_train)

    pred_train = lgb.predict(X_train)
    pred_test = lgb.predict(X_test)
    
    ## R2 and RMSE score for each validation fold
    print('Month {0:d} Test R-2: {1:f}'.format(last_month, r2_score(y_test, pred_test)))
    print('Month {0:d} Test RMSE {1:f}'.format(last_month, np.sqrt(mean_squared_error(y_test, pred_test))))
    

Month 33 Test R-2: 0.145638
Month 33 Test RMSE 4.938386
Month 32 Test R-2: 0.188912
Month 32 Test RMSE 6.167177
Month 31 Test R-2: 0.380407
Month 31 Test RMSE 1.759887
Month 30 Test R-2: 0.416311
Month 30 Test RMSE 1.634063
Month 29 Test R-2: 0.363897
Month 29 Test RMSE 1.857240


## Prepare submission

In [13]:
## Take all train data
X_train_all = all_data.drop(to_drop_cols, axis=1)
y_train_all = X_train_all.pop('target')

In [14]:
X_train_all.head()

Unnamed: 0,shop_id,item_id
139255,0,19
141495,0,27
144968,0,28
142661,0,29
138947,0,32


In [15]:
test = pd.read_csv('../input/test.csv')

In [16]:
test_id = test.pop('ID')

In [17]:
test.head()

Unnamed: 0,shop_id,item_id
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [18]:
lgb.fit(X_train_all, y_train_all)

pred_train_all = lgb.predict(X_train_all)
pred_test = lgb.predict(test)

## R2 and RMSE score for each validation fold
print('Train R-2: {1:f}'.format(last_month, r2_score(y_train_all, pred_train_all)))
print('Train RMSE {1:f}'.format(last_month, np.sqrt(mean_squared_error(y_train_all, pred_train_all))))

Train R-2: 0.004286
Train RMSE 3.409912


In [19]:
pred_test.shape

(214200,)

In [20]:
for i in range(len(pred_test)):
    if pred_test[i] > 20:
        pred_test[i] = 20
    if pred_test[i] < 0:
        pred_test[i] = 20

In [21]:
test_submit = pd.DataFrame({'ID': test_id, 'item_cnt_month': pred_test})
print test_submit.shape
test_submit.to_csv('lgbm_all_train_totalSales.csv', index=False)
test_submit.head()

(214200, 2)


Unnamed: 0,ID,item_cnt_month
0,0,0.338577
1,1,0.363624
2,2,0.315495
3,3,0.315495
4,4,0.363624
