In [1]:
# Restore the `__importRegression` variable that an earlier session persisted with `%store`.
%store -r __importRegression

In [2]:
# NOTE(review): presumably a stored macro / import snippet that brings `pd`, `np`,
# `LGBMRegressor`, `r2_score` and `mean_squared_error` into scope -- none of them is
# imported in the visible cells. Confirm, and prefer explicit imports.
__importRegression



In [3]:
import gc
from itertools import product

import numpy as np

In [4]:
# Raw input tables: train and test sets plus the item / item-category lookups.
sales = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
items = pd.read_csv("../input/items.csv")
categories = pd.read_csv("../input/item_categories.csv")

# Quick sanity check on the sizes of the two main frames.
print(sales.shape, test.shape)

((2935849, 6), (214200, 3))


In [5]:
# The test rows belong to the month after the training history, so tag them with block 34.
test = test.assign(date_block_num=34)

In [6]:
index_cols = ['shop_id', 'item_id', 'date_block_num']


def _monthly_total(frame, keys, name):
    """Sum `item_cnt_day` of `frame` per `keys` group.

    Returns a flat frame holding the `keys` columns plus the summed values in a
    column called `name`. Plain column selection + `rename` is used instead of the
    nested-dict renamer form of `agg`, which newer pandas versions reject.
    """
    totals = frame.groupby(keys, as_index=False)['item_cnt_day'].sum()
    return totals.rename(columns={'item_cnt_day': name})


# For every month we create a grid from all shops/items combinations from that month
grid = []
for block_num in sales['date_block_num'].unique():
    # Filter the month once and reuse the slice for both the shop and the item lists.
    month_sales = sales[sales['date_block_num'] == block_num]
    cur_shops = month_sales['shop_id'].unique()
    cur_items = month_sales['item_id'].unique()
    grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

# Turn the grid into a pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Aggregated values for (shop_id, item_id, month), joined onto the grid.
# Grid rows without a matching aggregate get 0.
gb = _monthly_total(sales, index_cols, 'target')
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates
gb = _monthly_total(sales, ['shop_id', 'date_block_num'], 'target_shop')
all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = _monthly_total(sales, ['item_id', 'date_block_num'], 'target_item')
all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Sort the data
all_data.sort_values(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [7]:
def downcast_dtypes(df):
    '''
        Shrinks the 64-bit numeric columns of `df` and returns the same object:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left untouched.
    '''
    # (wide dtype name, narrow replacement) pairs, floats first and then ints
    conversions = (("float64", np.float32), ("int64", np.int32))

    for wide_name, narrow_type in conversions:
        # Collect the columns still stored with the wide dtype and cast them together
        wide_cols = [name for name in df if df[name].dtype == wide_name]
        df[wide_cols] = df[wide_cols].astype(narrow_type)

    return df

In [8]:
# Shrink the numeric columns, then release the intermediate frames.
all_data = downcast_dtypes(all_data)

del grid
del gb
gc.collect();

In [9]:
# Preview the first five rows of the assembled grid.
all_data.head(n=5)

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item
139255,0,19,0,0.0,5578.0,1.0
141495,0,27,0,0.0,5578.0,7.0
144968,0,28,0,0.0,5578.0,8.0
142661,0,29,0,0.0,5578.0,4.0
138947,0,32,0,6.0,5578.0,299.0


In [10]:
# Every column that is not part of the index gets a lagged copy.
cols_to_rename = all_data.columns.difference(index_cols).tolist()

In [11]:
# Display the columns selected for lagging.
cols_to_rename

['target', 'target_item', 'target_shop']

In [12]:
# Lag offsets in months: 1 = last month's sales; 12 = same month of the previous year.
# Only the one-month lag is configured here.
shift_range = [1]

In [13]:
from tqdm import tqdm_notebook

In [14]:
for month_shift in tqdm_notebook(shift_range):
    # Lagged columns become `<name>_lag_<shift>`; the index columns keep their names.
    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}

    # Moving every row `month_shift` blocks forward lines its values up with the
    # month that should see them as history when joined on the index columns.
    train_shift = (
        all_data[index_cols + cols_to_rename]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + month_shift)
        .rename(columns=lag_names)
    )

    # Attach the lagged values to the train grid and to the test rows;
    # rows without history for that shift get 0.
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)
    test = pd.merge(test, train_shift, on=index_cols, how='left').fillna(0)

del train_shift
gc.collect();




In [15]:
# Preview the grid with the lag columns attached.
all_data.head(n=5)

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1
0,0,19,0,0.0,5578.0,1.0,0.0,0.0,0.0
1,0,27,0,0.0,5578.0,7.0,0.0,0.0,0.0
2,0,28,0,0.0,5578.0,8.0,0.0,0.0,0.0
3,0,29,0,0.0,5578.0,4.0,0.0,0.0,0.0
4,0,32,0,6.0,5578.0,299.0,0.0,0.0,0.0


In [16]:
# List of all lagged features: columns named `<something>_lag_<shift>` for a configured shift.
# The whole suffix is matched (not just the last character) so that multi-digit
# shifts such as 12 are recognised and unrelated columns ending in a digit are not.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]
# We will drop these at fitting stage
to_drop_cols = list(set(all_data.columns) - (set(fit_cols) | set(index_cols))) + ['date_block_num']

print(fit_cols)
print(to_drop_cols)

['target_lag_1', 'target_item_lag_1', 'target_shop_lag_1']
['target_item', 'target_shop', 'target', 'date_block_num']


## Train / Validation split

In [17]:
# `date_block_num` is not used as a feature, but the train/validation split is keyed on it.
dates = all_data['date_block_num']

# Latest month present in `all_data` (the `test` frame was tagged with block 34 separately).
last_block = dates.max()
print('Test `date_block_num` is %d' % (last_block,))

Test `date_block_num` is 33


In [18]:
# LightGBM settings shared by every fit in this notebook.
lgb_params = dict(
    objective='mse',
    metric='rmse',
    learning_rate=0.03,
    num_leaves=128,
    min_data_in_leaf=128,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    bagging_seed=128,
    n_jobs=-1,
    verbose=0,
)

lgb = LGBMRegressor(**lgb_params)

In [19]:
# Moving window validation scheme.
# Each listed month is held out in turn; the model is refit on all earlier months.
validation_months = [33, 32, 31, 30, 29]

for last_month in validation_months:
    # Boolean row selectors for this fold
    train_mask = dates < last_month
    valid_mask = dates == last_month

    # Kept under these names because the cleanup cell after the loop deletes them
    dates_train = dates[train_mask]
    dates_test = dates[valid_mask]

    X_train = all_data.loc[train_mask].drop(to_drop_cols, axis=1)
    y_train = all_data.loc[train_mask, 'target'].values

    X_test = all_data.loc[valid_mask].drop(to_drop_cols, axis=1)
    y_test = all_data.loc[valid_mask, 'target'].values

    lgb.fit(X_train, y_train)

    pred_train = lgb.predict(X_train)
    pred_test = lgb.predict(X_test)

    # R2 and RMSE on the held-out month
    fold_r2 = r2_score(y_test, pred_test)
    fold_rmse = np.sqrt(mean_squared_error(y_test, pred_test))
    print('Month {0:d} Test R-2: {1:f}'.format(last_month, fold_r2))
    print('Month {0:d} Test RMSE {1:f}'.format(last_month, fold_rmse))
    

Month 33 Test R-2: 0.081174
Month 33 Test RMSE 5.121307
Month 32 Test R-2: 0.035207
Month 32 Test RMSE 6.726196
Month 31 Test R-2: 0.274872
Month 31 Test RMSE 1.903877
Month 30 Test R-2: 0.277028
Month 30 Test RMSE 1.818607
Month 29 Test R-2: 0.316012
Month 29 Test RMSE 1.925877


In [20]:
# Free the per-fold objects left over from the validation loop.
del dates_train, dates_test
del X_train, X_test
del y_train, y_test
del pred_train, pred_test
gc.collect()

412

## Prepare submission

In [21]:
# Full training set: target vector and feature matrix over every row of `all_data`.
y_train_all = all_data['target'].values
X_train_all = all_data.drop(to_drop_cols, axis=1)

In [25]:
# Summary statistics of the one-month lag of the target.
X_train_all['target_lag_1'].describe()

count    1.091385e+07
mean     3.158634e-01
std      3.155803e+00
min     -2.200000e+01
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.305000e+03
Name: target_lag_1, dtype: float64

In [29]:
# Display the test rows ordered by ID. `sort_values` replaces the deprecated
# `DataFrame.sort(columns=...)`; the result is only displayed, `test` itself is not reordered.
test.sort_values(by='ID')

  if __name__ == '__main__':


Unnamed: 0,ID,shop_id,item_id,date_block_num,target_lag_1,target_item_lag_1,target_shop_lag_1
0,0,5,5037,34,0.0,25.0,1052.0
1,1,5,5320,34,0.0,0.0,0.0
2,2,5,5233,34,1.0,42.0,1052.0
3,3,5,5232,34,0.0,28.0,1052.0
4,4,5,5268,34,0.0,0.0,0.0
5,5,5,5039,34,1.0,29.0,1052.0
6,6,5,5041,34,2.0,62.0,1052.0
7,7,5,5046,34,0.0,12.0,1052.0
8,8,5,5319,34,0.0,26.0,1052.0
9,9,5,5003,34,0.0,95.0,1052.0


In [30]:
# Set the submission ids aside and remove the month column, which is not a model feature.
test_id = test.pop('ID')
del test['date_block_num']

In [35]:
# Preview the feature columns left in `test`.
test.head(n=5)

Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1
0,5,5037,0.0,25.0,1052.0
1,5,5320,0.0,0.0,0.0
2,5,5233,1.0,42.0,1052.0
3,5,5232,0.0,28.0,1052.0
4,5,5268,0.0,0.0,0.0


In [36]:
# Preview the training features for comparison with the `test` columns.
X_train_all.head(n=5)

Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1
0,0,19,0.0,0.0,0.0
1,0,27,0.0,0.0,0.0
2,0,28,0.0,0.0,0.0
3,0,29,0.0,0.0,0.0
4,0,32,0.0,0.0,0.0


In [37]:
# Refit on the full training set and predict the test rows.
lgb.fit(X_train_all, y_train_all)

pred_train_all = lgb.predict(X_train_all)
pred_test = lgb.predict(test)

# In-sample R2 and RMSE of the final fit. Only the score is formatted, so this
# cell does not depend on any variable from the validation loop.
print('Train R-2: {0:f}'.format(r2_score(y_train_all, pred_train_all)))
print('Train RMSE {0:f}'.format(np.sqrt(mean_squared_error(y_train_all, pred_train_all))))

Train R-2: 0.158638
Train RMSE 3.134494


In [38]:
# Display the shape of the prediction array.
pred_test.shape

(214200,)

In [39]:
# Clamp the predictions into [0, 20]: values above 20 are capped at 20 and
# negative values are floored at 0 (not pushed up to the upper bound).
pred_test = np.clip(pred_test, 0, 20)

In [40]:
# Assemble the submission frame, report its shape, write it to disk and preview it.
test_submit = pd.DataFrame({'ID': test_id, 'item_cnt_month': pred_test})
print(test_submit.shape)
test_submit.to_csv('lgbm_all.csv', index=False)
test_submit.head()

(214200, 2)


Unnamed: 0,ID,item_cnt_month
0,0,0.304195
1,1,0.304473
2,2,0.385339
3,3,0.315109
4,4,0.304473
