In [41]:
import os, time
from collections import deque
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn

%matplotlib inline

In [42]:
def load_sales(parse_date = True):
    data_fol = 'data'
    sales = pd.read_csv(os.path.join(data_fol, 'sales_train.csv'))
    if parse_date:
        date_col = pd.to_datetime(sales.date, format='%d.%m.%Y')
        sales.date = date_col
        sales['Month'] = sales.date.dt.to_period('M')
    return sales

def aggregate_monthly(sales_df):
    assert 'Month' in sales_df.columns, 'Month column not found'
    return (sales_df
            .groupby(['Month', 'shop_id', 'item_id'])
            .agg({'item_cnt_day': np.sum, 'item_price': np.mean})
            .rename(columns={'item_cnt_day':'item_cnt_month'})
           )

In [43]:
def load_all():
    sales = load_sales()
    agg_sales = aggregate_monthly(sales)
    items = pd.read_csv('data/items.csv')
    categories = pd.read_csv('data/item_categories.csv')
    test_set = pd.read_csv('data/test.csv')
    return sales, agg_sales, items, categories, test_set

In [44]:
def create_submission_like_df():
    months = sorted(agg_sales.index.levels[0])
    submission_like_data  = {}
    for month in months:    
        month_data = agg_sales.loc[month]
        shops = set(month_data.index.get_level_values(0))
        item_ids = set(month_data.index.get_level_values(1))

        new_idx = pd.Index(product(shops, item_ids), name=month_data.index.name)
        submission_like_data[month] = month_data.reindex(new_idx).fillna({'item_cnt_month':0, 'item_price':-1})
        
    submission_like_df = pd.concat(submission_like_data)
    submission_like_df.index.names = agg_sales.index.names
    
    return submission_like_df

In [45]:
sales, agg_sales, items, categories, test_set = load_all()

In [46]:
agg_sales = agg_sales.drop(6066, level=2)

In [64]:
agg_sales.loc[agg_sales.item_price <= 0.1, 'item_price'] = np.nan
agg_sales['item_price'] = agg_sales.groupby(level=2).item_price.transform(lambda x: x.fillna(x.mean()))

In [65]:
submission_like_df = create_submission_like_df()

In [66]:
months = sorted(agg_sales.index.levels[0])

In [90]:
ext_agg_sales  = agg_sales.copy().drop('item_price', axis=1)
ext_agg_sales['total_shop_cnt_month'] = agg_sales.groupby(level=[0,1]).item_cnt_month.transform('sum')
ext_agg_sales['total_item_cnt_month'] = agg_sales.groupby(level=[0,2]).item_cnt_month.transform('sum')

In [92]:
ext_agg_sales.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_month,total_shop_cnt_month,total_item_cnt_month
Month,shop_id,item_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01,0,32,6.0,5578.0,299.0
2013-01,0,33,3.0,5578.0,61.0
2013-01,0,35,1.0,5578.0,78.0
2013-01,0,43,1.0,5578.0,2.0
2013-01,0,51,2.0,5578.0,13.0


In [93]:
shops = sorted(ext_agg_sales.index.levels[1])

In [97]:
idx_cols = ['Month', 'item_id']
cols_to_lag = ['item_cnt_month', 'total_shop_cnt_month', 'total_item_cnt_month']

In [98]:
shop = shops[10]

In [165]:
shop

10

In [172]:
lag_month_map

{Period('2013-01', 'M'): Period('2013-02', 'M'),
 Period('2013-02', 'M'): Period('2013-03', 'M'),
 Period('2013-03', 'M'): Period('2013-04', 'M'),
 Period('2013-04', 'M'): Period('2013-05', 'M'),
 Period('2013-05', 'M'): Period('2013-06', 'M'),
 Period('2013-06', 'M'): Period('2013-07', 'M'),
 Period('2013-07', 'M'): Period('2013-08', 'M'),
 Period('2013-08', 'M'): Period('2013-09', 'M'),
 Period('2013-09', 'M'): Period('2013-10', 'M'),
 Period('2013-10', 'M'): Period('2013-11', 'M'),
 Period('2013-11', 'M'): Period('2013-12', 'M'),
 Period('2013-12', 'M'): Period('2014-01', 'M'),
 Period('2014-01', 'M'): Period('2014-02', 'M'),
 Period('2014-02', 'M'): Period('2014-03', 'M'),
 Period('2014-03', 'M'): Period('2014-04', 'M'),
 Period('2014-04', 'M'): Period('2014-05', 'M'),
 Period('2014-05', 'M'): Period('2014-06', 'M'),
 Period('2014-06', 'M'): Period('2014-07', 'M'),
 Period('2014-07', 'M'): Period('2014-08', 'M'),
 Period('2014-08', 'M'): Period('2014-09', 'M'),
 Period('2014-09', '

In [173]:
forward_month_map

{Period('2013-02', 'M'): Period('2013-01', 'M'),
 Period('2013-03', 'M'): Period('2013-02', 'M'),
 Period('2013-04', 'M'): Period('2013-03', 'M'),
 Period('2013-05', 'M'): Period('2013-04', 'M'),
 Period('2013-06', 'M'): Period('2013-05', 'M'),
 Period('2013-07', 'M'): Period('2013-06', 'M'),
 Period('2013-08', 'M'): Period('2013-07', 'M'),
 Period('2013-09', 'M'): Period('2013-08', 'M'),
 Period('2013-10', 'M'): Period('2013-09', 'M'),
 Period('2013-11', 'M'): Period('2013-10', 'M'),
 Period('2013-12', 'M'): Period('2013-11', 'M'),
 Period('2014-01', 'M'): Period('2013-12', 'M'),
 Period('2014-02', 'M'): Period('2014-01', 'M'),
 Period('2014-03', 'M'): Period('2014-02', 'M'),
 Period('2014-04', 'M'): Period('2014-03', 'M'),
 Period('2014-05', 'M'): Period('2014-04', 'M'),
 Period('2014-06', 'M'): Period('2014-05', 'M'),
 Period('2014-07', 'M'): Period('2014-06', 'M'),
 Period('2014-08', 'M'): Period('2014-07', 'M'),
 Period('2014-09', 'M'): Period('2014-08', 'M'),
 Period('2014-10', '

In [226]:
lag_month_map = dict(zip(months, months[1:]))
forward_month_map = dict(zip(months[1:], months))

def rename_col(col, s):
    return f'{col}_lag_{s}'

single_shop = ext_agg_sales.loc[pd.IndexSlice[:,shop,:],:].reset_index(level=1, drop=True).reset_index()

shifted = [single_shop.copy().dropna().set_index(idx_cols)]
for shift in [1,2,3,4,5,8,11]:
    single_shop_shift = single_shop.copy()
    month = single_shop.Month
    for s in range(1,shift+1):
        month = month.map(lag_month_map)
    single_shop_shift.Month = month
    single_shop_shift.columns = [rename_col(col, shift) if col in cols_to_lag else col for col in single_shop_shift.columns ]
    shifted.append(single_shop_shift.dropna().set_index(idx_cols))
    
all_data_single_shop = pd.concat(shifted, axis=1).fillna(0)
single_shop_target = single_shop.copy()
single_shop_target.Month = single_shop_target.Month.map(forward_month_map)
all_data_single_shop['target'] = single_shop_target.dropna().set_index(idx_cols).item_cnt_month
all_data_single_shop = all_data_single_shop.loc[months[12:]] # remove first 12 months with incomplete features

In [227]:
all_data_single_shop.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_cnt_month,total_shop_cnt_month,total_item_cnt_month,item_cnt_month_lag_1,total_shop_cnt_month_lag_1,total_item_cnt_month_lag_1,item_cnt_month_lag_2,total_shop_cnt_month_lag_2,total_item_cnt_month_lag_2,item_cnt_month_lag_3,...,item_cnt_month_lag_5,total_shop_cnt_month_lag_5,total_item_cnt_month_lag_5,item_cnt_month_lag_8,total_shop_cnt_month_lag_8,total_item_cnt_month_lag_8,item_cnt_month_lag_11,total_shop_cnt_month_lag_11,total_item_cnt_month_lag_11,target
Month,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-10,22092,2.0,428.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2015-10,22102,2.0,428.0,227.0,0.0,0.0,0.0,1.0,442.0,177.0,2.0,...,1.0,466.0,604.0,0.0,0.0,0.0,0.0,0.0,0.0,
2015-10,22104,0.0,0.0,0.0,0.0,0.0,0.0,1.0,442.0,17.0,0.0,...,1.0,466.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,
2015-10,22163,2.0,428.0,26.0,0.0,0.0,0.0,1.0,442.0,29.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2015-10,22167,1.0,428.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [145]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [146]:
lr = LinearRegression()

In [166]:
all_data_single_shop

Unnamed: 0_level_0,Unnamed: 1_level_0,item_cnt_month_lag_1,total_shop_cnt_month_lag_1,total_item_cnt_month_lag_1,item_cnt_month_lag_2,total_shop_cnt_month_lag_2,total_item_cnt_month_lag_2,item_cnt_month_lag_3,total_shop_cnt_month_lag_3,total_item_cnt_month_lag_3,item_cnt_month_lag_4,...,item_cnt_month_lag_6,total_shop_cnt_month_lag_6,total_item_cnt_month_lag_6,item_cnt_month_lag_9,total_shop_cnt_month_lag_9,total_item_cnt_month_lag_9,item_cnt_month_lag_12,total_shop_cnt_month_lag_12,total_item_cnt_month_lag_12,target
Month,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014-01,27,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1013.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,935.0,7.0,0.0
2014-01,31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,756.0,58.0,0.0,0.0,0.0,0.0
2014-01,32,1.0,1340.0,89.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2014-01,33,2.0,1340.0,42.0,1.0,1016.0,15.0,1.0,1013.0,33.0,0.0,...,1.0,800.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014-01,52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-10,22102,0.0,0.0,0.0,1.0,442.0,177.0,2.0,449.0,216.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2015-10,22104,0.0,0.0,0.0,1.0,442.0,17.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-10,22162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,594.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-10,22163,0.0,0.0,0.0,1.0,442.0,29.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [162]:
out_sample_preds = []
for (train_month_thresh, test_month) in zip(months[12:-1], months[13:]):
    X_train, Y_train = (all_data_single_shop.loc[:train_month_thresh].drop('target', axis=1), all_data_single_shop.loc[:train_month_thresh, 'target'])
    X_test, Y_test = (all_data_single_shop.loc[[test_month]].drop('target', axis=1), all_data_single_shop.loc[[test_month], 'target'])
    fitted_model = lr.fit(X_train, Y_train)
    ytest_pred = fitted_model.predict(X_test)
    out_sample_preds.append(pd.Series(ytest_pred, index=X_test.index))
    print(f'Test month: {test_month}, Test R-squared is: {r2_score(Y_test, ytest_pred)}' )
    

Test month: 2014-02, Test R-squared is: 0.6889468513556045
Test month: 2014-03, Test R-squared is: 0.6018323027881685
Test month: 2014-04, Test R-squared is: 0.8302516094064425
Test month: 2014-05, Test R-squared is: 0.7809400035846179
Test month: 2014-06, Test R-squared is: 0.20203977440105825
Test month: 2014-07, Test R-squared is: 0.7864729705916236
Test month: 2014-08, Test R-squared is: 0.7433564847871064
Test month: 2014-09, Test R-squared is: 0.7717487630039762
Test month: 2014-10, Test R-squared is: 0.7901173080811082
Test month: 2014-11, Test R-squared is: 0.7511181181254152
Test month: 2014-12, Test R-squared is: 0.7549754160104979
Test month: 2015-01, Test R-squared is: 0.879999268114221
Test month: 2015-02, Test R-squared is: 0.0
Test month: 2015-03, Test R-squared is: 0.6806521431915716
Test month: 2015-04, Test R-squared is: 0.8350864209778478
Test month: 2015-05, Test R-squared is: 0.2436403807164783
Test month: 2015-06, Test R-squared is: 0.03226624141037626
Test month:

In [163]:
pd.concat(out_sample_preds)

Month    item_id
2014-02  27         0.075104
         30         0.366477
         31         0.316648
         32         0.409913
         33         0.256523
                      ...   
2015-10  22102      0.950501
         22104      0.139626
         22162      0.031931
         22163      0.110514
         22167      0.048802
Length: 44214, dtype: float64