# Simple Linear Model

I create a simple linear model, using only some aggregated and lagged features. While it was originally meant for ensembling with the final model, it ended up not being helpful there, so instead I use this model's predictions as a feature for the more powerful model (below).

In [17]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import r2_score

## Feature Generation

Here I create the features I will use, along with their lagged values.

In [12]:
# Monthly aggregates without the price column, plus two totals broadcast back
# onto every row: item_cnt_month summed over index levels (0, 1) and over
# index levels (0, 2).
ext_agg_sales = agg_sales.drop('item_price', axis=1).assign(
    total_shop_cnt_month=agg_sales.groupby(level=[0, 1])['item_cnt_month'].transform('sum'),
    total_item_cnt_month=agg_sales.groupby(level=[0, 2])['item_cnt_month'].transform('sum'),
)

In [13]:
# Preview the first rows to check the two new total columns.
ext_agg_sales.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_month,total_shop_cnt_month,total_item_cnt_month
Month,shop_id,item_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01,0,32,6.0,5578.0,299.0
2013-01,0,33,3.0,5578.0,61.0
2013-01,0,35,1.0,5578.0,78.0
2013-01,0,43,1.0,5578.0,2.0
2013-01,0,51,2.0,5578.0,13.0


In [14]:
# Shop ids taken from the second index level, in ascending order.
# NOTE(review): `index.levels` can still list ids that no longer have rows — confirm that is acceptable.
shops = sorted(ext_agg_sales.index.levels[1])
# Index of each per-shop frame once the shop level has been dropped.
idx_cols = ['Month', 'item_id']
# Columns for which lagged copies are generated as features.
cols_to_lag = ['item_cnt_month', 'total_shop_cnt_month', 'total_item_cnt_month']

In [15]:
# month -> the month before it (months[0] has no entry).
to_previous_month_map = dict(zip(months[1:], months))
# month -> the month after it; the last month maps to 2015-11.
# NOTE(review): the final period is hard-coded — presumably the month right after months[-1]; confirm.
obj_month_map = dict(zip(months, months[1:]+[pd.Period('2015-11')]))

I actually fit one model per shop and then put the per-shop predictions together.

In [16]:
def get_single_shop_data(shopnum, lags=(1, 2, 3, 4, 5, 8, 11), n_burn_in_months=12):
    """Build the feature/target table for one shop.

    Rows are indexed by `idx_cols` (Month, item_id). The columns are the
    shop's current-month values, one `<col>_lag_<k>` copy of every column in
    `cols_to_lag` for each `k` in `lags`, and `target`: the `item_cnt_month`
    recorded for the same item in the following month (NaN when there is none).

    Parameters
    ----------
    shopnum
        Value of the second index level of `ext_agg_sales` to select.
    lags : iterable of int
        Month offsets for the lagged features.
    n_burn_in_months : int
        Number of leading entries of `months` to discard, because their
        lagged features are incomplete.

    Returns
    -------
    pandas.DataFrame
        Sorted by index. Missing feature values are filled with 0.
    """
    # A value recorded in months[i + 1] is relabelled months[i], so it lines up
    # as the target of the previous month's feature row.
    next_to_current_month = dict(zip(months[1:], months))

    single_shop = (
        ext_agg_sales.loc[pd.IndexSlice[:, shopnum, :], :]
        .reset_index(level=1, drop=True)
        .reset_index()
    )

    feature_frames = [single_shop.dropna().set_index(idx_cols)]
    for lag in lags:
        # A value observed in months[i] becomes a feature of months[i + lag].
        # Months with no destination map to a missing value and are dropped.
        month_plus_lag = dict(zip(months, months[lag:]))
        lagged = (
            single_shop
            .assign(Month=single_shop.Month.map(month_plus_lag))
            .rename(columns={col: f'{col}_lag_{lag}' for col in cols_to_lag})
        )
        feature_frames.append(lagged.dropna().set_index(idx_cols))

    # Sort explicitly: callers slice this frame by month label, which needs a
    # sorted index, and concat does not guarantee one for unaligned inputs.
    all_data_single_shop = pd.concat(feature_frames, axis=1).fillna(0).sort_index()

    next_month_sales = single_shop.assign(Month=single_shop.Month.map(next_to_current_month))
    all_data_single_shop['target'] = next_month_sales.dropna().set_index(idx_cols).item_cnt_month

    # Use a mask rather than `.loc[list_of_months]` so a shop with no rows in
    # some month does not raise KeyError.
    month_level = all_data_single_shop.index.get_level_values(0)
    return all_data_single_shop[month_level.isin(months[n_burn_in_months:])]

## Prediction

In [17]:
def get_single_shop_predictions(mod, all_data_single_shop):
    """Month-by-month out-of-sample predictions for one shop.

    For each consecutive pair (train_month_thresh, test_month) taken from
    `months[12:]`, `mod` is refit on every row whose month is at or before
    `train_month_thresh` (missing targets count as 0) and then predicts the
    rows of `test_month`. Pairs with no training rows or no test rows are
    skipped.

    Parameters
    ----------
    mod
        Estimator with `fit(X, y)` returning the fitted estimator and
        `predict(X)`. It is refit in place on every iteration.
    all_data_single_shop : pandas.DataFrame
        Feature table with a `target` column; the first index level is the month.

    Returns
    -------
    pandas.Series or None
        Predictions indexed like the predicted rows, or None when no month
        could be predicted.
    """
    features = all_data_single_shop.drop('target', axis=1)
    target = all_data_single_shop['target'].fillna(0)
    month_level = all_data_single_shop.index.get_level_values(0)

    out_sample_preds = []
    for train_month_thresh, test_month in zip(months[12:-1], months[13:]):
        # Select with masks, not label lookups: `.loc[[test_month]]` raises
        # KeyError for an absent month, which would bypass the skip below.
        train_mask = month_level <= train_month_thresh
        test_mask = month_level == test_month
        if not test_mask.any() or not train_mask.any():
            continue
        X_test = features[test_mask]
        fitted_model = mod.fit(features[train_mask], target[train_mask])
        out_sample_preds.append(pd.Series(fitted_model.predict(X_test), index=X_test.index))

    if out_sample_preds:
        return pd.concat(out_sample_preds)
    return None


In [18]:
%%time
# Fit and predict shop by shop with one shared ElasticNet instance.
# Shops for which no predictions come back (None) are left out of `shops_pred`.
# Alternative tried: mod = LinearRegression()
mod = ElasticNet(alpha=2, l1_ratio=0.25, max_iter=100000)
shops_pred = {}  # shop id -> Series of out-of-sample predictions
for shop in shops:
    start = time.time()  # only read by the commented-out timing print below
    single_shop_data = get_single_shop_data(shop)
    preds =  get_single_shop_predictions(mod, single_shop_data)
    if preds is not None:
        shops_pred[shop] = preds
    # print(time.time()-start)
    

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Wall time: 5min 27s


In [19]:
# Stack the per-shop Series under a new outer 'shop_id' level, then swap the
# first two levels so the index reads (Month, shop_id, item_id).
# Despite the `_df` suffix, the result is a Series named 'Pred1'.
shops_pred_df = pd.concat(shops_pred, names=['shop_id']).swaplevel(0,1).rename('Pred1')

In [20]:
# Preview the combined per-shop predictions.
shops_pred_df.head()

Month    shop_id  item_id
2014-02  2        30         0.645554
                  31         0.049581
                  32         0.283372
                  33         0.754955
                  97         0.137998
Name: Pred1, dtype: float64

## Conform to submission like df

For consistent validation, I conform the predictions above to the submission-like DataFrame created earlier, since that DataFrame mimics the test set more closely.

In [21]:
# Relabel each submission-like row with the previous month (the month whose
# data produced its prediction), keep only months at or after the first month
# in the predictions' index levels, and index by (Month, shop_id, item_id).
temp_sub = (
    submission_like_df.item_cnt_month
    .reset_index()
    .assign(Month=lambda frame: frame.Month.map(to_previous_month_map))
    .loc[lambda frame: frame.Month >= min(shops_pred_df.index.levels[0])]
    .set_index(['Month', 'shop_id', 'item_id'])
)

In [22]:
# Conform the predictions to the submission-like index; keys without a prediction get 0.
model1_oos_predictions = shops_pred_df.reindex(temp_sub.index).fillna(0)

In [23]:
# Preview the conformed predictions (still labelled by data month).
model1_oos_predictions.head()

Month    shop_id  item_id
2014-02  2        27         0.000000
                  28         0.000000
                  29         0.000000
                  30         0.645554
                  31         0.049581
Name: Pred1, dtype: float64

Include final prediction

In [24]:
# Predictions made from the last data month, conformed to the test set's
# (shop_id, item_id) pairs; pairs without a prediction get 0.
final_pred = shops_pred_df.loc[months[-1]].reindex(test_set.set_index(['shop_id','item_id']).index).fillna(0)
# Put the month back as an outer 'Month' index level so the index has the same
# three levels as the other predictions.
final_pred = pd.concat([final_pred], keys=[months[-1]], names=['Month'])

Realign the months so that they represent the prediction month rather than the data month

In [25]:
# Append the final-month predictions, then relabel every row from its data
# month to the month being predicted. This cell rebinds
# `model1_oos_predictions`, so running it twice would append and shift twice.
model1_oos_predictions = (
    pd.concat([model1_oos_predictions, final_pred])
    .reset_index()
    .assign(Month=lambda frame: frame.Month.map(obj_month_map))
    .set_index(['Month', 'shop_id', 'item_id'])
    .sort_index()
)

In [26]:
# Preview the predictions, now labelled by prediction month.
model1_oos_predictions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Pred1
Month,shop_id,item_id,Unnamed: 3_level_1
2014-03,2,27,0.0
2014-03,2,28,0.0
2014-03,2,29,0.0
2014-03,2,30,0.645554
2014-03,2,31,0.049581


In [None]:
# Save the realigned predictions to disk as a pickle for later use.
model1_oos_predictions.to_pickle('EN_model_predictions.p')

## Errors

In [28]:
from sklearn.metrics import mean_squared_error

In [29]:
def get_error(full_predictions_series, full_data):
    """Per-month RMSE between predictions and actual values.

    Both arguments carry the month on their first index level. Every month in
    the predictions' first-level `levels` is scored except the last one.
    Within a month both sides are sorted by index and handed to
    `mean_squared_error` as-is; no index alignment is done here, so they are
    expected to cover the same keys.

    Returns a Series of RMSE values keyed by month.
    """
    def _month_rmse(month):
        predicted = full_predictions_series.loc[month].sort_index()
        actual = full_data.loc[month].sort_index()
        return np.sqrt(mean_squared_error(actual, predicted))

    scored_months = sorted(full_predictions_series.index.levels[0])[:-1]
    return pd.Series({month: _month_rmse(month) for month in scored_months})

In [30]:
# Monthly RMSE of the raw predictions against the submission-like actuals.
get_error(model1_oos_predictions.Pred1, submission_like_df.item_cnt_month)

2014-03    2.024727
2014-04    2.511645
2014-05    1.656804
2014-06    1.879623
2014-07    1.228892
2014-08    1.250089
2014-09    3.013957
2014-10    2.936116
2014-11    3.222223
2014-12    3.024958
2015-01    3.083645
2015-02    2.854308
2015-03    2.153976
2015-04    4.329503
2015-05    3.250949
2015-06    1.641582
2015-07    1.072869
2015-08    1.262157
2015-09    6.448914
2015-10    4.554570
Freq: M, dtype: float64

In [31]:
# Same evaluation with the predictions clipped to the range [0, 20].
get_error(model1_oos_predictions.Pred1.clip(0,20), submission_like_df.item_cnt_month)

2014-03    3.035892
2014-04    3.119146
2014-05    2.407830
2014-06    2.600050
2014-07    2.266811
2014-08    2.427879
2014-09    3.692124
2014-10    3.698570
2014-11    4.064453
2014-12    4.876658
2015-01    4.155309
2015-02    1.642374
2015-03    1.597956
2015-04    4.484014
2015-05    3.677069
2015-06    2.080702
2015-07    1.900616
2015-08    1.992029
2015-09    6.712025
2015-10    5.217821
Freq: M, dtype: float64

In [103]:
# List the names currently defined in the interactive namespace
# (presumably to decide what to delete in the cleanup cell).
%who

ElasticNet	 LinearRegression	 agg_sales	 aggregate_monthly	 cat_annotated_agg_sales	 categories	 cols_to_lag	 create_submission_like_df	 deque	 
ext_agg_sales	 final_pred	 get_error	 get_single_shop_data	 get_single_shop_predictions	 idx_cols	 items	 load_all	 load_sales	 
mean_squared_error	 mod	 model1_oos_predictions	 months	 np	 obj_month_map	 os	 pd	 plt	 
preds	 product	 r2_score	 sales	 seaborn	 shop	 shops	 shops_pred	 shops_pred_df	 
single_shop_data	 start	 submission_like_df	 temp_sub	 test_set	 time	 to_previous_month_map	 


In [None]:
# `gc` is not imported in any visible cell and is absent from the `%who`
# listing, so import it here to keep this cell runnable on its own.
import gc

# Drop the large intermediates that are no longer needed and reclaim their memory.
del ext_agg_sales, shops_pred, temp_sub, final_pred, preds, shops_pred_df, single_shop_data
gc.collect()