In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pandas as pd
pd.set_option('display.max_columns', 500)
import pickle
import seaborn as sns
import datetime
import multiprocessing
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, ElasticNetCV
# Notebook-wide plotting configuration: seaborn "dark" style and a default
# matplotlib figure size of 16 x 12 (matplotlib interprets figsize in inches).
sns.set_style("dark")
plt.rcParams['figure.figsize'] = 16, 12

In [2]:
# Load the pickled inputs from ../data (paths are relative to the notebook's
# working directory).
# CAUTION: unpickling can execute arbitrary code; only load files you trust.
# NOTE(review): `df_test` is read from `df_base.pkl` -- the variable name and the
# file name disagree; confirm which data set this actually is.
df_test = pd.read_pickle('./../data/df_base.pkl')
df_valid = pd.read_pickle('./../data/df_valid.pkl')
# NOTE(review): df_y_valid is not used by any of the cells shown below.
df_y_valid = pd.read_pickle('./../data/df_y_valid.pkl')

In [4]:
def extract_past_sales(df, n_prev_days=14, n_prev_weeks=4):
    """Append lagged ``Sales`` columns to a copy of ``df``.

    For every ``(store, date)`` key in ``df`` one column per look-back offset is
    added, named ``PrevDaySale_<offset>``, holding the ``Sales`` value recorded
    for the same store ``<offset>`` days earlier, or NaN when ``df`` has no row
    for that store/date pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Sales`` column and have a two-level index whose keys
        unpack as ``(store_id, date)``; ``date`` must support subtraction of a
        ``datetime.timedelta``. The index levels are expected to be named
        ``Store`` and ``Date`` so that the final index-on-index join aligns.
    n_prev_days : int, default 14
        Use every daily offset from 1 to ``n_prev_days`` inclusive.
    n_prev_weeks : int, default 4
        Additionally use the offsets 7, 14, ..., ``7 * n_prev_weeks``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the lag columns appended
        in ascending offset order.
    """
    df = df.copy()

    # Union of the daily and weekly look-backs; sorted so the resulting column
    # order is deterministic instead of depending on set iteration order.
    daily_offsets = set(range(1, n_prev_days + 1))
    weekly_offsets = set(7 * week for week in range(1, n_prev_weeks + 1))
    date_offsets = sorted(daily_offsets | weekly_offsets)
    offset_columns = ['PrevDaySale_%i' % offset for offset in date_offsets]

    # {(store_id, date): sales} lookup table built from the index of `df`.
    store_date_sales_cache = df['Sales'].to_dict()

    if not store_date_sales_cache:
        # Nothing to look up: still return the expected columns rather than
        # failing while building an empty lookup frame.
        for column in offset_columns:
            df[column] = np.nan
        return df

    prev_sales = []
    for store_id, date in store_date_sales_cache:
        row = [
            store_date_sales_cache.get(
                (store_id, date - datetime.timedelta(days=offset)), np.nan)
            for offset in date_offsets
        ]
        prev_sales.append([store_id, date] + row)

    # Column labels are passed as a real list: concatenating a list with a
    # lazy `map` object raises TypeError on Python 3.
    df_past = pd.DataFrame(prev_sales, columns=['Store', 'Date'] + offset_columns)
    df_past = df_past.set_index(['Store', 'Date'])

    return df.join(df_past, how='outer')

In [5]:
# Look-back configuration: every daily offset from 1 to 31 days, plus 7-day
# multiples up to 12 weeks back.
n_prev_days=31
n_prev_weeks=12
# NOTE(review): each frame is processed on its own, so a row's lagged values can
# only come from other rows of the same frame.
# NOTE(review): both names are rebound to the augmented frames, so this cell is
# not idempotent -- re-running it without re-running the load cell feeds frames
# that already carry PrevDaySale_* columns back into the function.
df_test = extract_past_sales(df_test, n_prev_days=n_prev_days, n_prev_weeks=n_prev_weeks)
df_valid = extract_past_sales(df_valid, n_prev_days=n_prev_days, n_prev_weeks=n_prev_weeks)

In [6]:
# Persist the augmented frames next to the inputs. The look-back settings are
# encoded in the file name (e.g. df_test_past_d31_w12.pkl for 31 days / 12 weeks)
# so outputs for different configurations do not overwrite each other.
df_test.to_pickle('./../data/df_test_past_d%i_w%i.pkl' % (n_prev_days, n_prev_weeks))
df_valid.to_pickle('./../data/df_valid_past_d%i_w%i.pkl' % (n_prev_days, n_prev_weeks))