In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 16, 12
from sklearn import preprocessing
import pandas as pd
pd.set_option('display.max_columns', 500)
import pickle
import seaborn as sns
import datetime
import multiprocessing
import statsmodels.api as sm
sns.set_style("dark")

In [2]:
# Load the raw train/test/store CSVs and assemble one combined frame.
# Train and test rows are indexed by (Store, Date) so they can be stacked and
# later joined with the per-store attributes from store.csv.

df_train = pd.read_csv('./../data/raw/train.csv',
                       parse_dates=['Date'],
                       index_col=['Store', 'Date'])
df_test = pd.read_csv('./../data/raw/test.csv',
                      parse_dates=['Date'],
                      index_col=['Store', 'Date'])
df_stores = pd.read_csv('./../data/raw/store.csv', index_col='Store')

# Stack train and test while keeping the (Store, Date) index.
# pd.concat is the version-independent replacement for DataFrame.append.
df = pd.concat([df_train, df_test], ignore_index=False)
df = df[['Id', 'Sales', 'Customers', 'Open', 'Promo', 'SchoolHoliday', 'StateHoliday', 'DayOfWeek']]

# One-hot encode calendar year and month, read straight from the Date level
# of the index (level 1) instead of unpacking every index tuple in Python.
date_level = df.index.get_level_values(1)
df['Year'] = np.asarray(date_level.year)
df['Month'] = np.asarray(date_level.month)
df = df.join(pd.get_dummies(df['Year'], prefix='Year')).drop('Year', axis=1)
df = df.join(pd.get_dummies(df['Month'], prefix='Month')).drop('Month', axis=1)

# One-hot encode the categorical store attributes, replacing each source
# column with its dummy columns.
for categorical_col in ('StoreType', 'Assortment', 'PromoInterval'):
    store_dummies = pd.get_dummies(df_stores[categorical_col], prefix=categorical_col)
    df_stores = df_stores.join(store_dummies).drop(categorical_col, axis=1)

# Stores with no recorded competition distance are assigned the largest
# observed distance; the log of the distance is kept as an extra feature.
missing_distance = df_stores['CompetitionDistance'].isnull()
df_stores.loc[missing_distance, 'CompetitionDistance'] = df_stores['CompetitionDistance'].max()
df_stores['LogCompetitionDistance'] = np.log(df_stores['CompetitionDistance'])

# Fix the set and order of store-level columns that get attached to each row.
df_stores = df_stores[['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d',
                       'Assortment_a', 'Assortment_b', 'Assortment_c',
                       'CompetitionDistance', 'LogCompetitionDistance',
                       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
                       'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear',
                       'PromoInterval_Feb,May,Aug,Nov', 'PromoInterval_Jan,Apr,Jul,Oct',
                       'PromoInterval_Mar,Jun,Sept,Dec']]

# Attach the store attributes to every (Store, Date) row. A temporary 'Store'
# column (index level 0) serves as the join key and is dropped afterwards.
df['Store'] = np.asarray(df.index.get_level_values(0))
df = df.join(df_stores, on='Store').drop('Store', axis=1)


def transform_competition_open_since_year_diff(i):
    """Collapse a year-difference value into coarse buckets.

    NaN is passed through as NaN. Otherwise the value is floored to the
    largest of the thresholds 10, 5 and 3 that it reaches; anything below 3
    (including negative values) is returned unchanged.
    """
    if np.isnan(i):
        return np.nan
    # Thresholds are checked from largest to smallest, so the first hit wins.
    for bucket_floor in (10, 5, 3):
        if i >= bucket_floor:
            return bucket_floor
    return i
    
# Convert CompetitionOpenSinceYear into a difference from 2015, bucket it with
# transform_competition_open_since_year_diff, and one-hot encode the buckets.
# Plain subtraction propagates NaN, so no explicit NaN guard is needed; with
# get_dummies' defaults a NaN row simply gets no bucket column set.
competition_age = (2015 - df['CompetitionOpenSinceYear']).apply(
    transform_competition_open_since_year_diff)
df = df.join(
    pd.get_dummies(competition_age, prefix='CompetitionOpenSinceYear')
).drop('CompetitionOpenSinceYear', axis=1)

# Normalise the integer 0 to the string '0' so both spellings end up in a
# single dummy column, then one-hot encode StateHoliday.
df.loc[df['StateHoliday'] == 0, 'StateHoliday'] = '0'
df = df.join(pd.get_dummies(df['StateHoliday'], prefix='StateHoliday')).drop('StateHoliday', axis=1)

# Order rows by the second index level (Date). sort_index(level=...) is the
# replacement for DataFrame.sortlevel(), which newer pandas no longer provides.
df = df.sort_index(level=1)

# Persist the full feature frame (train and test rows together).
df.to_pickle('./../data/df_base.pkl')


#-----------

# Build a validation split: for every store, hide the Sales values of its
# n_valids most recent days and keep the hidden values as ground truth.
df = df[pd.notnull(df['Sales'])].copy()  # keep only rows that have a Sales value
n_valids = 48  # number of trailing days held out per store

store_level = df.index.get_level_values(0)
date_level = df.index.get_level_values(1)

df_y_list = []
# Iterate stores in ascending id order so the concatenated output is deterministic.
for store_id in sorted(store_level.unique()):
    in_store = np.asarray(store_level == store_id)
    # Select the store's rows by index label. `.iloc` is positional, so
    # `df.iloc[(store_id,)]` would pick one row by position rather than the
    # rows belonging to this store.
    max_date = date_level[in_store].max()
    # The held-out dates are max_date, max_date - 1 day, ..., max_date - (n_valids - 1) days.
    valid_dates = [max_date - datetime.timedelta(days=i) for i in range(n_valids)]
    idx = in_store & np.asarray(date_level.isin(valid_dates))
    # Save the true Sales first, then blank them out in the working frame.
    df_y_list.append(df.loc[idx, 'Sales'])
    df.loc[idx, 'Sales'] = np.nan

df.to_pickle('./../data/df_valid.pkl')

# A single concat over the list replaces the pairwise reduce(), which re-copied
# the growing result on every step and relied on the Python 2 builtin.
df_y_valid = pd.concat(df_y_list, axis=0)

df_y_valid.to_pickle('./../data/df_y_valid.pkl')

  interactivity=interactivity, compiler=compiler, result=result)
