In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pandas as pd
pd.set_option('display.max_columns', 500)
import pickle
import seaborn as sns
import datetime
import multiprocessing
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.cross_validation import KFold
import xgboost as xgb
sns.set_style("dark")
plt.rcParams['figure.figsize'] = 16, 12

In [2]:
# Load the pickled frames used by the rest of the notebook (paths are relative
# to the working directory). NOTE: unpickling can execute arbitrary code, so
# only load files from a trusted source.
df_test = pd.read_pickle('./../data/df_test_past_d31_w12.pkl')
df_valid = pd.read_pickle('./../data/df_valid_past_d31_w12.pkl')
df_y_valid = pd.read_pickle('./../data/df_y_valid.pkl')

# Lag configuration: each of the previous 31 days plus weekly lags up to 12 weeks.
n_prev_days=31
n_prev_weeks=12
# Distinct day offsets: 1..31 united with 7, 14, ..., 84. The list is built from
# a set, so its order must not be relied upon.
date_offsets = list(set(range(1, n_prev_days + 1)).union(map(lambda x: x*7, range(1, n_prev_weeks + 1))))
# Maps each offset to the name of its lag column, e.g. 1 -> 'PrevDaySale_1'.
date_offsets_cols = dict(map(lambda i: (i, 'PrevDaySale_%i' % i), date_offsets))

def get_prev_dates(base_date, offsets):
    """Return a dict mapping each day offset to the date that many days before base_date.

    base_date -- any object supporting subtraction of a datetime.timedelta.
    offsets   -- iterable of day counts; repeated offsets collapse to one key.
    """
    return {
        days_back: base_date - datetime.timedelta(days=days_back)
        for days_back in offsets
    }

def update_row(df, store_id, date, offsets, date_offsets_cols, sales_cache=None):
    """Fill the lag columns of one (store, date) row from a sales lookup.

    For every offset, the sale recorded for (store_id, date - offset days) is
    written into the column named by date_offsets_cols[offset]; offsets with
    no cached sale are written as NaN.

    df                -- frame whose index levels 0 and 1 are compared with
                         store_id and date; it is modified in place.
    store_id, date    -- identify the row(s) to update.
    offsets           -- iterable of day offsets to fill.
    date_offsets_cols -- mapping offset -> lag column name.
    sales_cache       -- optional mapping (store_id, date) -> sales. When
                         omitted, the module-level name
                         `store_date_sales_cache` is used, as the original
                         code did; that name is not defined anywhere in the
                         visible notebook, so pass the cache explicitly.

    Returns the same (mutated) df.
    """
    if sales_cache is None:
        # Legacy fallback; raises NameError when no such global exists.
        sales_cache = store_date_sales_cache

    # The target row does not depend on the offset, so build the mask once
    # instead of rescanning both index levels for every lag column.
    row_mask = ((df.index.get_level_values(0) == store_id) &
                (df.index.get_level_values(1) == date))

    for offset, prev_date in get_prev_dates(date, offsets).items():
        key = (store_id, prev_date)
        prev_sale = sales_cache[key] if key in sales_cache else np.nan
        df.loc[row_mask, date_offsets_cols[offset]] = prev_sale
    return df

def filter_null_past(df):
    """Drop rows that have a known Sales value but incomplete lag history.

    A row is removed when at least one 'PrevDaySale_*' column is null AND its
    'Sales' value is not null. Rows whose 'Sales' is null are always kept,
    whatever their lag columns contain.

    df -- frame with a 'Sales' column and string column labels.

    Returns a new, independent DataFrame; the input is left untouched.
    """
    # A list comprehension (rather than indexing the Index with `map(...)`)
    # behaves the same whether `map` returns a list or a lazy iterator.
    prev_day_sales_cols = [c for c in df.columns if c.startswith('PrevDaySale_')]
    missing_history = df[prev_day_sales_cols].isnull().any(axis=1)
    has_target = pd.notnull(df['Sales'])
    # Copy only the surviving rows instead of the whole frame up front.
    return df[~(missing_history & has_target)].copy()

# Drop rows with a known Sales value but incomplete lag history from both
# frames; the names are rebound to the filtered results.
df_valid = filter_null_past(df_valid)
df_test = filter_null_past(df_test)

In [3]:
def get_regressors(df):
    """Return the list of feature column names to train and predict on.

    The list holds, in df's column order, every column of df that has no
    nulls in the module-level `df_valid`, followed by every 'PrevDaySale_*'
    column of df not already selected. 'Customers' is excluded.

    NOTE(review): the null check reads the global `df_valid`, not the `df`
    argument. This may be deliberate (it keeps a column such as 'Open' that
    is complete in df_valid even if df has gaps) -- confirm before changing.
    A column of df that is missing from df_valid raises KeyError.
    """
    regressors = [c for c in df.columns if pd.isnull(df_valid[c]).sum() == 0]

    # Lag columns are always wanted, but must not be listed twice when they
    # already passed the completeness check above.
    selected = set(regressors)
    regressors.extend(
        c for c in df.columns
        if c.startswith('PrevDaySale_') and c not in selected
    )

    # Guarded so a frame without a complete 'Customers' column does not raise.
    if 'Customers' in regressors:
        regressors.remove('Customers')
    return regressors

def predict_row(row, models, regressors):
    """Predict sales for one row as the mean prediction of all models.

    row        -- label-indexed record (e.g. a pandas Series) with an 'Open'
                  entry and every name listed in `regressors`.
    models     -- iterable of objects exposing predict(features).
    regressors -- list of feature labels selected from the row.

    Returns 0.0 when the store is closed (Open == 0). A missing 'Open' is
    treated as open (1.0); the caller's row is not modified in that case
    because the value is set on a copy.
    """
    if pd.isnull(row['Open']):
        row = row.copy()
        row.loc['Open'] = 1.0
    if int(row['Open']) == 0:
        return 0.0

    features = row[regressors]
    # Build a real list: wrapping a lazy `map` object in np.array yields a
    # 0-d object array that np.mean cannot average.
    predictions = [model.predict(features) for model in models]
    return np.mean(np.array(predictions))

def predict_df(df, fitted_models, verbose=True):
    """Predict every row whose 'Sales' is null, feeding predictions forward.

    Rows are processed store by store and in ascending date order. Each
    prediction is written into the lag columns of that store's later rows
    (date + offset, for every offset in the module-level `date_offsets_cols`)
    so that subsequent predictions can use it.

    df            -- frame indexed by (store_id, date) with a 'Sales' column;
                     a private copy is modified, the caller's frame is not.
    fitted_models -- models passed through to predict_row.
    verbose       -- when true, print each store id and the seconds spent on it.

    Reads the module-level names `regressors` and `date_offsets_cols`.

    Returns a list of (store_id, date, prediction) tuples.
    """
    df = df.copy()
    null_sales_series = pd.isnull(df['Sales'])
    store_level = null_sales_series.index.get_level_values(0)

    y_pred = []
    for store_id in store_level.unique():
        if verbose:
            print(store_id)
        start_time = datetime.datetime.now()
        dates = sorted(null_sales_series[
                (store_level == store_id) &
                (null_sales_series)
            ].index.get_level_values(1))
        for date in dates:
            row = df.loc[(store_id, date)]
            pred = predict_row(row, fitted_models, regressors)
            y_pred.append((store_id, date, pred))

            for offset, lag_col in date_offsets_cols.items():
                future_date = date + datetime.timedelta(days=offset)
                if (store_id, future_date) not in df.index:
                    # Skip only this offset. Stopping the loop here would
                    # drop every remaining offset, and the dict's key order
                    # is not guaranteed to be ascending.
                    continue
                df.loc[(store_id, future_date), lag_col] = pred
        if verbose:
            print((datetime.datetime.now() - start_time).total_seconds())
    return y_pred
    
def validate(y_pred, df_y_valid):
    """Score predictions against the held-out truth with RMSPE.

    y_pred     -- iterable of (store_id, date, prediction) tuples.
    df_y_valid -- object whose to_dict() maps (store_id, date) to the true
                  value; a key missing from it raises KeyError.
    """
    truth_by_key = df_y_valid.to_dict()
    actual = np.array([truth_by_key[(store_id, date)] for store_id, date, _ in y_pred])
    predicted = np.array([value for _, _, value in y_pred])
    return RMSPE(actual, predicted)
#validate(y_pred, df_y_valid)


def test_merge(df, y_pred):
    dict_id = df.loc[pd.notnull(df['Id']), 'Id'].to_dict()
    dict_open = df.loc[pd.notnull(df['Id']), 'Open'].to_dict()
    y_pred_dict = {}
    for store_id, date, pred in y_pred:
        y_pred_dict[store_id, date] = pred
    res = {}
    for store_id, date in dict_id.keys():
        res[int(dict_id[store_id, date])] = y_pred_dict[store_id, date] * dict_open[store_id, date]
    return res


def write_submit(d, fname='submit.csv'):
    """Write an Id -> Sales mapping as a two-column CSV submission file.

    d     -- mapping of integer Id to numeric prediction; NaN predictions are
             written as 0.
    fname -- output path; an existing file is overwritten.
    """
    with open(fname, 'w') as out:
        out.write('"Id","Sales"\n')
        for row_id, sales in d.items():
            if np.isnan(sales):
                sales = 0.0
            out.write('%i,%0.8f\n' % (row_id, sales))

In [4]:
def RMSPE(t, y):
    """Root mean squared percentage error over entries with a positive target.

    t -- array of true values; entries with t <= 0 are ignored.
    y -- array of predictions, aligned with t.
    """
    positive = t > 0
    truth = t[positive]
    guess = y[positive]
    relative_error = (truth - guess) / truth
    return np.sqrt((relative_error ** 2).sum() / float(guess.shape[0]))

def xgb_RMSPE(y_predicted, xgbmat_train):
    """Evaluation-metric adapter: return ('RMSPE', score) for a prediction array.

    The true labels are read from xgbmat_train.get_label().
    """
    labels = xgbmat_train.get_label()
    score = RMSPE(labels, y_predicted)
    return 'RMSPE', score

def RMSPE_sq(t, y):
    """Mean squared percentage error (RMSPE without the square root).

    t -- array of true values; entries with t <= 0 are ignored.
    y -- array of predictions, aligned with t.
    """
    positive = t > 0
    truth = t[positive]
    guess = y[positive]
    relative_error = (truth - guess) / truth
    return (relative_error ** 2).sum() / float(guess.shape[0])

def xgb_RMSPE_sq(y_predicted, xgbmat_train):
    """Evaluation-metric adapter: return ('RMSPE_sq', score) for a prediction array.

    The true labels are read from xgbmat_train.get_label().
    """
    labels = xgbmat_train.get_label()
    score = RMSPE_sq(labels, y_predicted)
    return 'RMSPE_sq', score

def d_RMSPE_sq(t, y):
    """Per-sample gradient of the squared percentage error w.r.t. predictions.

    With n = len(t), the loss is (1/n) * sum(((t - y) / t) ** 2), so
        d loss / d y_i = (2 / (n * t_i)) * (y_i / t_i - 1).
    (The previous implementation divided by t_i once too often, which made
    the gradient inconsistent with the Hessian 2 / (n * t_i ** 2).)

    t -- numpy array of true values (any numeric dtype).
    y -- array of predictions, aligned with t.

    Returns a float array; entries whose gradient is not finite (t == 0,
    including the 0/0 case) are set to 0.
    """
    t = t.astype(float)
    # Division by zero is expected for t == 0 and handled just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        x = (2 / (t.shape[0] * t)) * (y / t - 1)
    x[~np.isfinite(x)] = 0
    return x

def dd_RMSPE_sq(t, y):
    """Per-sample second derivative of the squared percentage error.

    With n = len(t), the loss is (1/n) * sum(((t - y) / t) ** 2), so
        d^2 loss / d y_i^2 = 2 / (n * t_i ** 2),
    which does not depend on the prediction.

    t -- numpy array of true values (any numeric dtype).
    y -- predictions; unused, kept so the signature matches d_RMSPE_sq.

    Returns a float array; entries with t == 0 are set to 0.
    """
    # Builtin float: the `np.float` alias no longer exists in current NumPy.
    t = t.astype(float)
    # Division by zero is expected for t == 0 and handled just below.
    with np.errstate(divide='ignore'):
        x = 2 / (t.shape[0] * (t ** 2))
    x[np.isinf(x)] = 0
    return x

def xgb_obj_RMSPE_sq(y_predicted, xgbmat_train):
    """Custom-objective adapter: return (gradient, hessian) of the squared
    percentage error for the given predictions.

    The true labels are read from xgbmat_train.get_label().
    """
    labels = xgbmat_train.get_label()
    gradient = d_RMSPE_sq(labels, y_predicted)
    hessian = dd_RMSPE_sq(labels, y_predicted)
    return gradient, hessian

In [5]:
def train(df):
    """Fit one XGBRegressor per fold of a 5-fold split and return them all.

    Only rows with a non-null 'Sales' are used. Features are the columns named
    in the module-level `regressors` list. Each model trains on one side of the
    fold and uses the other side for RMSPE-based early stopping (50 rounds).

    NOTE: a later cell of this notebook defines another function that is also
    called `train`; whichever cell ran last is the one in effect.

    Returns a list of fitted models, one per fold.
    """
    has_sales = pd.notnull(df['Sales'])
    # .loc / .values are drop-in equivalents of the removed .ix / .as_matrix().
    X_train = df.loc[has_sales, regressors].values
    y_train = df.loc[has_sales, 'Sales'].values
    fitted_models = []
    # KFold(n, n_folds=...) is the legacy sklearn.cross_validation signature
    # imported at the top of this notebook.
    for i_first, i_second in KFold(y_train.shape[0], n_folds=5):
        model = xgb.XGBRegressor(
            max_depth=12, learning_rate=0.1, n_estimators=1000,
            silent=False, objective="reg:linear",
            nthread=-1, gamma=0,  subsample=1, colsample_bytree=1)
        model = model.fit(X_train[i_first, :], y_train[i_first],
                          # custom metric instead of the built-in 'rmse'
                          eval_metric=xgb_RMSPE,
                          eval_set=[(X_train[i_second, :], y_train[i_second])],
                          early_stopping_rounds=50, 
                          verbose=True)
        fitted_models.append(model)
    return fitted_models

In [6]:
def train(df):
    """Fit one xgboost Booster per fold of a 5-fold split and return them all.

    Only rows with a non-null 'Sales' are used. Features are the columns named
    in the module-level `regressors` list. Each booster trains on one side of
    the fold and is evaluated on the other with early stopping (50 rounds).

    NOTE: this redefines the `train` function from the previous cell (the
    XGBRegressor-based one); whichever cell ran last is the one in effect.
    NOTE(review): predict_row calls model.predict(row[regressors]) with a
    pandas object, whereas a Booster's predict expects an xgb.DMatrix --
    confirm these models are wrapped appropriately before predicting.

    Returns a list of Booster objects, one per fold.
    """
    has_sales = pd.notnull(df['Sales'])
    # .loc / .values are drop-in equivalents of the removed .ix / .as_matrix().
    X_train = df.loc[has_sales, regressors].values
    y_train = df.loc[has_sales, 'Sales'].values
    fitted_models = []
    
    param = {
        'max_depth': 9,
        'eta': 0.3,
        'gamma': 0,  # nodes penalty
        'min_child_weight': 1,
        'max_delta_step': 0,
        'subsample': 1,
        'colsample_bytree': 1,
        'lambda': 1, # L2
        'alpha': 0,  # L1
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        #'nthread': 20,
        'silent': 0
    }
    
    # Upper bound on boosting rounds; early stopping normally ends training sooner.
    num_round = 10000
    
    # KFold(n, n_folds=...) is the legacy sklearn.cross_validation signature
    # imported at the top of this notebook.
    for i_first, i_second in KFold(y_train.shape[0], n_folds=5):
        dtrain = xgb.DMatrix(X_train[i_first, :], label=y_train[i_first])
        deval = xgb.DMatrix(X_train[i_second, :], label=y_train[i_second])
        watchlist  = [(deval,'eval')]
        bst = xgb.train(param, dtrain, num_round, watchlist,
                        feval=xgb_RMSPE,
                        #obj=xgb_obj_RMSPE_sq,
                        early_stopping_rounds=50,
                        verbose_eval=True)
        fitted_models.append(bst)
    return fitted_models

In [7]:
# Preview the feature list derived from df_test (displayed only, not stored).
get_regressors(df_test)

['Open',
 'Promo',
 'SchoolHoliday',
 'DayOfWeek',
 'Year_2013',
 'Year_2014',
 'Year_2015',
 'Month_1',
 'Month_2',
 'Month_3',
 'Month_4',
 'Month_5',
 'Month_6',
 'Month_7',
 'Month_8',
 'Month_9',
 'Month_10',
 'Month_11',
 'Month_12',
 'StoreType_a',
 'StoreType_b',
 'StoreType_c',
 'StoreType_d',
 'Assortment_a',
 'Assortment_b',
 'Assortment_c',
 'CompetitionDistance',
 'LogCompetitionDistance',
 'Promo2',
 'PromoInterval_Feb,May,Aug,Nov',
 'PromoInterval_Jan,Apr,Jul,Oct',
 'PromoInterval_Mar,Jun,Sept,Dec',
 'CompetitionOpenSinceYear_0.0',
 'CompetitionOpenSinceYear_1.0',
 'CompetitionOpenSinceYear_2.0',
 'CompetitionOpenSinceYear_3.0',
 'CompetitionOpenSinceYear_5.0',
 'CompetitionOpenSinceYear_10.0',
 'StateHoliday_0',
 'StateHoliday_a',
 'StateHoliday_b',
 'StateHoliday_c',
 'PrevDaySale_77',
 'PrevDaySale_70',
 'PrevDaySale_49',
 'PrevDaySale_56',
 'PrevDaySale_84',
 'PrevDaySale_63',
 'PrevDaySale_1',
 'PrevDaySale_2',
 'PrevDaySale_3',
 'PrevDaySale_4',
 'PrevDaySale_5',
 

In [5]:
# `regressors` must exist at module level: train() and predict_df() read it as a global.
regressors = get_regressors(df_test)
fitted_models = train(df_test)

Will train until validation_0 error hasn't decreased in 50 rounds.
[0]	validation_0-RMSPE:0.898185
[1]	validation_0-RMSPE:0.806985
[2]	validation_0-RMSPE:0.725192
[3]	validation_0-RMSPE:0.651980
[4]	validation_0-RMSPE:0.586428
[5]	validation_0-RMSPE:0.527885
[6]	validation_0-RMSPE:0.475616
[7]	validation_0-RMSPE:0.429088
[8]	validation_0-RMSPE:0.387714
[9]	validation_0-RMSPE:0.350999
[10]	validation_0-RMSPE:0.318560
[11]	validation_0-RMSPE:0.289934
[12]	validation_0-RMSPE:0.264857
[13]	validation_0-RMSPE:0.243003
[14]	validation_0-RMSPE:0.224063
[15]	validation_0-RMSPE:0.207778
[16]	validation_0-RMSPE:0.193894
[17]	validation_0-RMSPE:0.182168
[18]	validation_0-RMSPE:0.172187
[19]	validation_0-RMSPE:0.163960
[20]	validation_0-RMSPE:0.157168
[21]	validation_0-RMSPE:0.151711
[22]	validation_0-RMSPE:0.147123
[23]	validation_0-RMSPE:0.143536
[24]	validation_0-RMSPE:0.140656
[25]	validation_0-RMSPE:0.138257
[26]	validation_0-RMSPE:0.136542
[27]	validation_0-RMSPE:0.135088
[28]	validation_0-R

In [91]:
# Predict every row of df_test whose Sales is null; yields (store_id, date, prediction)
# tuples and prints each store id with its elapsed seconds (verbose defaults to True).
y_pred = predict_df(df_test, fitted_models)

1
17.161563
2
0.04426
3
4.039274
4
0.007215
5
0.008745
6
0.008295
7
4.142776
8
4.325389
9
4.211281
10
4.426095
11
4.199136
12
4.141969
13
4.152759
14
4.232591
15
4.028509
16
4.079601
17
0.007332
18
0.008177
19
4.010351
20
4.164168
21
4.377827
22
4.061143
23
4.085827
24
4.317783
25
4.120829
26
0.007452
27
4.009501
28
0.0071
29
3.998785
30
4.119135
31
4.195462
32
4.086226
33
4.077203
34
0.007168
35
3.993801
36
4.041541
37
0.007523
38
4.185519
39
4.160401
40
4.272988
41
4.111546
42
4.116259
43
4.317401
44
0.007163
45
4.161594
46
4.183097
47
4.096571
48
4.085658
49
4.095771
50
4.090312
51
4.034806
52
3.979537
53
4.254537
54
0.007424
55
0.009254
56
4.123105
57
0.007334
58
4.141203
59
0.007631
60
0.008371
61
4.210759
62
4.405041
63
4.248527
64
4.270964
65
0.007276
66
4.077787
67
4.254579
68
4.154616
69
4.139823
70
3.983476
71
4.012325
72
4.124444
73
4.086913
74
4.438838
75
4.048091
76
4.221849
77
4.24435
78
0.007239
79
4.009151
80
4.029106
81
3.98493
82
4.033199
83
4.020679
84
4.060362
85
0.

In [92]:
# Key the predictions by submission Id.
id_pred = test_merge(df_test, y_pred)

In [98]:
# Write the predictions to the default output file, 'submit.csv'.
write_submit(id_pred)