In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pandas as pd
pd.set_option('display.max_columns', 500)
import pickle
import seaborn as sns
import datetime
import multiprocessing
import itertools as it
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, ElasticNetCV
sns.set_style("dark")
plt.rcParams['figure.figsize'] = 16, 12

In [2]:
# Load the pre-built base frame from disk. Later cells read its 'Sales' column
# and use index level 0 as the store identifier.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by a trusted step of this project.
df_test = pd.read_pickle('./../data/df_base.pkl')

In [153]:
def weighted_norm_mean(x, p=2.0):
    """Return a position-weighted mean of ``x``.

    The raw weight of each position falls linearly from 1 (first element)
    to 0.1 (last element) and is raised to the power ``p``; the weights are
    then normalised to sum to one before being applied.

    Parameters
    ----------
    x : one-dimensional array-like exposing ``.shape`` (ndarray or Series)
        Values to average.
    p : float, default 2.0
        Exponent applied to the linear ramp; larger values concentrate the
        weight on the leading elements.

    Returns
    -------
    The dot product of ``x`` with the normalised weights.
    """
    n_obs = x.shape[0]
    ramp = np.linspace(1, 0.1, n_obs)
    raw_weights = ramp**p
    weights = raw_weights / raw_weights.sum()
    return np.dot(x, weights)

# Columns that are part of every grouping key.
req_cols = {'DayOfWeek', 'Store'}

# Optional categorical columns; subsets of these are added to `req_cols`
# to form the finer-grained grouping keys.
cat_cols = [
    'Open',
    'Promo',
    'StateHoliday_0',
    'StateHoliday_a',
    'SchoolHoliday',
    'HasCompetition',
]

In [158]:
def calculate_aggregations(df, req_cols, cat_cols, agg_func=np.mean, verbose=True,
                           max_combo_size=5):
    """Compute per-row group aggregates of 'Sales' for many grouping keys.

    Rows whose 'Sales' is null are dropped, a 'Store' column is derived from
    index level 0, and then 'Sales' is aggregated once per grouping key.
    The grouping keys are ``req_cols`` on its own, followed by ``req_cols``
    joined with every combination of 1..``max_combo_size`` columns taken
    from ``cat_cols`` (in ``itertools.combinations`` order).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Sales' column plus every column named in
        ``req_cols``/``cat_cols`` (except 'Store', which is created here
        from index level 0). The index must be unique after the null-Sales
        rows are removed. The caller's frame is not modified.
    req_cols : iterable of str
        Columns present in every grouping key.
    cat_cols : sequence of str
        Optional columns combined with ``req_cols``.
    agg_func : callable, default ``np.mean``
        Reduces one group's 'Sales' Series to a scalar. Earlier revisions
        ignored this argument and always used ``weighted_norm_mean``.
    verbose : bool, default True
        Print each grouping key and the seconds spent on it.
    max_combo_size : int, default 5
        Largest number of ``cat_cols`` columns combined into one key.

    Returns
    -------
    dict
        Maps each index label of the filtered frame to a list holding one
        aggregated value per grouping key, in the order described above.

    Raises
    ------
    ValueError
        If the filtered frame's index contains duplicate labels.
    """
    # Bug fix: build the mask from the frame that was passed in; the previous
    # code read the notebook-global `df_test` here.
    df = df.loc[pd.notnull(df['Sales'])].copy()
    df['Store'] = df.index.get_level_values(0)

    # The result is keyed by index label, so duplicates would silently merge
    # several rows into one entry.
    if not df.index.is_unique:
        raise ValueError('calculate_aggregations requires a unique index')

    req_cols = set(req_cols)
    col_combinations = [req_cols]
    for n_extra in range(1, max_combo_size + 1):
        for combo in it.combinations(cat_cols, n_extra):
            col_combinations.append(req_cols.union(combo))

    row_labels = list(df.index)
    cols_agg = dict((label, []) for label in row_labels)

    for cols in col_combinations:
        if verbose:
            print(cols)
        start_time = datetime.datetime.now()
        cols = list(cols)
        # Bug fix: honour `agg_func` instead of a hard-coded aggregator.
        dict_agg = df.groupby(cols)['Sales'].apply(agg_func).to_dict()
        if len(cols) == 1:
            # Grouping by a single column yields scalar keys, not 1-tuples.
            group_keys = df[cols[0]].tolist()
        else:
            group_keys = zip(*[df[c].tolist() for c in cols])
        # Walk the rows once instead of materialising a nested dict per key.
        for label, group_key in zip(row_labels, group_keys):
            cols_agg[label].append(dict_agg[group_key])
        if verbose:
            print((datetime.datetime.now() - start_time).total_seconds())

    return cols_agg

In [159]:
# For every row of df_test with a known Sales value, collect the
# weighted_norm_mean of Sales within that row's group -- one entry per
# grouping-column combination printed below.
# Slow: the recorded run took roughly 45-75 s per combination.
agg = calculate_aggregations(df_test, req_cols, cat_cols, weighted_norm_mean)

set(['DayOfWeek', 'Store'])
73.32093
set(['DayOfWeek', 'Open', 'Store'])
48.787956
set(['Promo', 'DayOfWeek', 'Store'])
74.376554
set(['DayOfWeek', 'StateHoliday_0', 'Store'])
45.509155
set(['DayOfWeek', 'StateHoliday_a', 'Store'])
45.687825
set(['DayOfWeek', 'SchoolHoliday', 'Store'])
72.774753
set(['DayOfWeek', 'HasCompetition', 'Store'])
44.202931
set(['Promo', 'DayOfWeek', 'Open', 'Store'])
46.959709
set(['DayOfWeek', 'StateHoliday_0', 'Open', 'Store'])
46.13478
set(['DayOfWeek', 'StateHoliday_a', 'Open', 'Store'])
46.823678
set(['DayOfWeek', 'Open', 'SchoolHoliday', 'Store'])
47.887602
set(['DayOfWeek', 'HasCompetition', 'Open', 'Store'])
45.877014
set(['Promo', 'DayOfWeek', 'StateHoliday_0', 'Store'])
47.602385
set(['Promo', 'DayOfWeek', 'StateHoliday_a', 'Store'])
47.465361
set(['Promo', 'DayOfWeek', 'SchoolHoliday', 'Store'])
76.148926
set(['Promo', 'DayOfWeek', 'HasCompetition', 'Store'])
45.530697
set(['DayOfWeek', 'StateHoliday_0', 'StateHoliday_a', 'Store'])
46.796514
set([