In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np

In [2]:
# Load the pickled DataFrame from the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# from a trusted source.
pandas_df = pd.read_pickle("./raw_weekly_df.pkl")
# Wrap the same data as a Dask DataFrame split into 8 partitions.
# NOTE(review): dask_df is not referenced by any other cell visible here.
dask_df = dd.from_pandas(pandas_df, npartitions=8)

In [3]:
# Preview the first rows to check the column layout of the loaded frame.
print(pandas_df.head())

   dept_id cat_id      item_id state_id store_id   datetime  sales
0  FOODS_1  FOODS  FOODS_1_001       CA     CA_1 2011-01-31    3.0
1  FOODS_1  FOODS  FOODS_1_001       CA     CA_1 2011-02-07    9.0
2  FOODS_1  FOODS  FOODS_1_001       CA     CA_1 2011-02-14    9.0
3  FOODS_1  FOODS  FOODS_1_001       CA     CA_1 2011-02-21    8.0
4  FOODS_1  FOODS  FOODS_1_001       CA     CA_1 2011-02-28   14.0


In [5]:
def standardize(a):
    """Centre ``a`` on zero and scale it to unit standard deviation.

    Parameters
    ----------
    a : pandas.Series or numpy.ndarray
        Numeric values to standardize. The input is left unmodified.

    Returns
    -------
    tuple
        ``(b, mean, astd)`` where ``b`` is the standardized copy of ``a``,
        ``mean`` is ``a.mean()`` and ``astd`` is ``np.std(a)`` (NumPy's
        default ``ddof=0``). Pass ``mean`` and ``astd`` to ``unstandardize``
        to invert the transform.

    Notes
    -----
    There is no guard for a zero standard deviation: a constant input is
    divided by zero, exactly as before.
    """
    astd = np.std(a)
    mean = a.mean()
    # Out-of-place arithmetic builds a new object, so the caller's data is
    # untouched and integer inputs are promoted to float instead of failing
    # on an in-place float-to-int cast.
    b = (a - mean) / astd
    return b, mean, astd

In [6]:
def unstandardize(a,mean,astd):
    """Undo a standardization, given the original mean and standard deviation.

    Parameters
    ----------
    a : standardized values (anything supporting ``*`` and ``+``).
    mean : mean that was subtracted during standardization.
    astd : standard deviation that the values were divided by.

    Returns
    -------
    ``a`` rescaled by ``astd`` and then shifted by ``mean``.
    """
    rescaled = a * astd
    return rescaled + mean

In [7]:
# Show the first raw 'sales' values as a reference point for the
# standardize / unstandardize round trip.
print(pandas_df['sales'].head())

0     3.0
1     9.0
2     9.0
3     8.0
4    14.0
Name: sales, dtype: float32


In [8]:
# Standardize the 'sales' column and print the scaled series together with
# the mean and standard deviation needed to invert the transform.
b,mean,astd = standardize(pandas_df['sales'])
print(b)
print(mean)
print(astd)

0         -0.259046
1         -0.020909
2         -0.020909
3         -0.060599
4          0.177539
             ...   
6841116   -0.378115
6841117   -0.378115
6841118   -0.378115
6841119   -0.378115
6841120   -0.378115
Name: sales, Length: 6841121, dtype: float32
9.526814460754395
25.195552825927734


In [9]:
# Round-trip check: inverting the transform should print the original
# 'sales' values again.
print(unstandardize(b,mean,astd).head())

0     3.0
1     9.0
2     9.0
3     8.0
4    14.0
Name: sales, dtype: float32


In [17]:
def stat_feature(df1,grouping,col):
    """Per-group mean, minimum and maximum of one column.

    Parameters
    ----------
    df1 : DataFrame
        Input frame. It is only read, never modified.
    grouping : label
        Column to group by.
    col : label
        Column whose values are summarised.

    Returns
    -------
    tuple
        ``(mean, min, max)``, each the result of the corresponding groupby
        aggregation of ``col`` keyed by ``grouping``.
    """
    # Build the groupby once and reuse it. No defensive copy of the frame is
    # needed because nothing here writes to it.
    grouped = df1.groupby([grouping])[col]
    return grouped.mean(), grouped.min(), grouped.max()

In [18]:
def agg_feature(df1,grouping,col):
    """Running (cumulative) sum of ``col`` within each ``grouping`` group.

    Parameters
    ----------
    df1 : DataFrame
        Input frame. It is only read, never modified.
    grouping : label
        Column to group by.
    col : label
        Column to accumulate.

    Returns
    -------
    Series
        The groupby cumulative sum of ``col``, one value per input row.

    Notes
    -----
    NOTE(review): no dtype conversion is done here. The float32 totals
    recorded in this notebook exceed 2**24, beyond which float32 cannot
    represent every integer, so consider casting to float64 first.
    """
    # No defensive copy: the groupby only reads from the frame.
    return df1.groupby([grouping])[col].cumsum()

In [19]:
# Per-state mean/min/max of 'sales', followed by the running per-state
# cumulative sum of 'sales'.
print(stat_feature(pandas_df,'state_id','sales'))
print(agg_feature(pandas_df,'state_id','sales'))

(state_id
CA    10.585984
TX     9.033415
WI     8.882002
Name: sales, dtype: float32, state_id
CA    0.0
TX    0.0
WI    0.0
Name: sales, dtype: float32, state_id
CA    4220.000488
TX    1871.000122
WI    1755.000122
Name: sales, dtype: float32)
0                 3.0
1                12.0
2                21.0
3                29.0
4                43.0
              ...    
6841116    18120856.0
6841117    18120856.0
6841118    18120856.0
6841119    18120856.0
6841120    18120856.0
Name: sales, Length: 6841121, dtype: float32


In [22]:
def detrend_feature(df1,grouping,col):
    """First-order difference of ``col``: each value minus the one before it.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Input frame. It is only read, never modified.
    grouping : label
        Currently unused. Differences are taken over the whole column in row
        order and therefore run across group boundaries.
        NOTE(review): confirm whether per-group differencing was intended.
    col : label
        Numeric column to difference.

    Returns
    -------
    pandas.Series
        Float series with a fresh default integer index and the same length
        as ``df1``. The first element is NaN because it has no predecessor.
        An empty frame yields an empty series.
    """
    values = df1[col].to_numpy(dtype=float)
    if values.size == 0:
        # Nothing to difference; avoid returning a stray NaN entry.
        return pd.Series([], dtype=float)
    diff = np.empty_like(values)
    diff[0] = np.nan  # no previous observation for the first row
    # Vectorised equivalent of X[i] - X[i - 1] for i >= 1.
    diff[1:] = values[1:] - values[:-1]
    return pd.Series(diff)

In [None]:
# Print the differenced 'sales' series. No output is recorded for this cell.
print(detrend_feature(pandas_df,'state_id','sales'))