In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): %pylab star-imports numpy and matplotlib names into the
# interactive namespace (it prints the "Populating the interactive namespace"
# message). Other cells may rely on those bare names, so it is kept as is.
%pylab inline
# Default (width, height) used for every figure created in this notebook.
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

# Reload imported modules automatically before each cell is executed, so edits
# to local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Make code located two directories above this notebook importable.
sys.path.append('../../')

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Ticker analysed in the rest of the notebook.
SYMBOL = 'AAPL'

# Load the full pickled dataset.
# NOTE(review): read_pickle executes whatever the file contains; only load
# pickles that come from a trusted source.
total_data_df = pd.read_pickle('../../data/data_df.pkl')

# Select the symbol and pivot the innermost index level into columns, giving
# one row per date and one column per feature.
data_df = total_data_df[SYMBOL].unstack(level=-1)
data_df.head(n=5)

feature,Close,High,Low,Open,Volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1993-01-29,2.12,2.19,2.11,0.0,66525200.0
1993-02-01,2.19,2.19,2.12,0.0,60138400.0
1993-02-02,2.15,2.2,2.15,0.0,45584000.0
1993-02-03,2.14,2.18,2.09,0.0,66046400.0
1993-02-04,2.12,2.15,2.11,0.0,52038000.0


In [8]:
# Take the first 20 rows as a small working sample.
sub_data_df = data_df.head(20)
# Display (rows, columns) as a quick sanity check.
sub_data_df.shape

(20, 5)

In [9]:
# Plain NumPy array of the sample's values (row and column labels dropped).
data = np.asarray(sub_data_df)
data

array([[  2.12000000e+00,   2.19000000e+00,   2.11000000e+00,
          0.00000000e+00,   6.65252000e+07],
       [  2.19000000e+00,   2.19000000e+00,   2.12000000e+00,
          0.00000000e+00,   6.01384000e+07],
       [  2.15000000e+00,   2.20000000e+00,   2.15000000e+00,
          0.00000000e+00,   4.55840000e+07],
       [  2.14000000e+00,   2.18000000e+00,   2.09000000e+00,
          0.00000000e+00,   6.60464000e+07],
       [  2.12000000e+00,   2.15000000e+00,   2.11000000e+00,
          0.00000000e+00,   5.20380000e+07],
       [  2.04000000e+00,   2.12000000e+00,   2.01000000e+00,
          0.00000000e+00,   9.19044000e+07],
       [  2.02000000e+00,   2.05000000e+00,   1.98000000e+00,
          0.00000000e+00,   7.02688000e+07],
       [  2.03000000e+00,   2.05000000e+00,   2.02000000e+00,
          0.00000000e+00,   5.96652000e+07],
       [  1.99000000e+00,   2.04000000e+00,   1.96000000e+00,
          0.00000000e+00,   6.70712000e+07],
       [  1.97000000e+00,   2.0100000

### Let's try it with a MultiIndex

In [12]:
# Move the column labels into an inner index level. The result is a float
# Series indexed by (date, feature), despite the `_df` suffix in its name.
stacked_df = sub_data_df.stack(level=-1)
stacked_df.head(n=15)

date        feature
1993-01-29  Close             2.12
            High              2.19
            Low               2.11
            Open              0.00
            Volume     66525200.00
1993-02-01  Close             2.19
            High              2.19
            Low               2.12
            Open              0.00
            Volume     60138400.00
1993-02-02  Close             2.15
            High              2.20
            Low               2.15
            Open              0.00
            Volume     45584000.00
dtype: float64

In [16]:
# 5-row rolling mean computed separately for each feature (the second index
# level). The group key is prepended to the result's index, so the feature
# label shows up twice.
(
    stacked_df
    .groupby(level='feature')
    .rolling(window=5)
    .mean()
)

feature  date        feature
Close    1993-01-29  Close               NaN
         1993-02-01  Close               NaN
         1993-02-02  Close               NaN
         1993-02-03  Close               NaN
         1993-02-04  Close      2.144000e+00
         1993-02-05  Close      2.128000e+00
         1993-02-08  Close      2.094000e+00
         1993-02-09  Close      2.070000e+00
         1993-02-10  Close      2.040000e+00
         1993-02-11  Close      2.010000e+00
         1993-02-12  Close      1.986000e+00
         1993-02-16  Close      1.960000e+00
         1993-02-17  Close      1.938000e+00
         1993-02-18  Close      1.932000e+00
         1993-02-19  Close      1.930000e+00
         1993-02-22  Close      1.940000e+00
         1993-02-23  Close      1.950000e+00
         1993-02-24  Close      1.950000e+00
         1993-02-25  Close      1.950000e+00
         1993-02-26  Close      1.936000e+00
High     1993-01-29  High                NaN
         1993-02-01  High 

In [17]:
# 5-row rolling mean per feature; group_keys=False asks pandas not to prepend
# the group key, keeping the original (date, feature) index.
(
    stacked_df
    .groupby(level='feature', group_keys=False)
    .rolling(window=5)
    .mean()
)

date        feature
1993-01-29  Close               NaN
1993-02-01  Close               NaN
1993-02-02  Close               NaN
1993-02-03  Close               NaN
1993-02-04  Close      2.144000e+00
1993-02-05  Close      2.128000e+00
1993-02-08  Close      2.094000e+00
1993-02-09  Close      2.070000e+00
1993-02-10  Close      2.040000e+00
1993-02-11  Close      2.010000e+00
1993-02-12  Close      1.986000e+00
1993-02-16  Close      1.960000e+00
1993-02-17  Close      1.938000e+00
1993-02-18  Close      1.932000e+00
1993-02-19  Close      1.930000e+00
1993-02-22  Close      1.940000e+00
1993-02-23  Close      1.950000e+00
1993-02-24  Close      1.950000e+00
1993-02-25  Close      1.950000e+00
1993-02-26  Close      1.936000e+00
1993-01-29  High                NaN
1993-02-01  High                NaN
1993-02-02  High                NaN
1993-02-03  High                NaN
1993-02-04  High       2.182000e+00
1993-02-05  High       2.168000e+00
1993-02-08  High       2.140000e+00
1993-02-

In [18]:
# 5-row rolling mean per feature, then pivot the innermost index level back
# into columns: one row per date, one rolling-mean column per feature.
# NOTE(review): for this frame the result should match
# sub_data_df.rolling(5).mean() -- confirm before relying on it.
(
    stacked_df
    .groupby(level='feature', group_keys=False)
    .rolling(window=5)
    .mean()
    .unstack(level=-1)
)

feature,Close,High,Low,Open,Volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1993-01-29,,,,,
1993-02-01,,,,,
1993-02-02,,,,,
1993-02-03,,,,,
1993-02-04,2.144,2.182,2.116,0.0,58066400.0
1993-02-05,2.128,2.168,2.096,0.0,63142240.0
1993-02-08,2.094,2.14,2.068,0.0,65168320.0
1993-02-09,2.07,2.11,2.042,0.0,67984560.0
1993-02-10,2.04,2.082,2.016,0.0,68189520.0
1993-02-11,2.01,2.054,1.986,0.0,66195360.0
