In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from nasdaqpredictor import dataloader as dl

In [39]:
# Build the project's DataLoader for the ticker list and a 2000-01-01 .. 2017-01-01 pair of dates
# (presumably the start/end of the range to load — confirm against DataLoader's signature).
# NOTE(review): '/nasdaq_tickers.csv' is an absolute path at the filesystem root — confirm it is intended.
loader = dl.DataLoader('/nasdaq_tickers.csv',
                        datetime(2000, 1, 1),
                        datetime(2017, 1, 1))
# Disabled experiment (DataTransformer is not imported in this notebook):
# transformer = DataTransformer(loader, return_shift_days=-3)

In [205]:
# Ask the loader to (re)load its data; the log line below reports "Load tickers".
loader.reload_all()

[2017-12-01 17:03:39,626 - nasdaqpredictor.dataloader - INFO] - Load tickers


In [206]:
# Pick the frame stored under the 'AAL' key of the loader's per-ticker dictionary.
# Note: this is a reference to the loader's own object, not a copy.
aal = loader.original_data_dict['AAL']

In [207]:
# Preview the raw layout: a 'Date' column followed by Open/High/Low/Close prices.
aal.head()

Unnamed: 0,Date,Open,High,Low,Close
0,2005-09-27,21.049999,21.4,19.1,19.299999
1,2005-09-28,19.299999,20.530001,19.200001,20.5
2,2005-09-29,20.4,20.58,20.1,20.209999
3,2005-09-30,20.26,21.049999,20.18,21.01
4,2005-10-03,20.9,21.75,20.9,21.5


In [208]:
def _set_index_column_if_necessary(data: pd.DataFrame) -> pd.DataFrame:
    if 'Date' in data.columns:
        data.set_index('Date', inplace=True)
    return data

In [209]:
# Make 'Date' the index (nothing changes if it already is) and preview the result.
aal = _set_index_column_if_necessary(aal)
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-09-27,21.049999,21.4,19.1,19.299999
2005-09-28,19.299999,20.530001,19.200001,20.5
2005-09-29,20.4,20.58,20.1,20.209999
2005-09-30,20.26,21.049999,20.18,21.01
2005-10-03,20.9,21.75,20.9,21.5


In [210]:
# Horizon, in rows, of the forward return computed for the 'Return' target column.
return_days = 1

In [211]:
def feature(data, first_col, second_col, base_col):
    """Difference of two columns expressed as a fraction of a third.

    Computes ``(data[first_col] - data[second_col]) / data[base_col]`` and
    returns the result of that expression unchanged.
    """
    spread = data[first_col] - data[second_col]
    scale = data[base_col]
    return spread / scale

In [212]:
# Intraday spread features, each scaled by the closing price of the same row.
aal['OC diff'] = feature(aal, first_col='Open', second_col='Close', base_col='Close')
aal['HL diff'] = feature(aal, first_col='High', second_col='Low', base_col='Close')
aal['OL diff'] = feature(aal, first_col='Open', second_col='Low', base_col='Close')
aal['CH diff'] = feature(aal, first_col='Close', second_col='High', base_col='Close')

# Target: percentage change of Close over `return_days` rows, shifted back so each
# row carries the return that FOLLOWS it (the last `return_days` rows end up NaN).
aal['Return'] = aal['Close'].pct_change(periods=return_days).shift(periods=-return_days) * 100
aal.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,OC diff,HL diff,OL diff,CH diff,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-09-27,21.049999,21.4,19.1,19.299999,0.090674,0.119171,0.101036,-0.108808,6.217622
2005-09-28,19.299999,20.530001,19.200001,20.5,-0.058537,0.064878,0.004878,-0.001463,-1.414639
2005-09-29,20.4,20.58,20.1,20.209999,0.009401,0.023751,0.014844,-0.018308,3.958442
2005-09-30,20.26,21.049999,20.18,21.01,-0.035697,0.041409,0.003808,-0.001904,2.332223
2005-10-03,20.9,21.75,20.9,21.5,-0.027907,0.039535,0.0,-0.011628,3.069767
2005-10-04,21.440001,22.5,21.440001,22.16,-0.032491,0.047834,0.0,-0.015343,0.18051
2005-10-05,22.1,22.309999,21.75,22.200001,-0.004505,0.025225,0.015766,-0.004955,1.711707
2005-10-06,22.6,23.0,22.4,22.58,0.000886,0.026572,0.008857,-0.018601,-1.90434
2005-10-07,22.25,22.6,21.799999,22.15,0.004515,0.036117,0.020316,-0.020316,0.270876
2005-10-10,22.280001,22.290001,22.1,22.209999,0.003152,0.008555,0.008105,-0.003602,-1.035565


In [213]:
# Row-over-row relative change of the first four columns (Open/High/Low/Close);
# the first row is NaN because it has no predecessor.
aal.iloc[:,0:4].pct_change().head(2)

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-09-27,,,,
2005-09-28,-0.083135,-0.040654,0.005236,0.062176


In [157]:
# Preview of the four '... diff' feature columns (positions 4..7).
# NOTE(review): the saved output below is stale — it shows raw price differences
# (e.g. 1.75 = Open - Close), not the Close-scaled values computed by the current
# feature cell, and its execution count (157) predates that cell (212). Re-run to refresh.
aal.iloc[:,4:8].head(2)

Unnamed: 0_level_0,OC diff,HL diff,OL diff,CH diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-09-27,1.75,2.3,1.949999,-2.100001
2005-09-28,-1.200001,1.33,0.099998,-0.030001


In [223]:
# Assemble the model table: row-over-row relative change of the raw prices,
# the Close-scaled intraday features, and the forward 'Return' target.
# Columns are picked by label rather than by position so the selection does not
# silently shift if a column is ever added to or reordered in `aal`.
price_cols = ['Open', 'High', 'Low', 'Close']
diff_cols = ['OC diff', 'HL diff', 'OL diff', 'CH diff']
full = pd.concat((aal[price_cols].pct_change(), aal[diff_cols], aal['Return']), axis=1)

# pct_change() runs with its default period of 1, so exactly ONE leading row is NaN
# no matter what `return_days` is; trimming `return_days` rows was only correct
# while return_days == 1. The trailing rows whose 'Return' is NaN are left in place.
full = full.iloc[1:]

In [224]:
# Inspect the assembled table: price changes, diff features and the Return target.
full.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,OC diff,HL diff,OL diff,CH diff,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-09-28,-0.083135,-0.040654,0.005236,0.062176,-0.058537,0.064878,0.004878,-0.001463,-1.414639
2005-09-29,0.056995,0.002435,0.046875,-0.014146,0.009401,0.023751,0.014844,-0.018308,3.958442
2005-09-30,-0.006863,0.022838,0.00398,0.039584,-0.035697,0.041409,0.003808,-0.001904,2.332223
2005-10-03,0.031589,0.033254,0.035679,0.023322,-0.027907,0.039535,0.0,-0.011628,3.069767
2005-10-04,0.025837,0.034483,0.025837,0.030698,-0.032491,0.047834,0.0,-0.015343,0.18051
2005-10-05,0.030784,-0.008444,0.014459,0.001805,-0.004505,0.025225,0.015766,-0.004955,1.711707
2005-10-06,0.022624,0.030928,0.029885,0.017117,0.000886,0.026572,0.008857,-0.018601,-1.90434
2005-10-07,-0.015487,-0.017391,-0.026786,-0.019043,0.004515,0.036117,0.020316,-0.020316,0.270876
2005-10-10,0.001348,-0.013717,0.013762,0.002709,0.003152,0.008555,0.008105,-0.003602,-1.035565
2005-10-11,-0.000898,0.000449,-0.013575,-0.010356,0.012739,0.022748,0.020928,-0.014559,-1.00091


In [197]:
# Replace non-finite values (±inf from a division by zero, NaN from missing data)
# with 0.0. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0;
# listing NaN once is enough.
# Note that this also turns a missing forward 'Return' into 0.0.
# NOTE(review): this cell's execution count (197) is lower than that of the cell
# that builds `full` (223), so the saved state may not include this step —
# re-run the notebook top to bottom.
full = full.replace([np.inf, -np.inf, np.nan], 0.0)

In [220]:
# Re-inspect the table after the non-finite values were replaced.
full.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,OC diff,HL diff,OL diff,CH diff,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-09-28,-0.083135,-0.040654,0.005236,0.062176,-0.058537,0.064878,0.004878,-0.001463,-1.414639
2005-09-29,0.056995,0.002435,0.046875,-0.014146,0.009401,0.023751,0.014844,-0.018308,3.958442
2005-09-30,-0.006863,0.022838,0.00398,0.039584,-0.035697,0.041409,0.003808,-0.001904,2.332223
2005-10-03,0.031589,0.033254,0.035679,0.023322,-0.027907,0.039535,0.0,-0.011628,3.069767
2005-10-04,0.025837,0.034483,0.025837,0.030698,-0.032491,0.047834,0.0,-0.015343,0.18051
2005-10-05,0.030784,-0.008444,0.014459,0.001805,-0.004505,0.025225,0.015766,-0.004955,1.711707
2005-10-06,0.022624,0.030928,0.029885,0.017117,0.000886,0.026572,0.008857,-0.018601,-1.90434
2005-10-07,-0.015487,-0.017391,-0.026786,-0.019043,0.004515,0.036117,0.020316,-0.020316,0.270876
2005-10-10,0.001348,-0.013717,0.013762,0.002709,0.003152,0.008555,0.008105,-0.003602,-1.035565
2005-10-11,-0.000898,0.000449,-0.013575,-0.010356,0.012739,0.022748,0.020928,-0.014559,-1.00091


In [283]:
n = 30  # chunk row size (rows per sliding window)

# Sliding windows of `n` consecutive rows of features, each paired with the forward
# return of the window's LAST row.
#
# The previous version used full['Return'][i], i.e. the return of the window's FIRST
# row. That value is 100 * the 'Close' pct_change found in row i+1 of the same
# window, so the target leaked into the features. It also relied on integer `[]`
# access to a date-labelled Series, a positional fallback that pandas has deprecated.
feature_matrix = full.drop('Return', axis=1).values
return_values = full['Return'].values

# The last `return_days` rows have no forward return, so no window may end on them.
n_windows = max(len(full) - n - return_days + 1, 0)
list_df = [feature_matrix[i:i + n] for i in range(n_windows)]
list_rets = [return_values[i + n - 1] for i in range(n_windows)]

In [284]:
# Row count of the source frame, for comparison with the number of windows below.
len(aal)

2836

In [285]:
# Sanity check: the number of windows and the number of targets must be equal.
print(len(list_df), len(list_rets))

2805 2805


In [286]:
# Check EVERY window (not just the first and last) for the expected row count,
# and that there is exactly one target per window. Unlike indexing list_df[0],
# this does not raise IndexError when no windows were produced.
assert all(len(window) == n for window in list_df)
assert len(list_df) == len(list_rets)

In [287]:
# Shape of one window: (rows per window, feature columns).
list_df[0].shape

(30, 8)

In [288]:
# Stack the windows along a new leading axis: (windows, rows per window, features).
np.stack(list_df).shape

(2805, 30, 8)

In [297]:
# Targets as a 1-D array with one entry per window.
np.array(list_rets).shape

(2805,)