The idea is to create features that include many stocks, and targets that are the future prices for a range of days, as well as the OHLC for each of those days.

The plan is:
* prototype feature/target creation and neural net with OHLC for a single day
* extend prototype to predict future values for a range of days
* extend prototype to include multiple stocks
* convert to a function
* put in a file and test functions

In [1]:
# Enable IPython's autoreload extension so that edits to external modules are
# picked up without restarting the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
# mode 2: reload all modules before executing each cell
%autoreload 2

In [2]:
import pandas as pd

In [3]:
# Load a previously saved stock table; its 'ticker' column is used below.
# NOTE(review): presumably rows are pre-sorted by an EWM-based metric, judging by
# the filename -- confirm where this file is produced.
sorted_old = pd.read_csv('old_good_sorted_by_ewm.csv')

In [4]:
# Universe of tickers: the leading rows of the sorted table plus a fixed list
# of extra tickers appended at the end.
# NOTE(review): .loc slicing is label-inclusive, so with the default integer
# index `:100` selects labels 0..100, i.e. 101 rows -- confirm whether exactly
# 100 was intended.
top_stocks = [
    *sorted_old.loc[:100, 'ticker'].tolist(),
    'SPY', 'UPRO', 'QQQ', 'TQQQ', 'DIA', 'UBT',
]

In [5]:
# display the full ticker list for inspection
top_stocks

['GGP',
 'LNG',
 'AKS',
 'CHK',
 'AMD',
 'CAR',
 'OLED',
 'INCY',
 'SIRI',
 'SWKS',
 'MT',
 'MU',
 'NKTR',
 'X',
 'ETFC',
 'FCX',
 'VRX',
 'REGN',
 'WDC',
 'VRTX',
 'URI',
 'CY',
 'RIG',
 'MSCC',
 'AIG',
 'AKAM',
 'RRC',
 'MGM',
 'ANDV',
 'SWN',
 'TER',
 'NVDA',
 'WMB',
 'BBY',
 'LRCX',
 'GG',
 'FFIV',
 'STLD',
 'ALXN',
 'NOV',
 'NTAP',
 'RIO',
 'PHM',
 'BKNG',
 'ABX',
 'DHI',
 'ADSK',
 'CELG',
 'BMRN',
 'SIG',
 'CGNX',
 'MOS',
 'TTWO',
 'RCL',
 'HIG',
 'ASML',
 'LEN',
 'MS',
 'JNPR',
 'AMAT',
 'AABA',
 'PXD',
 'CTXS',
 'HP',
 'BIIB',
 'HOLX',
 'SBAC',
 'EA',
 'HAL',
 'AMZN',
 'MNST',
 'HFC',
 'KLAC',
 'AMTD',
 'C',
 'KMX',
 'COG',
 'ALK',
 'LNC',
 'RHT',
 'NEM',
 'SCHW',
 'SIVB',
 'VLO',
 'APC',
 'DVN',
 'BHP',
 'IVZ',
 'M',
 'ADBE',
 'AAPL',
 'TOL',
 'COHR',
 'GLW',
 'SYMC',
 'APA',
 'ATVI',
 'JWN',
 'NVR',
 'COF',
 'ADI',
 'SPY',
 'UPRO',
 'QQQ',
 'TQQQ',
 'DIA',
 'UBT']

In [6]:
# prototype with 5 stocks so it will run fast
# (keeps only the first five tickers; the full list is overwritten, so re-run
# the cell that builds top_stocks to get it back)
top_stocks = top_stocks[:5]

In [7]:
# confirm the reduced prototype list
top_stocks

['GGP', 'LNG', 'AKS', 'CHK', 'AMD']

# prototype feat/targ creation and neural net with OHLC for single day

ideas:
* use TAs (especially those that are scaled to a bounded range)
* for anything not scaled to a bounded range, take a few time-lagged percent differences, e.g. 1, 2, 3, 5, 10, 20, 30, 50, 100 day differences (especially for OHLCV) -- scaled by the earliest value
* predict the % change in OHLCV for a range of future days (1-10), scaled by the earliest value

# feat/targ for OHLC for single day future and single day history

In [12]:
import sys
# make the sibling ../code directory importable so the project module
# data_processing can be loaded from this notebook
sys.path.append('../code')
import data_processing as dp

In [13]:
# Load data for the prototype tickers. Only the first of the three returned
# values is kept; it is indexed by ticker symbol below (e.g. dfs['GGP']).
# NOTE(review): the exact meaning of the keyword flags is defined in
# data_processing.load_stocks; presumably they skip the short-interest data and
# score calculation and leave the start date unrestricted -- confirm there.
dfs, _, _ = dp.load_stocks(stocks=top_stocks,
                           finra_shorts=False,
                           short_interest=False,
                           earliest_date=None,
                           calc_scores=False)

loading stocks...
calculating TAs...


In [17]:
# first available index value (date) of every ticker's frame, in dict order
earliest_dates = [frame.index.min() for frame in dfs.values()]

In [21]:
# the latest of the per-ticker start dates, i.e. the first date from which
# every loaded ticker has data
max(earliest_dates)

Timestamp('1994-04-04 00:00:00')

In [22]:
# For now, trim every ticker's history so that all frames start after the
# latest of the per-ticker start dates (so every frame covers the same period).
# The cutoff is computed once rather than on every loop iteration.
# Note: the comparison is strict, so the cutoff date itself is excluded.
common_start = max(earliest_dates)
for s in dfs.keys():
    frame = dfs[s]
    dfs[s] = frame.loc[frame.index > common_start]

In [23]:
# sanity check: after trimming, every frame should report the same shape
for frame in dfs.values():
    print(frame.shape)

(6027, 148)
(6027, 148)
(6027, 148)
(6027, 148)
(6027, 148)


In [14]:
# peek at the first rows of one ticker's frame
# NOTE(review): the saved output of this cell starts in 1993 and its execution
# count is lower than that of the truncation cell, so it appears to show the
# frame before trimming -- re-run top to bottom to refresh it.
dfs['GGP'].head()

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume,Dividend,Split,Adj_Open,Adj_High,...,ht_ph_cl,ht_ph_tp,ht_q_cl,ht_q_tp,ht_s_cl,ht_s_tp,ht_ls_cl,ht_ls_tp,ht_tr_cl,ht_tr_tp
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-08,GGP,25.0,25.0,24.25,24.63,6734567.0,0.0,1.0,2.152696,2.152696,...,0.119869,0.097287,0.121642,0.138169,0.295002,0.361137,-0.46704,-0.404024,0,0
1993-04-12,GGP,24.75,25.25,24.75,25.13,914300.0,0.0,1.0,2.122572,2.165453,...,0.119869,0.097287,0.121642,0.138169,0.295002,0.361137,-0.46704,-0.404024,0,0
1993-04-13,GGP,25.38,26.0,25.25,25.63,930500.0,0.0,1.0,2.177693,2.230892,...,0.119869,0.097287,0.121642,0.138169,0.295002,0.361137,-0.46704,-0.404024,0,0
1993-04-14,GGP,25.5,25.75,25.5,25.75,506133.0,0.0,1.0,2.195216,2.216737,...,0.119869,0.097287,0.121642,0.138169,0.295002,0.361137,-0.46704,-0.404024,0,0
1993-04-15,GGP,25.5,25.75,25.38,25.38,359233.0,0.0,1.0,2.19273,2.214227,...,0.119869,0.097287,0.121642,0.138169,0.295002,0.361137,-0.46704,-0.404024,0,0


In [16]:
# list every column name of one ticker's frame
dfs['GGP'].columns.tolist()

['Ticker',
 'Open',
 'High',
 'Low',
 'Close',
 'Volume',
 'Dividend',
 'Split',
 'Adj_Open',
 'Adj_High',
 'Adj_Low',
 'Adj_Close',
 'Adj_Volume',
 'typical_price',
 'bband_u_cl',
 'bband_m_cl',
 'bband_l_cl',
 'bband_u_cl_diff',
 'bband_m_cl_diff',
 'bband_l_cl_diff',
 'bband_u_cl_diff_hi',
 'bband_l_cl_diff_lo',
 'bband_u_tp',
 'bband_m_tp',
 'bband_l_tp',
 'bband_u_tp_diff',
 'bband_m_tp_diff',
 'bband_l_tp_diff',
 'bband_u_tp_diff_hi',
 'bband_l_tp_diff_lo',
 'dema_cl',
 'dema_tp',
 'dema_cl_diff',
 'dema_tp_diff',
 'ema_cl',
 'ema_tp',
 'ema_cl_diff',
 'ema_tp_diff',
 'ht_tl_cl',
 'ht_tl_tp',
 'ht_tl_cl_diff',
 'ht_tl_tp_diff',
 'kama_cl',
 'kama_tp',
 'kama_cl_diff',
 'kama_tp_diff',
 'mavp_cl',
 'mavp_tp',
 'mavp_cl_diff',
 'mavp_tp_diff',
 'midp_cl',
 'midp_tp',
 'midp_cl_diff',
 'midp_tp_diff',
 'midpr',
 'midpr_diff',
 'sar',
 'sar_diff',
 'sma_10_cl',
 'sma_10_tp',
 'sma_20_cl',
 'sma_20_tp',
 'sma_30_cl',
 'sma_30_tp',
 'sma_40_cl',
 'sma_40_tp',
 'tema_cl',
 'tema_tp',
 '

In [25]:
# Drop non-adjusted price/volume columns and dividend/split columns, keeping
# the Adj_* columns.
# A new frame is assigned instead of dropping in place, and columns that are
# already absent are ignored, so re-running this cell is a no-op rather than
# a KeyError.
dropcols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Dividend', 'Split']
for s in dfs.keys():
    dfs[s] = dfs[s].drop(columns=dropcols, errors='ignore')

In [None]:
# scale all moving averages by typical price in past

In [None]:
# find TAs that are not bounded to a range

In [None]:
# create time-lagged percent difference features for OHLCV and TAs not bound to a range
# lag lengths, in days, over which the percent differences will be taken
periods = [1, 2, 3, 5, 10, 20, 30, 50, 100]


In [None]:
# settings for the single-day prototype
# NOTE(review): presumably the number of days ahead to predict -- confirm once the loop below is written
future_days = 1
# NOTE(review): presumably the number of past days used as features -- confirm
history_days = 1
for 