In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import itertools
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
aal = pdr.get_data_google('AAPL', '2000-01-01', '2017-01-01')

In [3]:
aal.drop('Volume', axis=1, inplace=True)
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-02-06,1.44,1.53,1.43,1.51
2001-02-07,1.48,1.49,1.42,1.48
2001-02-08,1.47,1.5,1.44,1.48
2001-02-09,1.46,1.49,1.33,1.37
2001-02-12,1.36,1.43,1.34,1.41


In [4]:
def _set_index_column_if_necessary(data: pd.DataFrame) -> pd.DataFrame:
    if 'Date' in data.columns:
        data.set_index('Date', inplace=True)
    return data

In [5]:
aal = _set_index_column_if_necessary(aal)
cols = aal.columns
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-02-06,1.44,1.53,1.43,1.51
2001-02-07,1.48,1.49,1.42,1.48
2001-02-08,1.47,1.5,1.44,1.48
2001-02-09,1.46,1.49,1.33,1.37
2001-02-12,1.36,1.43,1.34,1.41


In [6]:
cols

Index(['Open', 'High', 'Low', 'Close'], dtype='object')

In [8]:
return_days = 3

In [11]:
def feature(data, first_col, second_col, base_col):
    return (data[first_col]-data[second_col])/data[base_col]

In [12]:
def create_extra_columns(data, cols):
    pool = []
    for left, right in itertools.product(cols, cols):
        pair1 = left + right
        pair2 = right + left
        if left != right and pair1 not in pool and pair2 not in pool:
            aal[left + '/' + right] = feature(data, left, right, 'Close')
            pool.append(pair1)

In [13]:
create_extra_columns(aal, cols)

In [14]:
aal.columns

Index(['Open', 'High', 'Low', 'Close', 'Open/High', 'Open/Low', 'Open/Close',
       'High/Low', 'High/Close', 'Low/Close'],
      dtype='object')

In [15]:
days=[5,10]

for col, day in itertools.product(cols, days):
    aal[col + ' ' + str(day) + ' MA'] = aal[col].rolling(day).mean()
    aal[col + ' ' + str(day) + ' max'] = aal[col].rolling(day).max()
    aal[col + ' ' + str(day) + ' min'] = aal[col].rolling(day).min()
aal.dropna(inplace=True)

In [16]:
aal.columns

Index(['Open', 'High', 'Low', 'Close', 'Open/High', 'Open/Low', 'Open/Close',
       'High/Low', 'High/Close', 'Low/Close', 'Open 5 MA', 'Open 5 max',
       'Open 5 min', 'Open 10 MA', 'Open 10 max', 'Open 10 min', 'High 5 MA',
       'High 5 max', 'High 5 min', 'High 10 MA', 'High 10 max', 'High 10 min',
       'Low 5 MA', 'Low 5 max', 'Low 5 min', 'Low 10 MA', 'Low 10 max',
       'Low 10 min', 'Close 5 MA', 'Close 5 max', 'Close 5 min', 'Close 10 MA',
       'Close 10 max', 'Close 10 min'],
      dtype='object')

In [18]:
rolling_features = list(filter(lambda col: '/' not in col, aal.columns))
rolling_features

['Open',
 'High',
 'Low',
 'Close',
 'Open 5 MA',
 'Open 5 max',
 'Open 5 min',
 'Open 10 MA',
 'Open 10 max',
 'Open 10 min',
 'High 5 MA',
 'High 5 max',
 'High 5 min',
 'High 10 MA',
 'High 10 max',
 'High 10 min',
 'Low 5 MA',
 'Low 5 max',
 'Low 5 min',
 'Low 10 MA',
 'Low 10 max',
 'Low 10 min',
 'Close 5 MA',
 'Close 5 max',
 'Close 5 min',
 'Close 10 MA',
 'Close 10 max',
 'Close 10 min']

In [19]:
create_extra_columns(aal, rolling_features)

In [20]:
ret = 100*aal['Close'].pct_change(return_days).shift(-return_days)

features_to_drop = list(filter(lambda col: '/' not in col, aal.columns))
aal.drop(features_to_drop, axis=1, inplace=True)

aal['Return'] = ret
aal.head(10)

Unnamed: 0_level_0,Open/High,Open/Low,Open/Close,High/Low,High/Close,Low/Close,Open/Open 5 MA,Open/Open 5 max,Open/Open 5 min,Open/Open 10 MA,...,Close 5 max/Close 10 MA,Close 5 max/Close 10 max,Close 5 max/Close 10 min,Close 5 min/Close 10 MA,Close 5 min/Close 10 max,Close 5 min/Close 10 min,Close 10 MA/Close 10 max,Close 10 MA/Close 10 min,Close 10 max/Close 10 min,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-02-20,-0.015267,0.053435,0.045802,0.068702,0.061069,-0.007634,-0.01221374,-0.038168,0.007634,-0.033588,...,0.014504,-0.061069,0.091603,-0.077099,-0.152672,0.0,-0.075573,0.077099,0.152672,2.290076
2001-02-21,-0.088889,0.0,-0.037037,0.088889,0.051852,-0.037037,-0.04592593,-0.081481,0.0,-0.074074,...,0.025926,-0.037037,0.088889,-0.062963,-0.125926,0.0,-0.062963,0.062963,0.125926,2.962963
2001-02-22,-0.014925,0.052239,0.014925,0.067164,0.029851,-0.037313,3.314099e-16,-0.037313,0.044776,-0.020896,...,0.036567,-0.037313,0.089552,-0.052985,-0.126866,0.0,-0.073881,0.052985,0.126866,2.985075
2001-02-23,-0.014925,0.022388,-0.007463,0.037313,0.007463,-0.029851,-0.01044776,-0.029851,0.022388,-0.032836,...,-0.005224,-0.052239,0.037313,-0.042537,-0.089552,0.0,-0.047015,0.042537,0.089552,-2.985075
2001-02-26,-0.035971,0.021583,-0.021583,0.057554,0.014388,-0.043165,0.01151079,-0.007194,0.043165,-0.002878,...,0.015108,-0.028777,0.057554,-0.042446,-0.086331,0.0,-0.043885,0.042446,0.086331,-3.597122
2001-02-27,-0.007246,0.036232,0.0,0.043478,0.007246,-0.036232,0.02463768,0.0,0.057971,0.010145,...,0.017391,-0.028986,0.057971,-0.018841,-0.065217,0.021739,-0.046377,0.04058,0.086957,0.0
2001-02-28,-0.007692,0.069231,0.061538,0.076923,0.069231,-0.007692,0.01384615,0.0,0.038462,0.013846,...,0.023846,-0.030769,0.069231,-0.045385,-0.1,0.0,-0.054615,0.045385,0.1,12.307692
2001-03-01,-0.052239,0.029851,-0.052239,0.08209,0.0,-0.08209,-0.05522388,-0.08209,0.0,-0.061194,...,0.026866,-0.029851,0.067164,-0.040299,-0.097015,0.0,-0.056716,0.040299,0.097015,14.925373
2001-03-02,-0.108696,0.007246,-0.050725,0.115942,0.057971,-0.057971,-0.02173913,-0.050725,0.028986,-0.023188,...,0.02971,0.0,0.065217,-0.035507,-0.065217,0.0,-0.02971,0.035507,0.065217,10.144928
2001-03-05,-0.054795,0.006849,-0.054795,0.061644,0.0,-0.061644,0.02465753,0.0,0.075342,0.024658,...,0.069178,0.0,0.109589,-0.040411,-0.109589,0.0,-0.069178,0.040411,0.109589,2.054795


In [21]:
len(aal.columns)

379

In [22]:
list(aal.columns)

['Open/High',
 'Open/Low',
 'Open/Close',
 'High/Low',
 'High/Close',
 'Low/Close',
 'Open/Open 5 MA',
 'Open/Open 5 max',
 'Open/Open 5 min',
 'Open/Open 10 MA',
 'Open/Open 10 max',
 'Open/Open 10 min',
 'Open/High 5 MA',
 'Open/High 5 max',
 'Open/High 5 min',
 'Open/High 10 MA',
 'Open/High 10 max',
 'Open/High 10 min',
 'Open/Low 5 MA',
 'Open/Low 5 max',
 'Open/Low 5 min',
 'Open/Low 10 MA',
 'Open/Low 10 max',
 'Open/Low 10 min',
 'Open/Close 5 MA',
 'Open/Close 5 max',
 'Open/Close 5 min',
 'Open/Close 10 MA',
 'Open/Close 10 max',
 'Open/Close 10 min',
 'High/Open 5 MA',
 'High/Open 5 max',
 'High/Open 5 min',
 'High/Open 10 MA',
 'High/Open 10 max',
 'High/Open 10 min',
 'High/High 5 MA',
 'High/High 5 max',
 'High/High 5 min',
 'High/High 10 MA',
 'High/High 10 max',
 'High/High 10 min',
 'High/Low 5 MA',
 'High/Low 5 max',
 'High/Low 5 min',
 'High/Low 10 MA',
 'High/Low 10 max',
 'High/Low 10 min',
 'High/Close 5 MA',
 'High/Close 5 max',
 'High/Close 5 min',
 'High/Close 

# X/y spit

In [23]:
test_date = datetime(2015,1,1).strftime('%Y-%m-%d')
aal.dropna(inplace=True)
X_data = aal.drop('Return', axis=1)
y_data = aal['Return']

In [24]:
X_data.head(2)

Unnamed: 0_level_0,Open/High,Open/Low,Open/Close,High/Low,High/Close,Low/Close,Open/Open 5 MA,Open/Open 5 max,Open/Open 5 min,Open/Open 10 MA,...,Close 5 max/Close 5 min,Close 5 max/Close 10 MA,Close 5 max/Close 10 max,Close 5 max/Close 10 min,Close 5 min/Close 10 MA,Close 5 min/Close 10 max,Close 5 min/Close 10 min,Close 10 MA/Close 10 max,Close 10 MA/Close 10 min,Close 10 max/Close 10 min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-02-20,-0.015267,0.053435,0.045802,0.068702,0.061069,-0.007634,-0.012214,-0.038168,0.007634,-0.033588,...,0.091603,0.014504,-0.061069,0.091603,-0.077099,-0.152672,0.0,-0.075573,0.077099,0.152672
2001-02-21,-0.088889,0.0,-0.037037,0.088889,0.051852,-0.037037,-0.045926,-0.081481,0.0,-0.074074,...,0.088889,0.025926,-0.037037,0.088889,-0.062963,-0.125926,0.0,-0.062963,0.062963,0.125926


In [25]:
y_data.head(2)

Date
2001-02-20    2.290076
2001-02-21    2.962963
Name: Return, dtype: float64

In [27]:
extreme = 3
def series_to_binarized_columns(y):
    pos = y > extreme
    neg = y < -extreme
    meds = (y > -extreme) & (y < extreme)
    y = np.array([neg, meds, pos]).T
    return y

In [28]:
X_train = X_data[:test_date]
X_test = X_data[test_date:]
y_train = y_data[:test_date]
y_test = y_data[test_date:]

In [29]:
y_train = series_to_binarized_columns(y_train)
y_test = series_to_binarized_columns(y_test)

In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test)

# KERAS

In [36]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization, LSTM
from keras.optimizers import Adam
from keras.models import load_model

In [39]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import class_weight

In [33]:
X_train.shape

(3487, 378)

In [37]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=X_train.shape))
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(LSTM(64))
model.add(Dropout(0.1))

model.add(Dense(3, kernel_initializer='glorot_uniform'))
model.add(Activation('softmax'))

model.compile(optimizer=Adam(lr=1e-3),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [42]:
np.argmax(y_train, axis=1)

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [40]:
temp_y = np.argmax(y_train, axis=1)
cw = class_weight.compute_class_weight('balanced', [np.unique(y_train)], temp_y)

LOGGER.info('Class weights: ' + str(cw))

ValueError: classes should include all valid labels that can be in y

In [None]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), class_weight=cw, batch_size=128, epochs=300, verbose=2)