In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import itertools
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
aal = pdr.get_data_google('AAPL', '2000-01-01', '2017-01-01')

In [3]:
aal.drop('Volume', axis=1, inplace=True)
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-02-06,1.44,1.53,1.43,1.51
2001-02-07,1.48,1.49,1.42,1.48
2001-02-08,1.47,1.5,1.44,1.48
2001-02-09,1.46,1.49,1.33,1.37
2001-02-12,1.36,1.43,1.34,1.41


In [4]:
def _set_index_column_if_necessary(data: pd.DataFrame) -> pd.DataFrame:
    if 'Date' in data.columns:
        data.set_index('Date', inplace=True)
    return data

In [5]:
aal = _set_index_column_if_necessary(aal)
cols = aal.columns
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-02-06,1.44,1.53,1.43,1.51
2001-02-07,1.48,1.49,1.42,1.48
2001-02-08,1.47,1.5,1.44,1.48
2001-02-09,1.46,1.49,1.33,1.37
2001-02-12,1.36,1.43,1.34,1.41


In [6]:
cols

Index(['Open', 'High', 'Low', 'Close'], dtype='object')

In [7]:
return_days = 5

In [8]:
def feature(data, first_col, second_col):
    return data[first_col]/data[second_col]

In [9]:
def create_extra_columns(data, cols):
    pool = []
    for left, right in itertools.product(cols, cols):
        pair1 = left + right
        pair2 = right + left
        if left != right and pair1 not in pool and pair2 not in pool:
            aal[left + '/' + right] = feature(data, left, right)
            pool.append(pair1)

In [10]:
days=[5, 10]

for col, day in itertools.product(cols, days):
    aal[col + ' ' + str(day) + ' MA'] = aal[col].rolling(day).mean()
    aal[col + ' ' + str(day) + ' max'] = aal[col].rolling(day).max()
    aal[col + ' ' + str(day) + ' min'] = aal[col].rolling(day).min()
aal.dropna(inplace=True)

In [11]:
aal.columns

Index(['Open', 'High', 'Low', 'Close', 'Open 5 MA', 'Open 5 max', 'Open 5 min',
       'Open 10 MA', 'Open 10 max', 'Open 10 min', 'High 5 MA', 'High 5 max',
       'High 5 min', 'High 10 MA', 'High 10 max', 'High 10 min', 'Low 5 MA',
       'Low 5 max', 'Low 5 min', 'Low 10 MA', 'Low 10 max', 'Low 10 min',
       'Close 5 MA', 'Close 5 max', 'Close 5 min', 'Close 10 MA',
       'Close 10 max', 'Close 10 min'],
      dtype='object')

In [12]:
rolling_features = list(filter(lambda col: '/' not in col, aal.columns))

In [13]:
create_extra_columns(aal, rolling_features)

In [14]:
ret = 100*aal['Close'].pct_change(return_days).shift(-return_days)
ret_dev = ret.rolling(return_days).std().fillna(1)

In [15]:
ret_scaled = ret/ret_dev
ret_scaled.head()

Date
2001-02-20    5.343511
2001-02-21   -3.703704
2001-02-22    0.000000
2001-02-23    2.985075
2001-02-26    1.323929
Name: Close, dtype: float64

In [16]:
features_to_drop = list(filter(lambda col: '/' not in col, aal.columns))
aal.drop(features_to_drop, axis=1, inplace=True)

aal['Return'] = ret  # ret_scaled
aal.head(10)

Unnamed: 0_level_0,Open/High,Open/Low,Open/Close,Open/Open 5 MA,Open/Open 5 max,Open/Open 5 min,Open/Open 10 MA,Open/Open 10 max,Open/Open 10 min,Open/High 5 MA,...,Close 5 max/Close 10 MA,Close 5 max/Close 10 max,Close 5 max/Close 10 min,Close 5 min/Close 10 MA,Close 5 min/Close 10 max,Close 5 min/Close 10 min,Close 10 MA/Close 10 max,Close 10 MA/Close 10 min,Close 10 max/Close 10 min,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-02-20,0.985612,1.053846,1.045802,0.988456,0.964789,1.007353,0.968883,0.925676,1.007353,0.962079,...,1.013466,0.94702,1.091603,0.92842,0.86755,1.0,0.934437,1.077099,1.152672,5.343511
2001-02-21,0.915493,1.0,0.962963,0.954479,0.921986,1.0,0.928571,0.878378,1.0,0.918079,...,1.02509,0.966216,1.091603,0.939068,0.885135,1.0,0.942568,1.064885,1.129771,-3.703704
2001-02-22,0.985507,1.054264,1.014925,1.0,0.964539,1.046154,0.979827,0.92517,1.046154,0.964539,...,1.035482,0.966216,1.091603,0.948588,0.885135,1.0,0.933108,1.054198,1.129771,0.0
2001-02-23,0.985185,1.023077,0.992537,0.989583,0.970803,1.023077,0.967977,0.910959,1.023077,0.959596,...,0.994879,0.951049,1.038168,0.958303,0.916084,1.0,0.955944,1.043511,1.091603,2.985075
2001-02-26,0.964539,1.022556,0.978417,1.011905,0.992701,1.046154,0.997067,0.957746,1.046154,0.978417,...,1.01534,0.972028,1.061069,0.956903,0.916084,1.0,0.957343,1.045038,1.091603,5.035971
2001-02-27,0.992806,1.037594,1.0,1.02526,1.0,1.061538,1.010249,0.971831,1.061538,0.992806,...,1.01757,0.972028,1.061069,0.980966,0.937063,1.022901,0.955245,1.042748,1.091603,11.594203
2001-02-28,0.992806,1.069767,1.061538,1.013216,1.0,1.037594,1.013216,0.978723,1.061538,0.99711,...,1.022811,0.972028,1.069231,0.956586,0.909091,1.0,0.95035,1.045385,1.1,16.923077
2001-03-01,0.947761,1.03252,0.947761,0.94494,0.92029,1.0,0.939349,0.900709,1.0,0.922965,...,1.026588,0.972028,1.069231,0.960118,0.909091,1.0,0.946853,1.041538,1.1,11.19403
2001-03-02,0.89726,1.007692,0.949275,0.977612,0.949275,1.031496,0.976155,0.949275,1.031496,0.937053,...,1.030393,1.0,1.069231,0.963677,0.935252,1.0,0.970504,1.037692,1.069231,5.072464
2001-03-05,0.945205,1.007299,0.945205,1.026786,1.0,1.086614,1.026786,1.0,1.086614,0.980114,...,1.074319,1.0,1.123077,0.956586,0.890411,1.0,0.930822,1.045385,1.123077,-8.90411


In [17]:
len(aal.columns)

379

In [18]:
list(aal.columns)[:5]

['Open/High', 'Open/Low', 'Open/Close', 'Open/Open 5 MA', 'Open/Open 5 max']

# X/y spit

In [19]:
test_date = datetime(2013,1,1).strftime('%Y-%m-%d')
aal.dropna(inplace=True)
X_data = aal.drop('Return', axis=1)
y_data = aal['Return']

In [20]:
X_data.head(2)

Unnamed: 0_level_0,Open/High,Open/Low,Open/Close,Open/Open 5 MA,Open/Open 5 max,Open/Open 5 min,Open/Open 10 MA,Open/Open 10 max,Open/Open 10 min,Open/High 5 MA,...,Close 5 max/Close 5 min,Close 5 max/Close 10 MA,Close 5 max/Close 10 max,Close 5 max/Close 10 min,Close 5 min/Close 10 MA,Close 5 min/Close 10 max,Close 5 min/Close 10 min,Close 10 MA/Close 10 max,Close 10 MA/Close 10 min,Close 10 max/Close 10 min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-02-20,0.985612,1.053846,1.045802,0.988456,0.964789,1.007353,0.968883,0.925676,1.007353,0.962079,...,1.091603,1.013466,0.94702,1.091603,0.92842,0.86755,1.0,0.934437,1.077099,1.152672
2001-02-21,0.915493,1.0,0.962963,0.954479,0.921986,1.0,0.928571,0.878378,1.0,0.918079,...,1.091603,1.02509,0.966216,1.091603,0.939068,0.885135,1.0,0.942568,1.064885,1.129771


In [21]:
y_data.head(2)

Date
2001-02-20    5.343511
2001-02-21   -3.703704
Name: Return, dtype: float64

In [22]:
extreme = 4
def series_to_binarized_columns(y):
    pos = y > extreme
    neg = y < -extreme
    meds = (y > -extreme) & (y < extreme)
    y = np.array([neg, meds, pos]).T
    return y

In [23]:
X_train = X_data[:test_date]
X_test = X_data[test_date:]
y_train = y_data[:test_date]
y_test = y_data[test_date:]

In [24]:
y_train = series_to_binarized_columns(y_train)
y_test = series_to_binarized_columns(y_test)

In [25]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test)

# KERAS

In [26]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization, LSTM
from keras.optimizers import Adam, RMSprop
from keras.models import load_model

Using TensorFlow backend.


In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import class_weight

In [28]:
def expand(X):
    return np.expand_dims(X, axis=1)

In [29]:
temp_y = np.argmax(y_train, axis=1)
cw = class_weight.compute_class_weight('balanced', np.unique(temp_y), temp_y)
cw

array([ 1.98074369,  0.58801498,  1.25864979])

In [38]:
input_shape = (19, 1, X_train.shape[1])
input_shape

(19, 1, 378)

In [67]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, batch_input_shape=input_shape, stateful=True))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=True, stateful=True))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=False, stateful=True))
model.add(Dropout(0.2))
# model.add(LSTM(32, return_sequences=False))
model.add(Dense(3, kernel_initializer='glorot_uniform'))
model.add(Activation('softmax'))

model.compile(optimizer=Adam(lr=0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [68]:
expand(X_train).shape

(2983, 1, 378)

In [72]:
model.fit(expand(X_train), y_train, class_weight=cw, batch_size=19, epochs=300, shuffle=False, verbose=2) # , validation_data=(expand(X_test), y_test)

Epoch 1/300
 - 1s - loss: 0.7104 - acc: 0.6926
Epoch 2/300
 - 1s - loss: 0.6571 - acc: 0.7231
Epoch 3/300
 - 1s - loss: 0.6394 - acc: 0.7342
Epoch 4/300
 - 1s - loss: 0.6448 - acc: 0.7311
Epoch 5/300
 - 1s - loss: 0.6560 - acc: 0.7191
Epoch 6/300
 - 1s - loss: 0.6546 - acc: 0.7201
Epoch 7/300
 - 1s - loss: 0.6554 - acc: 0.7254
Epoch 8/300
 - 1s - loss: 0.6352 - acc: 0.7355
Epoch 9/300
 - 1s - loss: 0.6416 - acc: 0.7281
Epoch 10/300
 - 1s - loss: 0.6552 - acc: 0.7201
Epoch 11/300
 - 1s - loss: 0.6535 - acc: 0.7124
Epoch 12/300
 - 1s - loss: 0.6488 - acc: 0.7204
Epoch 13/300
 - 1s - loss: 0.6511 - acc: 0.7285
Epoch 14/300
 - 1s - loss: 0.6430 - acc: 0.7204
Epoch 15/300
 - 1s - loss: 0.6451 - acc: 0.7254
Epoch 16/300
 - 1s - loss: 0.6442 - acc: 0.7372
Epoch 17/300
 - 1s - loss: 0.6341 - acc: 0.7231
Epoch 18/300
 - 1s - loss: 0.6528 - acc: 0.7258
Epoch 19/300
 - 1s - loss: 0.6494 - acc: 0.7295
Epoch 20/300
 - 1s - loss: 0.6362 - acc: 0.7301
Epoch 21/300
 - 1s - loss: 0.6612 - acc: 0.7204
E

<keras.callbacks.History at 0x28ac229d080>

In [73]:
class ModelEvaluator:
    def __init__(self,
                 model):
        self.model = model

    def evaluate(self, X, y, ret):
        all_returns = []
        for c in [0.33 + x/50 for x in range(35)]:
            print('=================')
            print(f'Certainty is {c}')
            returns = self.calculate_returns(X, y, ret, c)
            cont = self.print_returns_distribution(returns)
            if not cont:
                break

    def calculate_returns(self, X, y, ret, certainty):
        predicted = self.model.predict(expand(X), batch_size=19)
        real_ups = y[:, 2]
        real_downs = y[:, 0]
        predicted_ups = (predicted[:, 2] > certainty) & (np.argmax(predicted, axis=1) == 2)
        predicted_downs = (predicted[:, 0] > certainty) & (np.argmax(predicted, axis=1) == 0)
        returns = np.append(ret[predicted_ups],
                            (-1 * ret[predicted_downs]))

        print('Real ups count: {}'.format(pd.value_counts(real_ups[predicted_ups])))
        print('Real downs count: {}'.format(pd.value_counts(real_downs[predicted_downs])))
        return returns

    def print_returns_distribution(self, returns):
        lose = np.sum(returns[returns < 0])
        win = np.sum(returns[returns > 0])
        if lose == 0 and win == 0:
            return False
        print('Negative returns:', str(lose))
        print('Positive returns:', str(win))
        print('Pos/Neg ratio:', str(win / (lose * -1)))
        print('Sum of returns:', str(np.sum(returns)))
        return True

In [74]:
evaluator = ModelEvaluator(model)
evaluator.evaluate(X_test[:988], y_test[:988], ret[test_date:][:988])

Certainty is 0.33
Real ups count: False    61
True      9
dtype: int64
Real downs count: False    31
True      4
dtype: int64
Negative returns: -108.879763026
Positive returns: 158.923187075
Pos/Neg ratio: 1.45962098611
Sum of returns: 50.0434240497
Certainty is 0.35000000000000003
Real ups count: False    62
True      6
dtype: int64
Real downs count: False    24
True      5
dtype: int64
Negative returns: -108.250084736
Positive returns: 143.692646806
Pos/Neg ratio: 1.32741371202
Sum of returns: 35.4425620699
Certainty is 0.37
Real ups count: False    61
True      6
dtype: int64
Real downs count: False    23
True      5
dtype: int64
Negative returns: -106.915001634
Positive returns: 142.04078745
Pos/Neg ratio: 1.32853935631
Sum of returns: 35.1257858166
Certainty is 0.39
Real ups count: False    61
True      6
dtype: int64
Real downs count: False    22
True      5
dtype: int64
Negative returns: -104.23708485
Positive returns: 138.921754987
Pos/Neg ratio: 1.3327478909
Sum of returns: 34