In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import itertools
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
aal = pdr.get_data_google('AAPL', '2000-01-01', '2017-01-01')

In [3]:
aal.drop('Volume', axis=1, inplace=True)
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-02-06,1.44,1.53,1.43,1.51
2001-02-07,1.48,1.49,1.42,1.48
2001-02-08,1.47,1.5,1.44,1.48
2001-02-09,1.46,1.49,1.33,1.37
2001-02-12,1.36,1.43,1.34,1.41


In [4]:
def _set_index_column_if_necessary(data: pd.DataFrame) -> pd.DataFrame:
    if 'Date' in data.columns:
        data.set_index('Date', inplace=True)
    return data

In [5]:
aal = _set_index_column_if_necessary(aal)
cols = aal.columns
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-02-06,1.44,1.53,1.43,1.51
2001-02-07,1.48,1.49,1.42,1.48
2001-02-08,1.47,1.5,1.44,1.48
2001-02-09,1.46,1.49,1.33,1.37
2001-02-12,1.36,1.43,1.34,1.41


In [6]:
cols

Index(['Open', 'High', 'Low', 'Close'], dtype='object')

In [7]:
def feature(data, first_col, second_col):
    return data[first_col]/data[second_col]

In [8]:
def create_extra_columns(data, cols):
    pool = []
    for left, right in itertools.product(cols, cols):
        pair1 = left + right
        pair2 = right + left
        if left != right and pair1 not in pool and pair2 not in pool:
            aal[left + '/' + right] = feature(data, left, right)
            pool.append(pair1)

### Parameters

In [9]:
days=[5, 10]
return_days = 2
extreme = 2
test_date = datetime(2013,1,1).strftime('%Y-%m-%d')

### Rolling features

In [10]:
for col, day in itertools.product(cols, days):
    aal[col + ' ' + str(day) + ' MA'] = aal[col].rolling(day).mean()
    aal[col + ' ' + str(day) + ' max'] = aal[col].rolling(day).max()
    aal[col + ' ' + str(day) + ' min'] = aal[col].rolling(day).min()
aal.dropna(inplace=True)

In [11]:
aal.columns

Index(['Open', 'High', 'Low', 'Close', 'Open 5 MA', 'Open 5 max', 'Open 5 min',
       'Open 10 MA', 'Open 10 max', 'Open 10 min', 'High 5 MA', 'High 5 max',
       'High 5 min', 'High 10 MA', 'High 10 max', 'High 10 min', 'Low 5 MA',
       'Low 5 max', 'Low 5 min', 'Low 10 MA', 'Low 10 max', 'Low 10 min',
       'Close 5 MA', 'Close 5 max', 'Close 5 min', 'Close 10 MA',
       'Close 10 max', 'Close 10 min'],
      dtype='object')

In [38]:
plain_features = list(filter(lambda col: '/' not in col, aal.columns))

In [13]:
create_extra_columns(aal, plain_features)

In [14]:
ret = 100*aal['Close'].pct_change(return_days).shift(-return_days)

In [15]:
# ret_dev = ret.rolling(10).std().fillna(1)

# ret_scaled = ret/ret_dev
# ret_scaled.head()

In [16]:
features_to_drop = list(filter(lambda col: '/' not in col, aal.columns))
aal.drop(features_to_drop, axis=1, inplace=True)

aal['Return'] = ret # ret_scaled
aal.head(10)

Unnamed: 0_level_0,Open/High,Open/Low,Open/Close,Open/Open 5 MA,Open/Open 5 max,Open/Open 5 min,Open/Open 10 MA,Open/Open 10 max,Open/Open 10 min,Open/High 5 MA,...,Close 5 max/Close 10 MA,Close 5 max/Close 10 max,Close 5 max/Close 10 min,Close 5 min/Close 10 MA,Close 5 min/Close 10 max,Close 5 min/Close 10 min,Close 10 MA/Close 10 max,Close 10 MA/Close 10 min,Close 10 max/Close 10 min,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-02-20,0.985612,1.053846,1.045802,0.988456,0.964789,1.007353,0.968883,0.925676,1.007353,0.962079,...,1.013466,0.94702,1.091603,0.92842,0.86755,1.0,0.934437,1.077099,1.152672,2.290076
2001-02-21,0.915493,1.0,0.962963,0.954479,0.921986,1.0,0.928571,0.878378,1.0,0.918079,...,1.02509,0.966216,1.091603,0.939068,0.885135,1.0,0.942568,1.064885,1.129771,-0.740741
2001-02-22,0.985507,1.054264,1.014925,1.0,0.964539,1.046154,0.979827,0.92517,1.046154,0.964539,...,1.035482,0.966216,1.091603,0.948588,0.885135,1.0,0.933108,1.054198,1.129771,3.731343
2001-02-23,0.985185,1.023077,0.992537,0.989583,0.970803,1.023077,0.967977,0.910959,1.023077,0.959596,...,0.994879,0.951049,1.038168,0.958303,0.916084,1.0,0.955944,1.043511,1.091603,2.985075
2001-02-26,0.964539,1.022556,0.978417,1.011905,0.992701,1.046154,0.997067,0.957746,1.046154,0.978417,...,1.01534,0.972028,1.061069,0.956903,0.916084,1.0,0.957343,1.045038,1.091603,-6.47482
2001-02-27,0.992806,1.037594,1.0,1.02526,1.0,1.061538,1.010249,0.971831,1.061538,0.992806,...,1.01757,0.972028,1.061069,0.980966,0.937063,1.022901,0.955245,1.042748,1.091603,-2.898551
2001-02-28,0.992806,1.069767,1.061538,1.013216,1.0,1.037594,1.013216,0.978723,1.061538,0.99711,...,1.022811,0.972028,1.069231,0.956586,0.909091,1.0,0.95035,1.045385,1.1,6.153846
2001-03-01,0.947761,1.03252,0.947761,0.94494,0.92029,1.0,0.939349,0.900709,1.0,0.922965,...,1.026588,0.972028,1.069231,0.960118,0.909091,1.0,0.946853,1.041538,1.1,8.955224
2001-03-02,0.89726,1.007692,0.949275,0.977612,0.949275,1.031496,0.976155,0.949275,1.031496,0.937053,...,1.030393,1.0,1.069231,0.963677,0.935252,1.0,0.970504,1.037692,1.069231,11.594203
2001-03-05,0.945205,1.007299,0.945205,1.026786,1.0,1.086614,1.026786,1.0,1.086614,0.980114,...,1.074319,1.0,1.123077,0.956586,0.890411,1.0,0.930822,1.045385,1.123077,4.109589


In [17]:
len(aal.columns)

379

In [18]:
list(aal.columns)[:5]

['Open/High', 'Open/Low', 'Open/Close', 'Open/Open 5 MA', 'Open/Open 5 max']

# X/y spit

In [19]:
aal.dropna(inplace=True)
X_data = aal.drop('Return', axis=1)
y_data = aal['Return']

In [20]:
X_data.head(2)

Unnamed: 0_level_0,Open/High,Open/Low,Open/Close,Open/Open 5 MA,Open/Open 5 max,Open/Open 5 min,Open/Open 10 MA,Open/Open 10 max,Open/Open 10 min,Open/High 5 MA,...,Close 5 max/Close 5 min,Close 5 max/Close 10 MA,Close 5 max/Close 10 max,Close 5 max/Close 10 min,Close 5 min/Close 10 MA,Close 5 min/Close 10 max,Close 5 min/Close 10 min,Close 10 MA/Close 10 max,Close 10 MA/Close 10 min,Close 10 max/Close 10 min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-02-20,0.985612,1.053846,1.045802,0.988456,0.964789,1.007353,0.968883,0.925676,1.007353,0.962079,...,1.091603,1.013466,0.94702,1.091603,0.92842,0.86755,1.0,0.934437,1.077099,1.152672
2001-02-21,0.915493,1.0,0.962963,0.954479,0.921986,1.0,0.928571,0.878378,1.0,0.918079,...,1.091603,1.02509,0.966216,1.091603,0.939068,0.885135,1.0,0.942568,1.064885,1.129771


In [21]:
y_data.head(2)

Date
2001-02-20    2.290076
2001-02-21   -0.740741
Name: Return, dtype: float64

In [22]:
def series_to_binarized_columns(y):
    pos = y > extreme
    neg = y < -extreme
    meds = (y > -extreme) & (y < extreme)
    y = np.array([neg, meds, pos]).T
    return y

In [23]:
X_train = X_data[:test_date]
X_test = X_data[test_date:]
y_train = y_data[:test_date]
y_test = y_data[test_date:]

In [24]:
y_train = series_to_binarized_columns(y_train)
y_test = series_to_binarized_columns(y_test)

In [25]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test)

# KERAS

In [26]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization, LSTM
from keras.optimizers import Adam, RMSprop
from keras.models import load_model

Using TensorFlow backend.


In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import class_weight

In [28]:
def expand(X, axis=1):
    return np.expand_dims(X, axis=axis)

In [29]:
temp_y = np.argmax(y_train, axis=1)
cw = class_weight.compute_class_weight('balanced', np.unique(temp_y), temp_y)
cw

array([ 1.50656566,  0.6750396 ,  1.16980392])

In [30]:
input_shape = (1, 1, X_train.shape[1])
input_shape

(1, 1, 378)

In [31]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, batch_input_shape=input_shape, stateful=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False, stateful=True))
model.add(Dropout(0.2))
model.add(Dense(3, kernel_initializer='glorot_uniform'))
model.add(Activation('softmax'))

model.compile(optimizer=Adam(lr=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [40]:
for i in range(20):
    print(f'Iteration {i}')
    model.fit(expand(X_train), y_train, class_weight=cw, batch_size=1, epochs=5, shuffle=False, verbose=2)
    model.reset_states()

#model.fit(expand(X_train), y_train, class_weight=cw, batch_size=1, epochs=100, shuffle=False, verbose=2) # , validation_data=(expand(X_test), y_test)

Iteration 0
Epoch 1/5
 - 11s - loss: 0.7929 - acc: 0.6581
Epoch 2/5
 - 10s - loss: 0.7690 - acc: 0.6587
Epoch 3/5
 - 11s - loss: 0.7946 - acc: 0.6490
Epoch 4/5
 - 11s - loss: 0.7974 - acc: 0.6450
Epoch 5/5
 - 11s - loss: 0.7769 - acc: 0.6534
Iteration 1
Epoch 1/5
 - 11s - loss: 0.7848 - acc: 0.6504
Epoch 2/5
 - 11s - loss: 0.7914 - acc: 0.6447
Epoch 3/5
 - 12s - loss: 0.7858 - acc: 0.6564
Epoch 4/5
 - 12s - loss: 0.7944 - acc: 0.6400
Epoch 5/5
 - 12s - loss: 0.7625 - acc: 0.6718
Iteration 2
Epoch 1/5
 - 11s - loss: 0.7775 - acc: 0.6607
Epoch 2/5
 - 11s - loss: 0.7818 - acc: 0.6574
Epoch 3/5
 - 11s - loss: 0.7770 - acc: 0.6547
Epoch 4/5
 - 11s - loss: 0.7820 - acc: 0.6534
Epoch 5/5
 - 11s - loss: 0.7896 - acc: 0.6423
Iteration 3
Epoch 1/5
 - 11s - loss: 0.7703 - acc: 0.6514
Epoch 2/5
 - 11s - loss: 0.7931 - acc: 0.6480
Epoch 3/5
 - 11s - loss: 0.7689 - acc: 0.6624
Epoch 4/5
 - 11s - loss: 0.7747 - acc: 0.6581
Epoch 5/5
 - 11s - loss: 0.7798 - acc: 0.6527
Iteration 4
Epoch 1/5
 - 11s - l

In [41]:
class ModelEvaluator:
    def __init__(self,
                 model):
        self.model = model

    def evaluate(self, X, y, ret):
        all_returns = []
        for c in [0.33 + x/50 for x in range(35)]:
            print('=================')
            print(f'Certainty is {c}')
            returns = self.calculate_returns(X, y, ret, c)
            cont = self.print_returns_distribution(returns)
            if not cont:
                break

    def calculate_returns(self, X, y, ret, certainty):
        predicted = self.model.predict(expand(X), batch_size=1)
        real_ups = y[:, 2]
        real_downs = y[:, 0]
        predicted_ups = (predicted[:, 2] > certainty) & (np.argmax(predicted, axis=1) == 2)
        predicted_downs = (predicted[:, 0] > certainty) & (np.argmax(predicted, axis=1) == 0)
        returns = np.append(ret[predicted_ups],
                            (-1 * ret[predicted_downs]))

        print('Real ups count: {}'.format(pd.value_counts(real_ups[predicted_ups])))
        print('Real downs count: {}'.format(pd.value_counts(real_downs[predicted_downs])))
        return returns

    def print_returns_distribution(self, returns):
        lose = np.sum(returns[returns < 0])
        win = np.sum(returns[returns > 0])
        if lose == 0 and win == 0:
            return False
        print('Negative returns:', str(lose))
        print('Positive returns:', str(win))
        print('Pos/Neg ratio:', str(win / (lose * -1)))
        print('Sum of returns:', str(np.sum(returns)))
        return True

In [42]:
evaluator = ModelEvaluator(model)
evaluator.evaluate(X_test, y_test, y_data[test_date:]) # [:988]

Certainty is 0.33
Real ups count: False    87
True     32
dtype: int64
Real downs count: False    55
True     13
dtype: int64
Negative returns: -134.352541432
Positive returns: 197.050396675
Pos/Neg ratio: 1.46666668583
Sum of returns: 62.6978552429
Certainty is 0.35000000000000003
Real ups count: False    84
True     31
dtype: int64
Real downs count: False    59
True     15
dtype: int64
Negative returns: -137.501606614
Positive returns: 205.058775407
Pos/Neg ratio: 1.49131912315
Sum of returns: 67.5571687937
Certainty is 0.37
Real ups count: False    84
True     30
dtype: int64
Real downs count: False    57
True     15
dtype: int64
Negative returns: -135.812691557
Positive returns: 201.37289252
Pos/Neg ratio: 1.48272514308
Sum of returns: 65.5602009637
Certainty is 0.39
Real ups count: False    84
True     29
dtype: int64
Real downs count: False    55
True     15
dtype: int64
Negative returns: -132.403422114
Positive returns: 195.62178333
Pos/Neg ratio: 1.4774677286
Sum of returns: 63



Real ups count: Series([], dtype: int64)
Real downs count: True    1
dtype: int64
Negative returns: 0.0
Positive returns: 2.1906764809
Pos/Neg ratio: -inf
Sum of returns: 2.1906764809
Certainty is 1.01
Real ups count: Series([], dtype: int64)
Real downs count: Series([], dtype: int64)
