In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import itertools
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
aal = pdr.get_data_google('IBM', '2000-01-01', '2017-01-01')

In [3]:
aal.drop('Volume', axis=1, inplace=True)
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-02-07,113.4,117.6,112.67,116.91
2001-02-08,116.91,118.2,114.03,114.1
2001-02-09,114.03,114.99,111.0,112.0
2001-02-12,110.55,115.72,110.3,114.9
2001-02-13,114.15,117.38,113.07,113.75


In [4]:
def _set_index_column_if_necessary(data: pd.DataFrame) -> pd.DataFrame:
    if 'Date' in data.columns:
        data.set_index('Date', inplace=True)
    return data

In [5]:
aal = _set_index_column_if_necessary(aal)
cols = aal.columns
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-02-07,113.4,117.6,112.67,116.91
2001-02-08,116.91,118.2,114.03,114.1
2001-02-09,114.03,114.99,111.0,112.0
2001-02-12,110.55,115.72,110.3,114.9
2001-02-13,114.15,117.38,113.07,113.75


In [6]:
cols

Index(['Open', 'High', 'Low', 'Close'], dtype='object')

In [7]:
def feature(data, first_col, second_col):
    return data[first_col]/data[second_col]

In [8]:
def create_extra_columns(data, cols):
    pool = []
    for left, right in itertools.product(cols, cols):
        pair1 = left + right
        pair2 = right + left
        if left != right and pair1 not in pool and pair2 not in pool:
            aal[left + '/' + right] = feature(data, left, right)
            pool.append(pair1)

### Parameters

In [9]:
days=[5, 20]
return_days = 3
extreme = 2
test_date = datetime(2013,1,1).strftime('%Y-%m-%d')

### Rolling features

In [10]:
for col, day in itertools.product(cols, days):
    aal[col + ' ' + str(day) + ' MA'] = aal[col].rolling(day).mean()
    aal[col + ' ' + str(day) + ' max'] = aal[col].rolling(day).max()
    aal[col + ' ' + str(day) + ' min'] = aal[col].rolling(day).min()
aal.dropna(inplace=True)

In [11]:
aal.columns

Index(['Open', 'High', 'Low', 'Close', 'Open 5 MA', 'Open 5 max', 'Open 5 min',
       'Open 20 MA', 'Open 20 max', 'Open 20 min', 'High 5 MA', 'High 5 max',
       'High 5 min', 'High 20 MA', 'High 20 max', 'High 20 min', 'Low 5 MA',
       'Low 5 max', 'Low 5 min', 'Low 20 MA', 'Low 20 max', 'Low 20 min',
       'Close 5 MA', 'Close 5 max', 'Close 5 min', 'Close 20 MA',
       'Close 20 max', 'Close 20 min'],
      dtype='object')

In [12]:
plain_features = list(filter(lambda col: '/' not in col, aal.columns))

In [13]:
create_extra_columns(aal, plain_features)

In [14]:
ret = 100*aal['Close'].pct_change(return_days).shift(-return_days)

In [15]:
# ret_dev = ret.rolling(10).std().fillna(1)

# ret_scaled = ret/ret_dev
# ret_scaled.head()

In [16]:
features_to_drop = list(filter(lambda col: '/' not in col, aal.columns))
aal.drop(features_to_drop, axis=1, inplace=True)

aal['Return'] = ret # ret_scaled
aal.head(10)

Unnamed: 0_level_0,Open/High,Open/Low,Open/Close,Open/Open 5 MA,Open/Open 5 max,Open/Open 5 min,Open/Open 20 MA,Open/Open 20 max,Open/Open 20 min,Open/High 5 MA,...,Close 5 max/Close 20 MA,Close 5 max/Close 20 max,Close 5 max/Close 20 min,Close 5 min/Close 20 MA,Close 5 min/Close 20 max,Close 5 min/Close 20 min,Close 20 MA/Close 20 max,Close 20 MA/Close 20 min,Close 20 max/Close 20 min,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-03-07,0.99261,1.016941,0.99907,1.033551,1.0,1.082075,0.983209,0.919083,1.082075,1.001771,...,0.984417,0.919938,1.076577,0.936363,0.875032,1.024024,0.934501,1.093619,1.17027,-11.213389
2001-03-08,0.998416,1.017083,1.006575,1.015483,0.997394,1.042408,0.98345,0.916688,1.079255,0.997413,...,0.989143,0.920962,1.076577,0.940858,0.876006,1.024024,0.931071,1.088393,1.168969,-7.588992
2001-03-09,0.999905,1.072523,1.057508,0.993039,0.977199,1.021301,0.968831,0.898973,1.057402,0.982043,...,0.995926,0.920962,1.083191,0.919437,0.850231,1.0,0.92473,1.087622,1.176151,-4.360963
2001-03-12,0.994919,1.030309,1.025238,0.934571,0.911121,1.0,0.910092,0.838185,1.0,0.928138,...,1.003597,0.920962,1.126296,0.89106,0.817691,1.0,0.917661,1.122259,1.222955,0.073306
2001-03-13,0.966071,1.009554,0.966562,0.927588,0.885063,1.0,0.890458,0.814212,1.0,0.918947,...,1.011388,0.920962,1.126296,0.897977,0.817691,1.0,0.910593,1.113614,1.222955,-8.425653
2001-03-14,0.971689,1.029569,1.012005,0.958565,0.896706,1.010515,0.90749,0.822774,1.010515,0.945699,...,1.010157,0.911714,1.121209,0.900953,0.813153,1.0,0.902548,1.109936,1.229781,-2.485257
2001-03-15,0.980893,1.010952,1.004604,0.979392,0.914286,1.009464,0.914207,0.821918,1.009464,0.962657,...,0.950849,0.850231,1.045598,0.909383,0.813153,1.0,0.894181,1.099647,1.229781,-7.597321
2001-03-16,0.999574,1.064626,1.042175,0.980167,0.959142,1.0,0.904067,0.819729,1.0,0.962978,...,0.954423,0.855565,1.092009,0.874006,0.783478,1.0,0.896422,1.144156,1.27636,-1.132075
2001-03-19,0.980603,1.020179,0.982721,0.963779,0.94693,1.0,0.886192,0.794413,1.0,0.944081,...,0.964906,0.882422,1.092009,0.883606,0.808072,1.0,0.914516,1.131726,1.237514,-3.779698
2001-03-20,0.995328,1.039728,1.037373,0.977379,0.953174,1.006593,0.902116,0.836148,1.006593,0.963116,...,0.947936,0.877502,1.08222,0.875918,0.810836,1.0,0.925698,1.141659,1.233296,6.092865


In [17]:
len(aal.columns)

379

In [18]:
list(aal.columns)[:5]

['Open/High', 'Open/Low', 'Open/Close', 'Open/Open 5 MA', 'Open/Open 5 max']

# X/y spit

In [19]:
aal.dropna(inplace=True)
X_data = aal.drop('Return', axis=1)
y_data = aal['Return']

In [20]:
X_data.head(2)

Unnamed: 0_level_0,Open/High,Open/Low,Open/Close,Open/Open 5 MA,Open/Open 5 max,Open/Open 5 min,Open/Open 20 MA,Open/Open 20 max,Open/Open 20 min,Open/High 5 MA,...,Close 5 max/Close 5 min,Close 5 max/Close 20 MA,Close 5 max/Close 20 max,Close 5 max/Close 20 min,Close 5 min/Close 20 MA,Close 5 min/Close 20 max,Close 5 min/Close 20 min,Close 20 MA/Close 20 max,Close 20 MA/Close 20 min,Close 20 max/Close 20 min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-03-07,0.99261,1.016941,0.99907,1.033551,1.0,1.082075,0.983209,0.919083,1.082075,1.001771,...,1.05132,0.984417,0.919938,1.076577,0.936363,0.875032,1.024024,0.934501,1.093619,1.17027
2001-03-08,0.998416,1.017083,1.006575,1.015483,0.997394,1.042408,0.98345,0.916688,1.079255,0.997413,...,1.05132,0.989143,0.920962,1.076577,0.940858,0.876006,1.024024,0.931071,1.088393,1.168969


In [21]:
y_data.head(2)

Date
2001-03-07   -11.213389
2001-03-08    -7.588992
Name: Return, dtype: float64

In [22]:
def series_to_binarized_columns(y):
    pos = y > extreme
    neg = y < -extreme
    meds = (y > -extreme) & (y < extreme)
    y = np.array([neg, meds, pos]).T
    return y

In [23]:
X_train = X_data[:test_date]
X_test = X_data[test_date:]
y_train = y_data[:test_date]
y_test = y_data[test_date:]

In [24]:
y_train = series_to_binarized_columns(y_train)
y_test = series_to_binarized_columns(y_test)

In [25]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test)

# KERAS

In [26]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization, LSTM
from keras.optimizers import Adam, RMSprop
from keras.models import load_model

Using TensorFlow backend.


In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import class_weight

In [28]:
def expand(X, axis=1):
    return np.expand_dims(X, axis=axis)

In [29]:
temp_y = np.argmax(y_train, axis=1)
cw = class_weight.compute_class_weight('balanced', np.unique(temp_y), temp_y)
cw

array([ 2.10403397,  0.50151822,  1.88403042])

In [30]:
input_shape = (1, 1, X_train.shape[1])
input_shape

(1, 1, 378)

In [31]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, batch_input_shape=input_shape, stateful=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False, stateful=True))
model.add(Dropout(0.2))
model.add(Dense(3, kernel_initializer='glorot_uniform'))
model.add(Activation('softmax'))

model.compile(optimizer=Adam(lr=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [32]:
for i in range(20):
    print(f'Iteration {i}')
    model.fit(expand(X_train), y_train, class_weight=cw, batch_size=1, epochs=5, shuffle=False, verbose=2)
    model.reset_states()

#model.fit(expand(X_train), y_train, class_weight=cw, batch_size=1, epochs=100, shuffle=False, verbose=2) # , validation_data=(expand(X_test), y_test)

Iteration 0
Epoch 1/5
 - 12s - loss: 0.8506 - acc: 0.6670
Epoch 2/5
 - 11s - loss: 0.8359 - acc: 0.6626
Epoch 3/5
 - 11s - loss: 0.8365 - acc: 0.6616
Epoch 4/5
 - 10s - loss: 0.8272 - acc: 0.6650
Epoch 5/5
 - 10s - loss: 0.8259 - acc: 0.6694
Iteration 1
Epoch 1/5
 - 10s - loss: 0.8285 - acc: 0.6727
Epoch 2/5
 - 10s - loss: 0.8189 - acc: 0.6747
Epoch 3/5
 - 11s - loss: 0.8209 - acc: 0.6720
Epoch 4/5
 - 11s - loss: 0.8117 - acc: 0.6751
Epoch 5/5
 - 12s - loss: 0.8090 - acc: 0.6794
Iteration 2
Epoch 1/5
 - 12s - loss: 0.8069 - acc: 0.6781
Epoch 2/5
 - 11s - loss: 0.8051 - acc: 0.6768
Epoch 3/5
 - 11s - loss: 0.8006 - acc: 0.6808
Epoch 4/5
 - 11s - loss: 0.7932 - acc: 0.6805
Epoch 5/5
 - 12s - loss: 0.7852 - acc: 0.6858
Iteration 3
Epoch 1/5
 - 12s - loss: 0.7882 - acc: 0.6868
Epoch 2/5
 - 12s - loss: 0.7772 - acc: 0.6949
Epoch 3/5
 - 11s - loss: 0.7787 - acc: 0.6956
Epoch 4/5
 - 12s - loss: 0.7760 - acc: 0.6939
Epoch 5/5
 - 12s - loss: 0.7714 - acc: 0.6986
Iteration 4
Epoch 1/5
 - 12s - l

In [33]:
class ModelEvaluator:
    def __init__(self,
                 model):
        self.model = model

    def evaluate(self, X, y, ret):
        all_returns = []
        for c in [0.34 + x/50 for x in range(35)]:
            print('=================')
            print(f'Certainty is {c}')
            returns = self.calculate_returns(X, y, ret, c)
            cont = self.print_returns_distribution(returns)
            if not cont:
                break

    def calculate_returns(self, X, y, ret, certainty):
        predicted = self.model.predict(expand(X), batch_size=1)
        real_ups = y[:, 2]
        real_downs = y[:, 0]
        predicted_ups = (predicted[:, 2] > certainty) & (np.argmax(predicted, axis=1) == 2)
        predicted_downs = (predicted[:, 0] > certainty) & (np.argmax(predicted, axis=1) == 0)
        returns = np.append(ret[predicted_ups],
                            (-1 * ret[predicted_downs]))

        print('Real ups count: {}'.format(pd.value_counts(real_ups[predicted_ups])))
        print('Real downs count: {}'.format(pd.value_counts(real_downs[predicted_downs])))
        return returns

    def print_returns_distribution(self, returns):
        lose = np.sum(returns[returns < 0])
        win = np.sum(returns[returns > 0])
        if lose == 0 and win == 0:
            return False
        print('Negative returns:', str(lose))
        print('Positive returns:', str(win))
        print('Pos/Neg ratio:', str(win / (lose * -1)))
        print('Sum of returns:', str(np.sum(returns)))
        return True

In [34]:
evaluator = ModelEvaluator(model)
evaluator.evaluate(X_test, y_test, y_data[test_date:]) # [:988]

Certainty is 0.34
Real ups count: False    33
True      8
dtype: int64
Real downs count: False    17
True      8
dtype: int64
Negative returns: -24.4120984114
Positive returns: 85.339459135
Pos/Neg ratio: 3.49578547885
Sum of returns: 60.9273607236
Certainty is 0.36000000000000004
Real ups count: False    33
True      8
dtype: int64
Real downs count: False    17
True      8
dtype: int64
Negative returns: -24.4120984114
Positive returns: 85.339459135
Pos/Neg ratio: 3.49578547885
Sum of returns: 60.9273607236
Certainty is 0.38
Real ups count: False    33
True      8
dtype: int64
Real downs count: False    17
True      8
dtype: int64
Negative returns: -24.4120984114
Positive returns: 85.339459135
Pos/Neg ratio: 3.49578547885
Sum of returns: 60.9273607236
Certainty is 0.4
Real ups count: False    33
True      8
dtype: int64
Real downs count: False    17
True      8
dtype: int64
Negative returns: -24.4120984114
Positive returns: 85.339459135
Pos/Neg ratio: 3.49578547885
Sum of returns: 60.9



Real ups count: False    1
dtype: int64
Real downs count: False    1
dtype: int64
Negative returns: 0.0
Positive returns: 3.02607022138
Pos/Neg ratio: -inf
Sum of returns: 3.02607022138
Certainty is 0.94
Real ups count: Series([], dtype: int64)
Real downs count: Series([], dtype: int64)
