In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from nasdaqpredictor import dataloader as dl

[2017-12-04 14:44:06,814 - nasdaqpredictor - DEBUG] - Logging initialized


In [3]:
# Build the project's DataLoader from the ticker CSV path and two datetime
# bounds (presumably the first and last day of the history to load — confirm
# against nasdaqpredictor.dataloader).
loader = dl.DataLoader(
    '/nasdaq_tickers.csv',
    datetime(2000, 1, 1),
    datetime(2017, 1, 1),
)
# transformer = DataTransformer(loader, return_shift_days=-3)

In [4]:
# Run the loader's full reload. The next cell reads loader.original_data_dict,
# which is presumably filled in by this call (confirm in the dataloader module).
loader.reload_all()

[2017-12-04 14:44:07,168 - nasdaqpredictor.dataloader - INFO] - Load tickers


In [5]:
# Pick the entry stored under the 'AAL' key.
# NOTE(review): if original_data_dict is a plain dict this is the loader's own
# object rather than a copy, so the in-place edits made to `aal` further down
# would also change what the loader holds — confirm.
aal = loader.original_data_dict['AAL']

In [6]:
# Preview the first rows of the frame as loaded.
aal.head()

Unnamed: 0,Date,Open,High,Low,Close
0,2005-09-27,21.049999,21.4,19.1,19.299999
1,2005-09-28,19.299999,20.530001,19.200001,20.5
2,2005-09-29,20.4,20.58,20.1,20.209999
3,2005-09-30,20.26,21.049999,20.18,21.01
4,2005-10-03,20.9,21.75,20.9,21.5


In [7]:
def _set_index_column_if_necessary(data: pd.DataFrame) -> pd.DataFrame:
    if 'Date' in data.columns:
        data.set_index('Date', inplace=True)
    return data

In [8]:
# Move 'Date' into the index (the helper edits the frame in place and returns
# that same frame), then preview the result.
aal = _set_index_column_if_necessary(aal)
aal.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-09-27,21.049999,21.4,19.1,19.299999
2005-09-28,19.299999,20.530001,19.200001,20.5
2005-09-29,20.4,20.58,20.1,20.209999
2005-09-30,20.26,21.049999,20.18,21.01
2005-10-03,20.9,21.75,20.9,21.5


In [9]:
# Horizon, in rows, of the forward-looking 'Return' target. The same value is
# used later to trim the trailing rows that have no target.
return_days = 2

In [10]:
def feature(data, first_col, second_col, base_col):
    """Return the gap between two columns as a fraction of a third column.

    Computes ``(data[first_col] - data[second_col]) / data[base_col]`` using
    whatever arithmetic the looked-up values support (element-wise for pandas
    columns).

    Args:
        data: Mapping-like object indexed by column name, e.g. a DataFrame.
        first_col: Key of the value to subtract from.
        second_col: Key of the value that is subtracted.
        base_col: Key of the value the difference is divided by.

    Returns:
        The relative difference, of the type produced by the arithmetic.
    """
    gap = data[first_col] - data[second_col]
    return gap / data[base_col]

In [11]:
# Intraday features: each one is the gap between two of the row's prices,
# expressed as a fraction of that row's Close.
for column_name, (minuend, subtrahend) in [
    ('OC diff', ('Open', 'Close')),
    ('HL diff', ('High', 'Low')),
    ('OL diff', ('Open', 'Low')),
    ('CH diff', ('Close', 'High')),
]:
    aal[column_name] = feature(aal, minuend, subtrahend, 'Close')

# Target: percentage change of Close from each row to the row `return_days`
# later. pct_change looks backwards, so its result is shifted up by the same
# number of rows; the last `return_days` rows therefore have no value.
backward_change = aal['Close'].pct_change(return_days)
aal['Return'] = 100 * backward_change.shift(-return_days)
aal.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,OC diff,HL diff,OL diff,CH diff,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-09-27,21.049999,21.4,19.1,19.299999,0.090674,0.119171,0.101036,-0.108808,4.715026
2005-09-28,19.299999,20.530001,19.200001,20.5,-0.058537,0.064878,0.004878,-0.001463,2.487805
2005-09-29,20.4,20.58,20.1,20.209999,0.009401,0.023751,0.014844,-0.018308,6.382984
2005-09-30,20.26,21.049999,20.18,21.01,-0.035697,0.041409,0.003808,-0.001904,5.473584
2005-10-03,20.9,21.75,20.9,21.5,-0.027907,0.039535,0.0,-0.011628,3.255819
2005-10-04,21.440001,22.5,21.440001,22.16,-0.032491,0.047834,0.0,-0.015343,1.895307
2005-10-05,22.1,22.309999,21.75,22.200001,-0.004505,0.025225,0.015766,-0.004955,-0.22523
2005-10-06,22.6,23.0,22.4,22.58,0.000886,0.026572,0.008857,-0.018601,-1.638623
2005-10-07,22.25,22.6,21.799999,22.15,0.004515,0.036117,0.020316,-0.020316,-0.767494
2005-10-10,22.280001,22.290001,22.1,22.209999,0.003152,0.008555,0.008105,-0.003602,-2.02611


In [12]:
# Row-over-row relative change of the first four columns (Open, High, Low,
# Close). The first row is NaN because it has no previous row.
aal.iloc[:,0:4].pct_change().head(2)

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-09-27,,,,
2005-09-28,-0.083135,-0.040654,0.005236,0.062176


In [13]:
# Columns at positions 4-7 are the four '... diff' features added above.
aal.iloc[:,4:8].head(2)

Unnamed: 0_level_0,OC diff,HL diff,OL diff,CH diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-09-27,0.090674,0.119171,0.101036,-0.108808
2005-09-28,-0.058537,0.064878,0.004878,-0.001463


In [14]:
# Assemble the modelling table column-wise: row-over-row changes of the four
# price columns, the four engineered diff columns unchanged, and the forward
# 'Return' target.
price_changes = aal.iloc[:, 0:4].pct_change()
diff_features = aal.iloc[:, 4:8]
full = pd.concat([price_changes, diff_features, aal['Return']], axis=1)
full.head()

Unnamed: 0_level_0,Open,High,Low,Close,OC diff,HL diff,OL diff,CH diff,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-09-27,,,,,0.090674,0.119171,0.101036,-0.108808,4.715026
2005-09-28,-0.083135,-0.040654,0.005236,0.062176,-0.058537,0.064878,0.004878,-0.001463,2.487805
2005-09-29,0.056995,0.002435,0.046875,-0.014146,0.009401,0.023751,0.014844,-0.018308,6.382984
2005-09-30,-0.006863,0.022838,0.00398,0.039584,-0.035697,0.041409,0.003808,-0.001904,5.473584
2005-10-03,0.031589,0.033254,0.035679,0.023322,-0.027907,0.039535,0.0,-0.011628,3.255819


In [15]:
# Check the end of the table: the final `return_days` rows have no 'Return'
# because the Close that far ahead is not in the data.
full.tail()

Unnamed: 0_level_0,Open,High,Low,Close,OC diff,HL diff,OL diff,CH diff,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-12-23,-0.014995,-0.012336,-0.003508,-0.004313,0.002682,0.011345,0.006601,-0.007426,-1.670796
2016-12-27,-0.006583,0.001638,-0.003106,0.002682,-0.006583,0.016046,0.003086,-0.006377,-2.79778
2016-12-28,0.009112,-0.00184,-0.01101,-0.019338,0.022236,0.025593,0.023495,-0.024334,-2.055798
2016-12-29,-0.019085,-0.016998,-0.013653,-0.008811,0.01164,0.022011,0.017778,-0.015873,
2016-12-30,-0.00795,-0.007083,-0.010434,-0.011852,0.015635,0.025487,0.020347,-0.020775,


In [16]:
# Drop the first row (its price changes are NaN: there is no previous row) and
# the last `return_days` rows (their forward 'Return' is NaN).
# `-return_days or None` keeps the slice correct for a horizon of 0: a plain
# `iloc[1:-0]` is `iloc[1:0]` and would select nothing at all.
# NOTE: this cell rebuilds `full` from itself, so running it a second time
# trims the table again; re-run the concat cell first.
full = full.iloc[1:-return_days or None]
full.head()

Unnamed: 0_level_0,Open,High,Low,Close,OC diff,HL diff,OL diff,CH diff,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-09-28,-0.083135,-0.040654,0.005236,0.062176,-0.058537,0.064878,0.004878,-0.001463,2.487805
2005-09-29,0.056995,0.002435,0.046875,-0.014146,0.009401,0.023751,0.014844,-0.018308,6.382984
2005-09-30,-0.006863,0.022838,0.00398,0.039584,-0.035697,0.041409,0.003808,-0.001904,5.473584
2005-10-03,0.031589,0.033254,0.035679,0.023322,-0.027907,0.039535,0.0,-0.011628,3.255819
2005-10-04,0.025837,0.034483,0.025837,0.030698,-0.032491,0.047834,0.0,-0.015343,1.895307


In [17]:
# Confirm the trailing rows without a 'Return' value are gone.
full.tail()

Unnamed: 0_level_0,Open,High,Low,Close,OC diff,HL diff,OL diff,CH diff,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-12-21,0.01732,0.008081,0.015052,0.005079,-0.002628,0.013544,0.002224,-0.008692,-2.001215
2016-12-22,0.000203,-0.009018,-0.015641,-0.015767,0.013555,0.020333,0.018279,-0.015609,-0.164301
2016-12-23,-0.014995,-0.012336,-0.003508,-0.004313,0.002682,0.011345,0.006601,-0.007426,-1.670796
2016-12-27,-0.006583,0.001638,-0.003106,0.002682,-0.006583,0.016046,0.003086,-0.006377,-2.79778
2016-12-28,0.009112,-0.00184,-0.01101,-0.019338,0.022236,0.025593,0.023495,-0.024334,-2.055798


In [18]:
# Replace infinities (e.g. from dividing by a zero price) and missing values
# with 0.0 so that every later step only sees finite numbers.
# `np.nan` is the supported spelling: the `np.NaN` and `np.NAN` aliases were
# removed in NumPy 2.0 and raise AttributeError there.
full = full.replace([np.inf, -np.inf, np.nan], 0.0)

In [55]:
# Spot-check a single row by position.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so it was last run in a different order than it appears here.
full.iloc[29]

Open        0.003448
High        0.009174
Low         0.021846
Close       0.017283
OC diff    -0.011213
HL diff     0.023785
OL diff     0.003398
CH diff    -0.009174
Return     11.450897
Name: 2005-11-08, dtype: float64

In [130]:
# Rows per input window, and the date (as a 'YYYY-MM-DD' string) at which the
# data is split into a training part and a test part.
n = 30
test_date = datetime(2015, 1, 1).date().isoformat()

# Separate the 'Return' target from the feature columns.
y_data = full['Return']
X_data = full.drop(columns='Return')

In [131]:
# Features only: the 'Return' column must no longer be present.
X_data.head(2)

Unnamed: 0_level_0,Open,High,Low,Close,OC diff,HL diff,OL diff,CH diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-09-28,-0.083135,-0.040654,0.005236,0.062176,-0.058537,0.064878,0.004878,-0.001463
2005-09-29,0.056995,0.002435,0.046875,-0.014146,0.009401,0.023751,0.014844,-0.018308


In [132]:
# Targets, indexed by the same dates as X_data.
y_data.head(2)

Date
2005-09-28    2.487805
2005-09-29    6.382984
Name: Return, dtype: float64

In [133]:
# Split by date with label slices on the Date index: rows up to `test_date`
# form the training part, rows from `test_date` onwards form the test part.
# NOTE(review): label slices normally include both endpoints, so a row dated
# exactly `test_date` would end up in both parts — confirm no such row exists.
X_train = X_data[:test_date]
X_test = X_data[test_date:]
# Each window of n rows built below is paired with the target of its last
# row, so the first n-1 targets of each part have no complete window and are
# skipped.
y_train = y_data[:test_date].iloc[n-1:]
y_test = y_data[test_date:].iloc[n-1:]

In [134]:
# StandardScaler is otherwise imported only in the Keras section further down,
# so running the notebook top to bottom on a fresh kernel failed here with a
# NameError. Importing it where it is first needed keeps this cell runnable.
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only and reuse them for
# the test rows, so the test period does not influence the training features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [135]:
def build_2D_input_data(input, window=None):
    """Cut ``input`` into overlapping windows of consecutive rows.

    Window ``i`` is ``input[i:i + window]``. The start advances one row at a
    time, so an input with ``r`` rows yields ``r - window + 1`` windows, and
    none at all when ``r < window``.

    Args:
        input: Object exposing ``shape[0]`` and slice indexing, for example a
            2-D NumPy array of shape (rows, features).
        window: Number of rows per window. When omitted, the notebook-level
            ``n`` is used, which keeps existing one-argument calls working
            exactly as before.

    Returns:
        list: The windows, ordered by their starting row.

    Raises:
        ValueError: If the window length is smaller than 1.
    """
    if window is None:
        # Fall back to the notebook-wide window length, looked up at call time.
        window = n
    if window < 1:
        raise ValueError('window must be at least 1, got {}'.format(window))
    row_count = input.shape[0]
    return [input[start:start + window]
            for start in range(row_count - window + 1)]

In [136]:
# Turn each scaled 2-D array into a list of n-row windows.
# NOTE: the inputs are overwritten, so this cell is not safe to run twice in
# a row; re-run from the scaling cell instead.
X_train = build_2D_input_data(X_train)
X_test = build_2D_input_data(X_test)

In [137]:
# Stack each list of windows into one array with a leading window axis.
X_train = np.stack(X_train)
X_test = np.stack(X_test)

In [143]:
# Shape of one sample (window length, feature count); this is the value
# passed as input_shape to the first Conv1D layer below.
X_train.shape[1:]

(30, 8)

In [139]:
# Number of test windows; should equal the number of test targets.
X_test.shape

(473, 30, 8)

In [140]:
# Number of training targets; should equal the number of training windows.
y_train.shape

(2302,)

In [141]:
# Number of test targets; should equal X_test.shape[0].
y_test.shape

(473,)

# KERAS

In [28]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization, Flatten
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback
from keras.models import load_model
import keras.backend as K
from keras.layers import Conv1D, MaxPool1D

Using TensorFlow backend.


In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

In [30]:
# Remember X's shape so it can be restored after X is flattened for scaling.
# NOTE(review): no cell shown above assigns `X` (only X_train / X_test are
# built), so this relies on a variable left over in the kernel — confirm where
# X is meant to come from.
original_shape = X.shape

In [31]:
# Fresh, unfitted scaler for X.
# NOTE: this rebinds `scaler`, so the instance fitted on X_train earlier is no
# longer reachable under that name.
scaler = StandardScaler()

In [32]:
# Merge the first two axes so X becomes 2-D with original_shape[2] columns;
# the scaler in the next cell is applied to this 2-D layout.
X = X.reshape(original_shape[0]*original_shape[1],original_shape[2])

In [33]:
# Standardise the columns of the flattened X, then restore the saved shape.
# NOTE(review): the scaler is fitted on all of X. If X also covers the period
# meant for evaluation, its statistics leak into training — confirm.
X = scaler.fit_transform(X)
X = X.reshape(original_shape)
X.shape

(2803, 30, 8)

In [34]:
# Threshold, in the same units as the values passed to
# series_to_binarized_columns, beyond which a move counts as extreme.
extreme = 4
def series_to_binarized_columns(y):
    """Encode numeric values as three mutually exclusive boolean classes.

    Columns of the result, in order:
        0: ``y < -extreme``              (extreme negative)
        1: ``-extreme <= y <= extreme``  (everything in between)
        2: ``y > extreme``               (extreme positive)

    The middle class includes both boundaries, so every non-NaN value falls
    into exactly one class. With strict comparisons on both sides, a value
    exactly equal to ``extreme`` or ``-extreme`` matched no class and produced
    an all-False row, which is not a valid one-hot target.

    NaN compares False against everything and still yields an all-False row;
    remove or fill missing values before calling.

    Args:
        y: Numeric array-like supporting element-wise comparison, such as a
            NumPy array or a pandas Series.

    Returns:
        numpy.ndarray: Boolean array with one row per element of ``y`` and
        three columns.
    """
    neg = y < -extreme
    pos = y > extreme
    meds = (y >= -extreme) & (y <= extreme)
    return np.array([neg, meds, pos]).T

In [35]:
# Replace the numeric targets with their three-column class encoding.
# NOTE(review): no cell shown above assigns `y` (only y_train / y_test are
# built), and the cell overwrites its own input, so running it twice would
# encode the already-encoded array — confirm where y is meant to come from.
y = series_to_binarized_columns(y)

In [36]:
# Start an empty layer stack; the cells below append the layers in order.
model = Sequential()

In [37]:
# Three convolutional stages. Each stage is two identical Conv1D layers
# followed by a max-pool of size 2; the filter count and kernel size grow from
# one stage to the next.
conv_options = dict(padding='same', activation='relu', kernel_initializer='uniform')

# Stage 1 is written out because its first layer also declares the input shape.
model.add(Conv1D(filters=64, kernel_size=3, input_shape=(30, 8), **conv_options))
model.add(Conv1D(filters=64, kernel_size=3, **conv_options))
model.add(MaxPool1D(pool_size=2, padding='same'))

# Stages 2 and 3 differ only in filter count and kernel size.
for stage_filters, stage_kernel in ((128, 4), (256, 5)):
    for _layer in range(2):
        model.add(Conv1D(filters=stage_filters, kernel_size=stage_kernel, **conv_options))
    model.add(MaxPool1D(pool_size=2, padding='same'))

In [38]:
# Flatten the convolutional output into one vector per sample for the dense
# layers added next.
model.add(Flatten())

In [39]:
# Two identical hidden blocks: Dense(50) -> batch normalisation -> ReLU ->
# dropout with rate 0.2.
for _block in range(2):
    model.add(Dense(50))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

# Output layer: three units with softmax, one per target column.
model.add(Dense(3, kernel_initializer='uniform'))
model.add(Activation('softmax'))

model.compile(
    optimizer=Adam(),
    #loss=self.create_entropy(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [40]:
# Train on X / y with batches of 64 for 100 epochs, logging one line per epoch.
# NOTE(review): no validation data is passed, so the loss and accuracy printed
# below are measured on the training samples only.
model.fit(X, y, batch_size=64, epochs=100, verbose=2)

Epoch 1/100
 - 18s - loss: 0.9621 - acc: 0.6022
Epoch 2/100
 - 1s - loss: 0.8823 - acc: 0.6218
Epoch 3/100
 - 1s - loss: 0.8725 - acc: 0.6190
Epoch 4/100
 - 1s - loss: 0.8703 - acc: 0.6250
Epoch 5/100
 - 1s - loss: 0.8642 - acc: 0.6261
Epoch 6/100
 - 1s - loss: 0.8627 - acc: 0.6304
Epoch 7/100
 - 1s - loss: 0.8601 - acc: 0.6315
Epoch 8/100
 - 1s - loss: 0.8536 - acc: 0.6308
Epoch 9/100
 - 1s - loss: 0.8320 - acc: 0.6386
Epoch 10/100
 - 1s - loss: 0.8186 - acc: 0.6429
Epoch 11/100
 - 1s - loss: 0.8108 - acc: 0.6482
Epoch 12/100
 - 1s - loss: 0.7868 - acc: 0.6504
Epoch 13/100
 - 1s - loss: 0.7619 - acc: 0.6707
Epoch 14/100
 - 1s - loss: 0.7308 - acc: 0.6846
Epoch 15/100
 - 1s - loss: 0.6961 - acc: 0.6978
Epoch 16/100
 - 1s - loss: 0.6439 - acc: 0.7189
Epoch 17/100
 - 1s - loss: 0.6105 - acc: 0.7392
Epoch 18/100
 - 1s - loss: 0.5678 - acc: 0.7656
Epoch 19/100
 - 1s - loss: 0.5092 - acc: 0.8006
Epoch 20/100
 - 1s - loss: 0.4780 - acc: 0.8102
Epoch 21/100
 - 1s - loss: 0.4434 - acc: 0.8252


<keras.callbacks.History at 0x2150c39f358>