In [2416]:
from pandas_datareader.data import DataReader
from datetime import datetime
import os
import pandas as pd
import random
import numpy as np
from keras.models import Sequential
from keras.layers.recurrent import LSTM,GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

from keras.utils.np_utils import to_categorical

In [2417]:
# Single source of truth for the seed so both generators always stay in sync.
SEED = 123

# Seed Python's and NumPy's global generators so repeated runs draw the same random numbers.
random.seed(SEED)
np.random.seed(SEED)

loading or downloading the data

In [2418]:
def get_data_if_not_exists(force=False):
    """Return the IBM price history as a DataFrame, using a CSV cache when possible.

    If "./data/ibm.csv" exists and `force` is false, the file is read and returned.
    Otherwise the "data" directory is created if missing, the history from
    1950-01-01 until today is requested through DataReader ('IBM', 'yahoo'),
    written to the CSV file, and returned with its index moved into a column.

    force -- when true, ignore any cached CSV and download again.
    """
    csv_path = "./data/ibm.csv"
    use_cache = os.path.exists(csv_path) and not force
    if use_cache:
        return pd.read_csv(csv_path)

    if not os.path.exists("./data"):
        os.mkdir("data")
    downloaded = DataReader('IBM', 'yahoo', datetime(1950, 1, 1), datetime.today())
    ibm_frame = pd.DataFrame(downloaded)
    ibm_frame.to_csv(csv_path)
    # reset_index turns the index into a regular column, like the CSV round trip does.
    return ibm_frame.reset_index()

## exploring the data

In [2419]:
# force=True bypasses the CSV cache, so this cell always downloads a fresh copy.
print("loading the data")
data = get_data_if_not_exists(force=True)
print("done loading the data")

loading the data
done loading the data


In [2420]:
# List the column labels of the loaded frame.
column_names = data.columns.values
print("data columns names: %s" % column_names)

data columns names: ['Date' 'Open' 'High' 'Low' 'Close' 'Volume' 'Adj Close']


In [2421]:
# (rows, columns) of the raw frame, followed by a preview of its first rows.
print(data.shape)
data.head()

(13732, 7)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,1962-01-02,578.499734,578.499734,572.000241,572.000241,387200,2.300695
1,1962-01-03,572.000241,576.999736,572.000241,576.999736,288000,2.320804
2,1962-01-04,576.999736,576.999736,570.999742,571.25026,256000,2.297679
3,1962-01-05,570.500243,570.500243,558.999753,560.000253,363200,2.252429
4,1962-01-08,559.500003,559.500003,545.000267,549.500263,544000,2.210196


Found out that `Adj Close` is not the previous day's close; it is the "true" close of the same day.

## preparing the data

In [2422]:
# Previous row's Close for every row. shift(1) moves the column down by one
# position, so the first row (which has no predecessor) gets NaN. This replaces
# the row-by-row set_value loop, which was slow and depended on the index
# labels being exactly 0..n-1.
data["prev_close"] = data["Close"].shift(1)

In [2423]:
# True when the day closed above the previous close; False otherwise (including NaN rows).
close_change = data["Close"].sub(data["prev_close"])
data["up/down"] = close_change.gt(0)

In [2424]:
# Change of the close relative to the previous close, as a fraction (0.01 means +1%).
data["raise_percentage"] = data["Close"].sub(data["prev_close"]).div(data["prev_close"])

In [2425]:
# Absolute distance between the day's High and Low.
data["spread"] = (data["High"] - data["Low"]).abs()

In [2426]:
# Absolute distance between the day's High and its Open.
data["up_spread"] = (data["High"] - data["Open"]).abs()

In [2427]:
# Absolute distance between the day's Open and its Low.
data["down_spread"] = (data["Open"] - data["Low"]).abs()

In [2428]:
import re
# Previous row's Open/High/Low for every row; the first row has no predecessor
# and gets NaN. The columns are created in this exact order on purpose:
# extract_features addresses columns by position, so the order must not change.
data["prev_open"] = data["Open"].shift(1)
data["prev_high"] = data["High"].shift(1)
data["prev_low"] = data["Low"].shift(1)

# Absolute day-over-day change of each price column (direction is dropped by abs).
data["close_diff"] = abs(data["Close"] - data["prev_close"])
data["open_diff"] = abs(data["Open"] - data["prev_open"])
data["high_diff"] = abs(data["High"] - data["prev_high"])
data["low_diff"] = abs(data["Low"] - data["prev_low"])

# Disabled experiments that used to live here as commented-out code:
#   * ratio-based diffs (value / previous value) instead of absolute differences
#   * month / year columns parsed from the Date string
#   * two-day lag features (prev_prev_open/high/low/close and their diffs)
#   * open_close_mean = (Open + Close) / 2


In [2429]:
# Preview the engineered columns. The first row has no previous day, so its
# prev_* and *_diff columns are NaN.
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,prev_close,up/down,raise_percentage,spread,up_spread,down_spread,prev_open,prev_high,prev_low,close_diff,open_diff,high_diff,low_diff
0,1962-01-02,578.499734,578.499734,572.000241,572.000241,387200,2.300695,,False,,6.499493,0.0,6.499493,,,,,,,
1,1962-01-03,572.000241,576.999736,572.000241,576.999736,288000,2.320804,572.000241,True,0.00874,4.999495,4.999495,0.0,578.499734,578.499734,572.000241,4.999495,6.499493,1.499998,0.0
2,1962-01-04,576.999736,576.999736,570.999742,571.25026,256000,2.297679,576.999736,False,-0.009964,5.999994,0.0,5.999994,572.000241,576.999736,572.000241,5.749476,4.999495,0.0,1.000499
3,1962-01-05,570.500243,570.500243,558.999753,560.000253,363200,2.252429,571.25026,False,-0.019694,11.50049,0.0,11.50049,576.999736,576.999736,570.999742,11.250007,6.499493,6.499493,11.999989
4,1962-01-08,559.500003,559.500003,545.000267,549.500263,544000,2.210196,560.000253,False,-0.01875,14.499736,0.0,14.499736,570.500243,570.500243,558.999753,10.49999,11.00024,11.00024,13.999486


Removing the first record: it has no previous record, so we cannot know whether it moved up or down.

In [2430]:
# Keep only rows that have a previous close. On a fresh run this removes exactly
# the first record; unlike `data[1:]`, running the cell again removes nothing more.
data = data[data["prev_close"].notnull()]
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Adj Close,prev_close,raise_percentage,spread,up_spread,down_spread,prev_open,prev_high,prev_low,close_diff,open_diff,high_diff,low_diff
count,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0,13731.0
mean,190.029059,191.625164,188.532009,190.054171,4888342.0,42.175604,190.084077,0.000131,3.093155,1.596105,1.49705,190.059539,191.655518,188.562043,2.016392,1.946078,1.744849,1.822445
std,132.133261,132.918323,131.463776,132.141526,4578848.0,51.413127,132.181493,0.019022,2.525031,1.927112,1.955795,132.174593,132.959075,131.504268,4.575595,4.471322,4.482632,4.527691
min,41.0,41.75,40.625,41.0,0.0,1.231153,41.0,-0.749178,0.0,0.0,0.0,41.0,41.75,40.625,0.0,0.0,0.0,0.0
25%,97.5,98.5,96.5,97.449997,1182200.0,5.943053,97.449997,-0.007978,1.5,0.375,0.269997,97.5,98.5,96.5,0.5,0.5,0.379997,0.400002
50%,128.0,129.125,127.139999,128.210007,4172100.0,16.207522,128.210007,0.0,2.375,1.0,0.875,128.0,129.125,127.139999,1.180008,1.125,1.0,1.0
75%,263.81255,266.0,262.0,263.999996,6966700.0,71.055749,263.999998,0.008337,3.875054,2.030003,1.999647,263.87502,266.0,262.0,2.499924,2.375046,2.0625,2.1875
max,649.000015,649.874802,645.500031,649.000015,69444700.0,197.047189,649.000015,0.131636,42.000031,28.500009,42.000031,649.000015,649.874802,645.500031,308.499985,309.000015,311.500015,312.999992


In [2431]:
# Number of consecutive rows that form one input sequence; the label of a
# sequence is taken from the row immediately after the window.
MAX_WINDOW = 7

In [2432]:
def extract_features(items):
    """Build one feature vector per row of a window.

    Each row is read by position: values 1-6, 9-12 and 16-19 are copied in that
    order, followed by 1 if the value at position 8 is truthy, else 0.
    NOTE(review): with the column order displayed by data.head() in this
    notebook, those positions look like Open..Adj Close, raise_percentage and
    the spread columns, the *_diff columns, and the up/down flag at 8 —
    confirm if columns are ever added or reordered.

    items -- iterable of indexable rows (at least 20 values each).
    Returns a list with one 15-element list per row.
    """
    copied_positions = (1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 16, 17, 18, 19)
    direction_position = 8

    feature_rows = []
    for item in items:
        row = [item[position] for position in copied_positions]
        row.append(1 if item[direction_position] else 0)
        feature_rows.append(row)
    return feature_rows
                

# def extract_features(items):
#     return [[item[12],item[11],item[10],item[9], 1] if item[8] else [item[12],item[11],item[10],item[9], -1] for item in items]

In [2433]:
def extract_expected_result(item):
    """Return the class label of a row: 1 if the value at position 8 is truthy, else 0."""
    if item[8]:
        return 1
    return 0

In [2434]:
def generate_input_and_outputs(data):
    """Slide a MAX_WINDOW-row window over `data` and build model samples.

    For every start position the window rows are turned into feature vectors by
    extract_features, and the row right after the window supplies the label via
    extract_expected_result.

    data -- DataFrame whose column order matches what extract_features expects.
    Returns (inputs, outputs): a list of windows (each a list of feature
    vectors) and a list of 0/1 labels of the same length.
    """
    step = 1
    # Convert the frame to an array once instead of calling the deprecated
    # as_matrix() on a fresh slice for every window.
    rows = data.values
    inputs = []
    outputs = []
    for start in range(0, len(rows) - MAX_WINDOW, step):
        stop = start + MAX_WINDOW
        inputs.append(extract_features(rows[start:stop]))
        outputs.append(extract_expected_result(rows[stop]))
    return inputs, outputs

In [2435]:
# X: windows of feature vectors, y: the 0/1 label that follows each window.
print("generating model input and outputs")
X, y = generate_input_and_outputs(data)
print("done generating input and outputs")

generating model input and outputs
done generating input and outputs


In [2436]:
# Turn the 0/1 labels into one row per sample with one column per class.
# NOTE(review): this overwrites `y`, so running the cell a second time would
# encode the already encoded labels again — re-run the generation cell first.
y = to_categorical(y)

### split the data to train and test 

In [2437]:
# Hold out 20% of the windows as the test set.
# NOTE(review): consecutive windows overlap (step 1, length MAX_WINDOW), so if
# this split shuffles the samples, near-identical windows can end up on both
# sides — confirm whether a chronological split is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [2438]:
# Carve a further 20% out of the remaining training windows for validation,
# leaving roughly 64% train / 16% validation / 20% test overall.
X_train,X_validation,y_train,y_validation = train_test_split(X_train,y_train,test_size=0.2)

## configure the models

In [2439]:
# Collection of the models to train and compare.
models = []

In [2440]:
layer_output_size1 = 5
layer_output_size2 = 5
# Width of one feature vector, derived from the data instead of a hard-coded value.
number_of_features = len(X[0][0])
output_classes = len(y[0])
percentage_of_neurons_to_ignore = 0.2


def build_recurrent_classifier(recurrent_layer, alg_name):
    """Build and compile a two-layer recurrent softmax classifier.

    The network is: recurrent layer (returns sequences) -> dropout ->
    recurrent layer -> dropout -> dense layer with one unit per class -> softmax.

    recurrent_layer -- layer class used for both recurrent layers (LSTM, SimpleRNN or GRU).
    alg_name -- label stored on the model as `alg_name`; used in progress messages
                and as the key of the accuracy history.
    Returns the compiled model.
    """
    model = Sequential()
    model.add(recurrent_layer(layer_output_size1, return_sequences=True,
                              input_shape=(MAX_WINDOW, number_of_features)))
    model.add(Dropout(percentage_of_neurons_to_ignore))
    model.add(recurrent_layer(layer_output_size2, return_sequences=False))
    model.add(Dropout(percentage_of_neurons_to_ignore))
    model.add(Dense(output_classes))
    model.add(Activation('softmax'))
    model.alg_name = alg_name
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='rmsprop')
    return model


# Rebuild the list (rather than append to it) so that re-running this cell does
# not register every model a second time. Order matches the original cell.
models = [
    build_recurrent_classifier(LSTM, "lstm"),
    build_recurrent_classifier(SimpleRNN, "simpleRnn"),
    build_recurrent_classifier(GRU, "gru"),
]

### training

In [2441]:
def trainModel(model, epochs=1, batch_size=128):
    """Fit `model` on the training windows, reporting validation metrics.

    model -- compiled model carrying an `alg_name` attribute.
    epochs -- number of passes over the training data for this call (default 1,
              the value that used to be hard-coded).
    batch_size -- samples per gradient update (default 128, as before).

    Reads X_train / y_train and X_validation / y_validation from the notebook's
    global scope.
    """
    print("Training model %s" % (model.alg_name))
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs,
              validation_data=(X_validation, y_validation))

### evaluation

In [2442]:
def createSplit(model):
    """Fit a random forest that maps `model`'s validation outputs to the validation labels.

    model -- trained model whose predict() output on X_validation is used as
             the forest's input.
    Returns the fitted RandomForestClassifier.
    """
    forest = RandomForestClassifier()
    validation_outputs = model.predict(X_validation)
    forest.fit(validation_outputs, y_validation)
    return forest

In [2443]:
def probabilities_to_prediction(record):
    """Convert a pair of class scores into a one-hot prediction.

    Returns [1, 0] when the first score is strictly greater than the second,
    otherwise [0, 1] (ties therefore go to the second class).
    """
    first_score, second_score = record[0], record[1]
    if first_score > second_score:
        return [1, 0]
    return [0, 1]

In [2444]:
def evaluateModel(model):
    """Measure `model`'s accuracy on the test windows in two ways and print both.

    1. A random forest (see createSplit) fitted on the model's validation
       outputs decides the class from the model's test outputs.
    2. The class with the larger model output wins (probabilities_to_prediction).

    In both cases a prediction counts as correct when its first component
    equals the first component of the expected one-hot label.

    Reads X_test / y_test from the notebook's global scope.
    Returns the accuracy of the second method.
    """
    predicts = model.predict(X_test)
    split_model = createSplit(model)
    # One batched forest prediction for all test records instead of one call
    # per record inside the loop; the per-record results are the same.
    split_predictions = split_model.predict(predicts)

    success, success2 = 0, 0
    for index, record in enumerate(predicts):
        expected = y_test[index]
        if split_predictions[index][0] == expected[0]:
            success += 1
        if probabilities_to_prediction(record)[0] == expected[0]:
            success2 += 1

    total = len(predicts)
    accuracy = float(success) / total
    accuracy2 = float(success2) / total
    print("The Accuracy for %s is: %s or %s" % (model.alg_name, accuracy, accuracy2))
    return accuracy2

In [2445]:
def train_and_evaluate(epochs):
    """Alternate training and evaluation of every model in `models`.

    For each of `epochs` rounds, every model is trained once with trainModel
    and then scored with evaluateModel.

    Returns a dict mapping each model's `alg_name` to the list of accuracies it
    obtained, one entry per round.
    """
    accuracies = {}
    for round_number in range(epochs):
        print("Epoch %s" % (round_number))
        for model in models:
            trainModel(model)
            acc = evaluateModel(model)
            # Start the model's history on its first score, then keep appending.
            accuracies.setdefault(model.alg_name, []).append(acc)
    return accuracies

In [2450]:
# Run 20 train/evaluate rounds; `accs` maps each model name to its per-round accuracies.
accs = train_and_evaluate(20)

Epoch 0
Training model lstm
Train on 8783 samples, validate on 2196 samples
Epoch 1/1
The Accuracy for lstm is: 0.51621129326 or 0.51621129326
Training model simpleRnn
Train on 8783 samples, validate on 2196 samples
Epoch 1/1
The Accuracy for simpleRnn is: 0.51693989071 or 0.51621129326
Training model gru
Train on 8783 samples, validate on 2196 samples
Epoch 1/1
The Accuracy for gru is: 0.51693989071 or 0.51621129326
Epoch 1
Training model lstm
Train on 8783 samples, validate on 2196 samples
Epoch 1/1
The Accuracy for lstm is: 0.51621129326 or 0.51621129326
Training model simpleRnn
Train on 8783 samples, validate on 2196 samples
Epoch 1/1
The Accuracy for simpleRnn is: 0.51621129326 or 0.51621129326
Training model gru
Train on 8783 samples, validate on 2196 samples
Epoch 1/1
The Accuracy for gru is: 0.517304189435 or 0.51693989071
Epoch 2
Training model lstm
Train on 8783 samples, validate on 2196 samples
Epoch 1/1
The Accuracy for lstm is: 0.51621129326 or 0.51621129326
Training model

In [2451]:
# Mean accuracy over all rounds, per model.
for algo, history in accs.items():
    print("the accuracy for %s is %s" % (algo, sum(history) / len(history)))

the accuracy for lstm is 0.51621129326
the accuracy for gru is 0.517176684882
the accuracy for simpleRnn is 0.516265938069
