In [1]:
from pandas_datareader.data import DataReader
from datetime import datetime
import os
import pandas as pd
import random
import numpy as np
from keras.models import Sequential
from keras.layers.recurrent import LSTM,GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

Using Theano backend.


In [2]:
random.seed(123)
np.random.seed(123)

loading or downloading the data

In [3]:
def get_data_if_not_exists(force=False):
    if os.path.exists("./data/ibm.csv") and not force:
        return pd.read_csv("./data/ibm.csv")
    else:
        if not os.path.exists("./data"):
            os.mkdir("data")
        ibm_data = DataReader('IBM', 'yahoo', datetime(1950, 1, 1), datetime.today())
        pd.DataFrame(ibm_data).to_csv("./data/ibm.csv")
        return pd.DataFrame(ibm_data).reset_index()

## exploring the data

In [4]:
print "loading the data"
data = get_data_if_not_exists(force=True)
print "done loading the data"

loading the data
done loading the data


In [5]:
print "data columns names: %s"%data.columns.values

data columns names: ['Date' 'Open' 'High' 'Low' 'Close' 'Volume' 'Adj Close']


In [6]:
print data.shape
data

(13729, 7)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,1962-01-02,578.499734,578.499734,572.000241,572.000241,387200,2.300695
1,1962-01-03,572.000241,576.999736,572.000241,576.999736,288000,2.320804
2,1962-01-04,576.999736,576.999736,570.999742,571.250260,256000,2.297679
3,1962-01-05,570.500243,570.500243,558.999753,560.000253,363200,2.252429
4,1962-01-08,559.500003,559.500003,545.000267,549.500263,544000,2.210196
5,1962-01-09,552.000010,563.000250,552.000010,555.999756,491200,2.236338
6,1962-01-10,557.000256,559.500003,557.000256,557.000256,299200,2.240362
7,1962-01-11,558.500254,563.000250,558.500254,563.000250,315200,2.264495
8,1962-01-12,563.999999,567.999744,563.999999,563.999999,435200,2.268517
9,1962-01-15,566.000247,567.750013,566.000247,566.499746,251200,2.278571


found out that Adj Close is not the last day close. it's the "true" close of the day.

## preparing the data

In [7]:
for i in range(1,len(data)):
    prev = data.iloc[i-1]
    data.set_value(i,"prev_close",prev["Close"])

In [8]:
data["up/down"] = (data["Close"] - data["prev_close"]) > 0

In [9]:
data["raise_percentage"] = (data["Close"] - data["prev_close"])/data["prev_close"]

In [10]:
data["spread"] = abs(data["High"]-data["Low"])

In [11]:
data["up_spread"] = abs(data["High"]-data["Open"])

In [12]:
data["down_spread"] = abs(data["Open"]-data["Low"])

removing the first record because have no previuse record therefore can't know if up or down

In [13]:
data = data[1:]
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,prev_close,up/down,raise_percentage,spread,up_spread,down_spread
1,1962-01-03,572.000241,576.999736,572.000241,576.999736,288000,2.320804,572.000241,True,0.008740,4.999495,4.999495,0.000000
2,1962-01-04,576.999736,576.999736,570.999742,571.250260,256000,2.297679,576.999736,False,-0.009964,5.999994,0.000000,5.999994
3,1962-01-05,570.500243,570.500243,558.999753,560.000253,363200,2.252429,571.250260,False,-0.019694,11.500490,0.000000,11.500490
4,1962-01-08,559.500003,559.500003,545.000267,549.500263,544000,2.210196,560.000253,False,-0.018750,14.499736,0.000000,14.499736
5,1962-01-09,552.000010,563.000250,552.000010,555.999756,491200,2.236338,549.500263,True,0.011828,11.000240,11.000240,0.000000
6,1962-01-10,557.000256,559.500003,557.000256,557.000256,299200,2.240362,555.999756,True,0.001799,2.499747,2.499747,0.000000
7,1962-01-11,558.500254,563.000250,558.500254,563.000250,315200,2.264495,557.000256,True,0.010772,4.499996,4.499996,0.000000
8,1962-01-12,563.999999,567.999744,563.999999,563.999999,435200,2.268517,563.000250,True,0.001776,3.999745,3.999745,0.000000
9,1962-01-15,566.000247,567.750013,566.000247,566.499746,251200,2.278571,563.999999,True,0.004432,1.749766,1.749766,0.000000
10,1962-01-16,566.000247,566.000247,560.499752,560.499752,251200,2.254438,566.499746,False,-0.010591,5.500495,0.000000,5.500495


In [14]:
data["up/down"].describe()

count     13728
unique        2
top       False
freq       7043
Name: up/down, dtype: object

In [15]:
MAX_WINDOW = 10

In [16]:
def extract_features(items):
    return [[item[12],item[11],item[10],item[9], 1] if item[8] else [item[12],item[11],item[10],item[9], -1] for item in items]

In [17]:
def extract_expected_result(item):
    return [0, 1] if item[8] else [1, 0]

In [18]:
def generate_input_and_outputs(data):
    step = 1
    inputs = []
    outputs = []
    for i in range(0, len(data) - MAX_WINDOW, step):
        inputs.append(extract_features(data.iloc[i:i + MAX_WINDOW].as_matrix()))
        outputs.append(extract_expected_result(data.iloc[i + MAX_WINDOW].as_matrix()))
    return inputs, outputs

In [19]:
print "generating model input and outputs"
X, y = generate_input_and_outputs(data)
print "done generating input and outputs"

generating model input and outputs
done generating input and outputs


### split the data to train and test 

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [21]:
X_train,X_validation,y_train,y_validation = train_test_split(X_train,y_train,test_size=0.2)

## configure the models

In [22]:
models = []

In [23]:
layer_output_size1 = 512
layer_output_size2 = 512
number_of_features = 5
output_classes = len(y[0])
percentage_of_neurons_to_ignore = 0.2

"""
model = Sequential()
model.add(LSTM(layer_output_size1, return_sequences=True, input_shape=(MAX_WINDOW, len(X[0][0]))))
model.add(Dropout(percentage_of_neurons_to_ignore))
model.add(LSTM(layer_output_size2, return_sequences=False))
model.add(Dropout(percentage_of_neurons_to_ignore))
model.add(Dense(output_classes))
model.add(Activation('softmax'))
model.alg_name = "lstm"
model.compile(loss='categorical_crossentropy',metrics=['accuracy'], optimizer='rmsprop')
models.append(model)
"""

model = Sequential()
model.add(GRU(layer_output_size1, return_sequences=True, input_shape=(MAX_WINDOW, len(X[0][0]))))
model.add(Dropout(percentage_of_neurons_to_ignore))
model.add(GRU(layer_output_size2, return_sequences=False))
model.add(Dropout(percentage_of_neurons_to_ignore))
model.add(Dense(output_classes))
model.add(Activation('softmax'))
model.alg_name = "gru"
model.compile(loss='categorical_crossentropy',metrics=['accuracy'], optimizer='rmsprop')
models.append(model)

### training

In [24]:
def trainModel(model):
    epochs = 1
    print "Training model %s"%(model.alg_name)
    model.fit(X_train, y_train, batch_size=128, nb_epoch=epochs,validation_data=(X_validation,y_validation))

### evaluation

In [25]:
def createSplit(model):
    split_model = RandomForestClassifier()
    split_model.fit(model.predict(X_validation), y_validation)
    return split_model

In [26]:
def probabilities_to_prediction(record):
    return [1,0] if record[0]>record[1] else [0,1]

In [29]:
def evaluateModel(model):
    success = 0
    success2 = 0
    predicts = model.predict(X_test)
    split_model = createSplit(model)
    for index, record in enumerate(predicts):
        predicted = list(split_model.predict([np.array(record)])[0])
        predicted2 = probabilities_to_prediction(record)
        expected = y_test[index]
        # print "record",record
        # print "naive model result",predicted
        # print "linear model result",
        # print "expected",expected
        # print "predicted: %s\t expected: %s match: %s" % (predicted, expected, predicted == expected)
        if predicted == expected:
            success += 1
        if predicted2 == expected:
            success2 += 1
    accuracy = float(success) / len(predicts)
    accuracy2 = float(success2) / len(predicts)
    print "The Accuracy for %s is: %s or %s" % (model.alg_name, accuracy, accuracy2)
    return accuracy2

In [30]:
def train_and_evaluate(epochs):
    accuracies = {}
    for i in range(epochs):
        print "Epoch %s"%(i)
        for model in models:
            trainModel(model)
            acc = evaluateModel(model)
            if model.alg_name not in accuracies:
                accuracies[model.alg_name] = []
            accuracies[model.alg_name].append(acc)
    return accuracies

In [31]:
accs = train_and_evaluate(30)

Epoch 0
Training model gru
Train on 8779 samples, validate on 2195 samples
Epoch 1/1


ValueError: Number of features of the model must  match the input. Model n_features is 2 and  input n_features is 1 

In [None]:
for algo in accs:
    print "the accuracy for %s is %s"%(algo,sum(accs[algo])/len(accs[algo]))