In [None]:
# CS 230 Project Milestone
# Dante Zakhidov, Abdulmalik Obaid, Scott Keene

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression
import math
from scipy import stats
%matplotlib inline
%pylab inline

# Path to the bitcoin price CSV (Coinbase BTC/USD, one row per minute according to the file name).
# NOTE(review): this is an absolute path on one author's Windows machine; edit it to point at
# your local copy of the file before running the notebook.
bitcoin_data_filename = r"C:\Users\Penguin\coinbaseUSD_1-min_data_2014-12-01_to_2018-01-08.csv"
# This variable is passed to import_data_file() in the cells that load the data.

# Import packages for keras

from keras import layers
from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D
from keras.models import Model, load_model
from keras.preprocessing import image
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras.applications.imagenet_utils import preprocess_input
import pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
from keras.initializers import glorot_uniform
import scipy.misc
from matplotlib.pyplot import imshow
import keras.backend as K

# For reproducibility: seed NumPy's global random number generator.
# Use the explicitly imported `np` alias; the bare name `numpy` only exists when
# `%pylab inline` has populated the namespace, so relying on it is fragile.
seed = 7
np.random.seed(seed)

In [None]:
#Import Data
def import_data_file(filename):
    """Load the price CSV and add the derived columns used by the models.

    Parameters
    ----------
    filename : str or file-like
        Path (or open buffer) of a CSV with at least the columns
        "Open", "High", "Low" and "Close". Anything accepted by
        ``pd.read_csv`` works.

    Returns
    -------
    pandas.DataFrame
        The raw data with three extra columns appended, in this order:
        "Variance", "Freq" and "Up_Label".
    """
    # Read the file the caller asked for (not the notebook-level global path)
    BTC_Price = pd.read_csv(filename)

    # Price variance of each row, in percent of the close price:
    # (Close - Open) / Close * 100.
    # Negative values mean the price declined over the row's interval,
    # positive values mean it increased.
    BTC_Price['Variance'] = ((BTC_Price["Close"] - BTC_Price["Open"])/BTC_Price["Close"])*100

    # Trading range of each row, in percent of the high price:
    # (High - Low) / High * 100.
    BTC_Price['Freq'] = ((BTC_Price["High"] - BTC_Price["Low"])/BTC_Price["High"])*100

    # Binary classification label: True when the row closed above its open
    BTC_Price['Up_Label'] = (BTC_Price["Close"] > BTC_Price["Open"])

    return BTC_Price

# Generate Minute Dataset, will compare performance of model based on daily data

# Generate minute by minute training and test set
def generate_min_dataset(dataset, num_train, num_dev, train_start=1000000, dev_end=1570000):
    '''Build minute-by-minute train and dev sets from the price data.

    Each sample's features are one row of the data and its label is the
    Up_Label of the FOLLOWING row, i.e. the model predicts the next row's
    direction.

    Assumed column order of dataset (as produced by import_data_file on the
    Coinbase CSV -- verify if the file layout changes):
    0 Timestamp, 1 Open, 2 High, 3 Low, 4 Close, 5 Volume_(BTC),
    6 Volume_(Currency), 7 Weighted_Price, 8 Variance, 9 Freq, 10 Up_Label

    Arguments:
    dataset -- pandas DataFrame with the columns above
    num_train -- number of training samples
    num_dev -- number of dev samples
    train_start -- row index of the first training sample (default 1000000)
    dev_end -- row index one past the last dev sample (default 1570000)

    Returns:
    X_train -- (num_train, 8) features, columns 1..8 (Open .. Variance)
    Y_train -- (num_train,) labels taken from column 10 of the next row
    X_dev -- (num_dev, 8) features
    Y_dev -- (num_dev,) labels

    Raises:
    ValueError -- if the requested windows do not fit inside dataset
    '''

    # DataFrame.as_matrix() was removed from pandas; .values is the equivalent
    data = dataset.values

    # Slices past the end of the array are silently truncated by NumPy, which
    # would return X and Y of different lengths; fail loudly instead.
    n_rows = len(data)
    if train_start < 0 or num_dev > dev_end or max(train_start + num_train, dev_end) + 1 > n_rows:
        raise ValueError(
            "dataset has %d rows, which is too few for train_start=%d, num_train=%d, "
            "dev_end=%d, num_dev=%d" % (n_rows, train_start, num_train, dev_end, num_dev))

    # NOTE(review): the slice 1:9 leaves out Freq (column 9); confirm that is intended.
    feature_cols = slice(1, 9)
    label_col = 10

    # X rows are [start, start + n); Y rows are shifted by one to get the next row's label
    X_train = data[train_start:train_start + num_train, feature_cols]
    Y_train = data[train_start + 1:train_start + num_train + 1, label_col]
    X_dev = data[dev_end - num_dev:dev_end, feature_cols]
    Y_dev = data[dev_end - num_dev + 1:dev_end + 1, label_col]

    return X_train, Y_train, X_dev, Y_dev

# Generate Daily Dataset to compare model performance to minute data
def generate_daily_dataset(dataset, num_days_train, num_days_dev, min_to_day=1440):
    '''Aggregate minute rows into daily rows and build train and dev sets.

    Days are consecutive blocks of min_to_day rows starting at row 0; the
    first num_days_train days form the training set and the next
    num_days_dev days form the dev set.

    Assumed column order of dataset (as produced by import_data_file on the
    Coinbase CSV -- verify if the file layout changes):
    0 Timestamp, 1 Open, 2 High, 3 Low, 4 Close, 5 Volume_(BTC),
    6 Volume_(Currency), 7 Weighted_Price, 8 Variance, 9 Freq, 10 Up_Label

    Daily feature columns (8 per day):
    0 Open   -- open of the first minute of the day
    1 High   -- maximum of the minute highs
    2 Low    -- minimum of the minute lows
    3 Close  -- close of the last minute of the day
    4..7     -- daily mean of Volume_(BTC), Volume_(Currency),
                Weighted_Price and Variance

    Arguments:
    dataset -- pandas DataFrame of minute rows
    num_days_train -- number of training days
    num_days_dev -- number of dev days
    min_to_day -- number of rows that make up one day (default 1440)

    Returns:
    X_train -- (num_days_train, 8) daily features
    Y_train -- (num_days_train, 1) labels, 1 when the NEXT day's close is higher
    X_dev -- (num_days_dev, 8) daily features
    Y_dev -- (num_days_dev, 1) labels
    The last label of each set has no following day inside that set and is left at 0.
    '''

    # DataFrame.as_matrix() was removed from pandas; .values is the equivalent
    data = dataset.values
    total_days = num_days_train + num_days_dev
    close_col = 3

    daily = np.zeros((total_days, 8))
    for day in range(total_days):
        minutes = data[day*min_to_day:(day + 1)*min_to_day]
        # Start from the per-column mean, then overwrite the OHLC columns with
        # proper daily aggregates. Feature column k holds data column k + 1.
        daily[day, :] = np.mean(minutes[:, 1:9], axis = 0)
        daily[day, 0] = minutes[0, 1]
        daily[day, 1] = np.max(minutes[:, 2])
        daily[day, 2] = np.min(minutes[:, 3])
        daily[day, close_col] = minutes[-1, 4]

    # X_dev gets num_days_dev rows so that it lines up with Y_dev
    X_train = daily[:num_days_train]
    X_dev = daily[num_days_train:]

    # Label = 1 when the following day's close is above this day's close
    Y_train = np.zeros((num_days_train, 1))
    Y_dev = np.zeros((num_days_dev, 1))
    Y_train[:-1, 0] = X_train[1:, close_col] > X_train[:-1, close_col]
    Y_dev[:-1, 0] = X_dev[1:, close_col] > X_dev[:-1, close_col]

    return X_train, Y_train, X_dev, Y_dev

# Rescale the data so that the values along axis 1 (the feature values of
# each sample, for a 2-D array) add up to one.
# NOTE(review): this is a per-row scaling, not a per-feature one, so a feature
# with a much larger magnitude still dominates each row -- confirm intended.
def normalize(data):
    """Divide every entry of data by the sum of its row (sum over axis 1).

    Arguments:
    data -- NumPy array with at least two dimensions

    Returns:
    A new array of the same shape; the input array is not modified.
    """
    totals = data.sum(axis = 1)
    # Re-insert the summed axis so the totals broadcast against data
    return data / np.expand_dims(totals, axis = 1)

#First Try Model: fully connected baseline classifier

# Import required functions from Keras
from keras.models import Sequential
from keras.layers import Dense, Activation, BatchNormalization

# Get data file
BTC_Price = import_data_file(bitcoin_data_filename)
X, Y, X_dev, Y_dev = generate_min_dataset(BTC_Price, 10000, 10000) # generate minute dataset
# Alternative: train on daily data instead of minute data.
# NOTE(review): 10000 + 128 days is far more than the roughly three years named in the data file.
# X, Y, X_dev, Y_dev = generate_daily_dataset(BTC_Price, 10000, 128) # generate daily dataset
X = normalize(X)
X_dev = normalize(X_dev)

# Network: five hidden Dense(128, relu) layers, each followed by batch normalization,
# and a single sigmoid output unit for the up/down label.
# Trained with binary cross entropy as the loss function and the Adam optimizer.
model = Sequential()
# Take the input width from the data itself (X.shape[1]) rather than pylab's size()
model.add(Dense(128, activation='relu', input_dim = X.shape[1]))
model.add(BatchNormalization())
# Remaining four hidden blocks are identical
for _ in range(4):
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='Adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model with batch sizes of 32 samples, tracking dev-set metrics every epoch

history = model.fit(X, Y, epochs = 100, batch_size = 32, validation_data = (X_dev, Y_dev))


In [None]:
# Show which per-epoch metrics were recorded in the History object returned by model.fit
print(history.history.keys())

In [None]:
### Print final prediction accuracy
# Evaluate the trained model on the dev split (X_dev, Y_dev).
# preds[0] is printed as the loss and preds[1] as the accuracy metric.
preds = model.evaluate(X_dev, Y_dev, batch_size=128, verbose=1, sample_weight=None)
# Blank line to separate the evaluation progress output from the summary
print()
print ("Loss = " + str(preds[0]))
# NOTE(review): labelled "Test Accuracy" but computed on the dev set passed to evaluate above
print ("Test Accuracy = " + str(preds[1]))

In [None]:
# Summarize training history: one figure for accuracy and one for loss,
# each showing the training curve and the dev (validation) curve.
# Keras records the accuracy metric under 'acc' in older releases and under
# 'accuracy' in newer ones, so use whichever key is present in the history.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
for metric_key, metric_name in ((acc_key, 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title('model ' + metric_name)
    plt.ylabel(metric_name)
    plt.xlabel('epoch')
    # Two lines are plotted, so the legend needs one label for each
    plt.legend(['train', 'dev'], loc='upper left')
    plt.show()

In [None]:
#RNN model to compare with the fully connected network

# Import required functions from Keras
from keras.models import Sequential
from keras.layers import Dense, Activation, BatchNormalization, Dropout, Embedding, LSTM

# Get data file
BTC_Price = import_data_file(bitcoin_data_filename)
# NOTE(review): timesteps (minutes per day) is not used by the model below yet
timesteps = 1440

X, Y, X_dev, Y_dev = generate_min_dataset(BTC_Price, 10000, 10000)
X = normalize(X)
X_dev = normalize(X_dev)
batch_size = 32

# Specifically trying Long-Short term memory (LSTM) model others have suggested works for stock data.
# The features are continuous values, so they are fed to the LSTM directly. An Embedding
# layer is only meaningful for integer token indices and is therefore not used here.
# LSTM layers expect 3-D input shaped (samples, timesteps, features); each minute is
# presented as a one-step sequence.
# TODO: build sliding windows of several minutes per sample to give the LSTM real sequences.
X_seq = X.reshape((X.shape[0], 1, X.shape[1]))
X_dev_seq = X_dev.reshape((X_dev.shape[0], 1, X_dev.shape[1]))

model = Sequential()
model.add(LSTM(128, input_shape=(X_seq.shape[1], X_seq.shape[2])))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# Train the model with batch sizes of 32 samples, then score it on the dev set
model.fit(X_seq, Y, epochs = 20, batch_size = batch_size)
score = model.evaluate(X_dev_seq, Y_dev, batch_size=100)
# Prints the metric names followed by their values in one list
print(model.metrics_names + score)

In [None]:
#Calculating Accuracy of Lag Model to Test Polarity Baseline
# The lag model predicts that each row's direction (Up_Label) equals the previous row's.
# The data file is 1-minute data, so shift(1) is a one-minute lag.
BTC_Price['shift'] = BTC_Price['Up_Label'].shift(1)  # previous row's Up_Label (NaN for the first row)
BTC_Price['lag_correct'] = np.where(BTC_Price['Up_Label'] == BTC_Price['shift'],1,0)
# The first row has no previous row to predict from, so it is left out of the accuracy
acc = BTC_Price['lag_correct'].iloc[1:].mean()*100  # accuracy in percent
print("Lag model accuracy (%) = " + str(acc))

# Plotting Lag Model over 25 minutes to depict Lag Model.
# Only the rows inside the plotted timestamp window are drawn instead of the whole series.
lag_window = BTC_Price[BTC_Price['Timestamp'].between(1500000000, 1500001500)]
plt.plot(lag_window['Timestamp'], lag_window['Up_Label'].astype(float), color = 'b', linewidth=1.0, label='actual')
plt.plot(lag_window['Timestamp'], lag_window['shift'].astype(float), color = 'g', linewidth=1.0, label='lag prediction')
plt.axis([1500000000, 1500001500,-0.1,1.1])
plt.ylabel('Polarity Index',fontsize=12)
plt.title('Lag Model',fontsize=16)
plt.legend(loc='upper left')
plt.show()
