In [None]:
# CS 230 Final Project
# Scott Keene, Dante Zakhidov, Abdulmalik Obaid
# Bitcoin Price Prediction

# Type the directory where the data files are located
# NOTE(review): this is an absolute, machine-specific path; edit it before running elsewhere.
directory = "/Users/scottkeene/Documents/Python/CS 230 Project/input"
# Type the filename of the bitcoin price data file (starting with /)
# The leading "/" is required: file paths are built as `directory + filename`.
bitcoin_data_filename = "/coinbaseUSD_1-min_data_2014-12-01_to_2018-01-08 2.csv"
# Type the filename of the Google Trend data file (starting with /)
google_data_filename = "/Google Search Frequency.csv"

# Import these at the beginning of every session!
%matplotlib inline
%pylab inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression
import math
from scipy import stats

from subprocess import check_output
print(check_output(["ls", directory]).decode("utf8"))

# Import packages for keras
import numpy as np
from keras import layers
from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D
from keras.models import Model, load_model
from keras.preprocessing import image
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras.applications.imagenet_utils import preprocess_input
import pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
from keras.initializers import glorot_uniform
import scipy.misc
from matplotlib.pyplot import imshow
%matplotlib inline

import keras.backend as K

# Any results you write to the current directory are saved as output.


In [None]:
# Read in bitcoin pricing data from file
def import_data_file(directory, bitcoin_data_filename):
    """Load the bitcoin price CSV and append three derived columns.

    The file path is the plain string concatenation of ``directory`` and
    ``bitcoin_data_filename``.

    Columns added to the frame read from disk:
        Variance -- (Close - Open) / Close, in percent; negative when the
                    price fell over the row's interval.
        Freq     -- (High - Low) / High, in percent.
        Up_Label -- True where Close is strictly greater than Open.

    Returns the resulting DataFrame.
    """
    prices = pd.read_csv(directory + bitcoin_data_filename)

    opening, closing = prices["Open"], prices["Close"]
    highest, lowest = prices["High"], prices["Low"]

    # Open-to-close move relative to the close, as a percentage
    prices['Variance'] = ((closing - opening)/closing)*100

    # High-to-low range relative to the high, as a percentage
    prices['Freq'] = ((highest - lowest)/highest)*100

    # Binary up/down label
    prices['Up_Label'] = (closing > opening)

    return prices

# Load the data before previewing it: on a fresh kernel nothing has assigned
# BTC_Price yet at this point in the notebook.
BTC_Price = import_data_file(directory, bitcoin_data_filename)
BTC_Price.head() # verify data looks ok

In [None]:
# Convert the minute data to hour by hour data
def generate_hourly_dataset(dataset, num_hours):
    """Aggregate minute-level price rows into ``num_hours`` hourly bars.

    Assumes ``dataset`` holds one row per minute with no gaps and that its
    first five columns are Timestamp, Open, High, Low, Close, in that order
    -- TODO confirm against the CSV.

    Parameters:
        dataset   -- DataFrame of minute data (only the first five columns are read).
        num_hours -- number of 60-row blocks to aggregate, starting at row 0.

    Returns a DataFrame with columns Timestamp, Open, High, Low, Close,
    Variance, Freq and Up_Label, one row per hour.

    Raises ValueError if ``dataset`` has fewer than ``num_hours * 60`` rows.
    """
    min_to_hour = 60
    rows_needed = num_hours * min_to_hour
    if len(dataset) < rows_needed:
        raise ValueError("dataset has %d rows but %d hours need %d"
                         % (len(dataset), num_hours, rows_needed))

    # Read only the five price columns and cast to float, so a boolean column
    # further right (e.g. Up_Label) cannot force an object-dtype matrix.
    # (DataFrame.as_matrix() no longer exists in current pandas.)
    prices = np.asarray(dataset.iloc[:rows_needed, 0:5], dtype=float)
    blocks = prices.reshape(num_hours, min_to_hour, 5)

    X = np.zeros((num_hours, 5))
    if num_hours > 0:
        # Timestamps are synthesised from the first row in 3600-second steps
        X[:, 0] = prices[0, 0] + min_to_hour*60*np.arange(num_hours)
        X[:, 1] = blocks[:, 0, 1]               # Open of the first minute
        X[:, 2] = blocks[:, :, 2].max(axis=1)   # highest High in the hour
        X[:, 3] = blocks[:, :, 3].min(axis=1)   # lowest Low in the hour
        # Close of the hour's last minute. Reading column 1 of row
        # (i+1)*60 instead would return the *next* hour's Open and index one
        # row past the data for the final hour.
        X[:, 4] = blocks[:, -1, 4]
    X_data = pd.DataFrame(X, columns = ["Timestamp", "Open", "High", "Low", "Close"])

    # Open-to-close move relative to the close, in percent
    X_data['Variance'] = ((X_data["Close"] - X_data["Open"])/X_data["Close"])*100

    # High-to-low range relative to the high, in percent
    X_data['Freq'] = ((X_data["High"] - X_data["Low"])/X_data["High"])*100

    # Create binary classification
    X_data['Up_Label'] = (X_data["Close"] > X_data["Open"])

    return X_data

In [None]:
# Convert the minute data to day by day data
def generate_daily_dataset(dataset, num_days):
    """Aggregate minute-level price rows into ``num_days`` daily bars.

    Assumes ``dataset`` holds one row per minute with no gaps and that its
    first five columns are Timestamp, Open, High, Low, Close, in that order
    -- TODO confirm against the CSV.

    Parameters:
        dataset  -- DataFrame of minute data (only the first five columns are read).
        num_days -- number of 1440-row blocks to aggregate, starting at row 0.

    Returns a DataFrame with columns Timestamp, Open, High, Low, Close,
    Variance and Freq, one row per day. The function performs no file I/O
    and reads no notebook globals.

    Raises ValueError if ``dataset`` has fewer than ``num_days * 1440`` rows.
    """
    min_to_day = 1440
    rows_needed = num_days * min_to_day
    if len(dataset) < rows_needed:
        raise ValueError("dataset has %d rows but %d days need %d"
                         % (len(dataset), num_days, rows_needed))

    # Read only the five price columns and cast to float, so a boolean column
    # further right (e.g. Up_Label) cannot force an object-dtype matrix.
    # (DataFrame.as_matrix() no longer exists in current pandas.)
    prices = np.asarray(dataset.iloc[:rows_needed, 0:5], dtype=float)
    blocks = prices.reshape(num_days, min_to_day, 5)

    X = np.zeros((num_days, 5))
    X[:, 0] = blocks[:, 0, 0]               # timestamp of the day's first row
    X[:, 1] = blocks[:, 0, 1]               # Open of the first minute
    X[:, 2] = blocks[:, :, 2].max(axis=1)   # highest High in the day
    X[:, 3] = blocks[:, :, 3].min(axis=1)   # lowest Low in the day
    # Close of the day's last minute. Reading column 1 of row (i+1)*1440
    # instead would return the *next* day's Open and index one row past the
    # data for the final day.
    X[:, 4] = blocks[:, -1, 4]
    X_data = pd.DataFrame(X, columns = ["Timestamp", "Open", "High", "Low", "Close"])

    # Open-to-close move relative to the close, in percent
    X_data['Variance'] = ((X_data["Close"] - X_data["Open"])/X_data["Close"])*100

    # High-to-low range relative to the high, in percent
    X_data['Freq'] = ((X_data["High"] - X_data["Low"])/X_data["High"])*100

    return X_data



In [None]:
# no longer used, but you can use it to normalize the data. Used for simple NN
def normalize(data):
    """Scale ``data`` so that each column sums to one.

    Every entry is divided by the sum taken along axis 0. The input object is
    not modified; the scaled result is returned.
    """
    return data / data.sum(axis = 0)

In [None]:
## Baseline network for binary classification of whether the price will go up or down.
## The model built below is Embedding -> LSTM(128) -> Dense(1, sigmoid).
# Import required functions from Keras
from keras.models import Sequential
from keras.layers import Dense, Activation, BatchNormalization, LSTM

# Get data file
BTC_Price = import_data_file(directory, bitcoin_data_filename)
# NOTE(review): timesteps is assigned here but not used anywhere in this cell.
timesteps = 10

# NOTE(review): generate_min_dataset is not defined anywhere in the visible notebook;
# this cell only runs if it was defined in an earlier session. Change it to a
# day-level generator to compare different sampling frequencies.
X, Y, X_dev, Y_dev = generate_min_dataset(BTC_Price, 100000, 1000, start_index = 40000) # change generate_min_dataset to generate_day_dataset to compare different sampling frequencies
# Scale each column so it sums to one (train and dev are scaled independently)
X = normalize(X)
X_dev = normalize(X_dev)
# NOTE(review): batch_size is set to 32 here, but the training cell below passes batch_size = 100.
batch_size = 32

# Repeated imports (Sequential and LSTM were already imported above); Dropout is unused in this cell
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM

model = Sequential()
# NOTE(review): Embedding layers look up integer indices (here with a vocabulary of 8),
# while X is the output of normalize() -- confirm this input is what was intended.
model.add(Embedding(8, output_dim=256))
model.add(LSTM(128))
# Single sigmoid unit: probability that the price goes up
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [None]:
# Train the model for 2 epochs with mini-batches of 100 samples, then score it on the dev set
model.fit(X, Y, epochs = 2, batch_size = 100)
score = model.evaluate(X_dev, Y_dev, batch_size=100)
# Pair every metric name with its value instead of printing one flat mixed list
print(dict(zip(model.metrics_names, score)))

In [None]:
# Now, we want to test the performance of this vs. a simple lag model (no NN)

# Build the daily frame first: nothing earlier in the notebook defines X_day, and the
# baselines below need an Up_Label column, which generate_daily_dataset does not add.
X_day = generate_daily_dataset(BTC_Price, 1093)
X_day['Up_Label'] = (X_day["Close"] > X_day["Open"])

# New DataFrames holding only the columns the baseline models use
Baseline_Database_min = BTC_Price[['Timestamp', 'Open', 'Close','Up_Label']].copy()
Baseline_Database_day = X_day[['Timestamp','Open','Close','Up_Label']].copy()

# Plot the minute-level closing price over a fixed Unix-timestamp window
# (presumably November 2017 -- verify) and a $5000-$11000 price range
plt.plot(BTC_Price['Timestamp'],BTC_Price['Close'],linewidth=1.0)
plt.axis([1509494400, 1511913599,5000,11000])
plt.ylabel('Closing Price($)',fontsize=12)
plt.show()

# Last expression of the cell so that the preview is actually rendered
Baseline_Database_min.head()

In [None]:
# 1 Minute LAG Model
# Accuracy of a lag model on the minute data: predict that each minute's
# Up_Label repeats the previous minute's label.
Baseline_Database_min['shift'] = Baseline_Database_min['Up_Label'].shift(1) #new column with values of up_label shifted by 1
Baseline_Database_min['lag_correct'] = np.where(Baseline_Database_min['Up_Label'] == Baseline_Database_min['shift'],1,0)
acc_min = Baseline_Database_min['lag_correct'].sum()/Baseline_Database_min['lag_correct'].size*100  #accuracy in percent
# Flat 15-point baseline for the EWMA comparison plot (was built from an undefined name `acc`)
acc_lag_min = np.repeat(acc_min,15)

# Plot label vs. lagged label over a 1000-second window to depict the lag model
plt.plot(Baseline_Database_min['Timestamp'],Baseline_Database_min['Up_Label'],color = 'b', linewidth=1.0)
plt.plot(Baseline_Database_min['Timestamp'],Baseline_Database_min['shift'],color = 'g', linewidth=1.0)
plt.axis([1487512980, 1487513980,-0.1,1.1]) # x: Unix timestamps, y: label range with a small margin
plt.ylabel('Polarity Index',fontsize=12)
plt.title('Lag Model',fontsize=16)
plt.show()
print(acc_min)

Baseline_Database_min.head()

In [None]:
# 1 DAY Lag Model
# Accuracy of a lag model on the daily data: predict that each day's Up_Label
# repeats the previous day's label (no binning).
# Will compare to NN model and exponentially weighted moving average
Baseline_Database_day['shift'] = Baseline_Database_day['Up_Label'].shift(1) #new column with values of up_label shifted by 1
Baseline_Database_day['lag_correct'] = np.where(Baseline_Database_day['Up_Label'] == Baseline_Database_day['shift'],1,0)
acc_day = Baseline_Database_day['lag_correct'].sum()/Baseline_Database_day['lag_correct'].size*100  #accuracy in percent

# Plot label vs. lagged label over a 1,000,000-second window (about 11.6 days)
plt.plot(Baseline_Database_day['Timestamp'],Baseline_Database_day['Up_Label'],color = 'b', linewidth=1.0)
plt.plot(Baseline_Database_day['Timestamp'],Baseline_Database_day['shift'],color = 'g', linewidth=1.0)
plt.axis([1487512980, 1488512980,-0.1,1.1]) # x: Unix timestamps, y: label range with a small margin
plt.ylabel('Polarity Index',fontsize=12)
plt.title('One Day Lag Model',fontsize=16)
plt.show()
print(acc_day)

In [None]:
# Calculating the Exponentially Weighted Moving Average.
# For Min dataset, binary classification of if price went up/down
acc_ewma_min = []
# Cast once: the boolean labels are averaged as 0.0 / 1.0
up_label_min = Baseline_Database_min['Up_Label'].astype(float)
for t in range(1,16):
    # Series.ewm(...).mean() replaces the removed top-level pd.ewma()
    Baseline_Database_min['EWMA'] = up_label_min.ewm(span=t).mean()
    # Threshold the average into an up (1) / down (0) prediction ...
    Baseline_Database_min['EWMA'] = np.where(Baseline_Database_min['EWMA'] >= 0.5,1,0)
    # ... and shift it one row so each row is predicted from earlier rows only
    Baseline_Database_min['EWMA'] = Baseline_Database_min['EWMA'].shift(1)
    Baseline_Database_min['EWMA_correct'] = np.where(Baseline_Database_min['Up_Label'] == Baseline_Database_min['EWMA'],1,0)
    acc_t = Baseline_Database_min['EWMA_correct'].sum()/Baseline_Database_min['EWMA_correct'].size*100
    acc_ewma_min.append(acc_t)

# plot performance of exponentially weighted moving average for min dataset
print(acc_ewma_min)
plt.plot(range(1,16),acc_ewma_min, color = 'b',linewidth = 1.0)
plt.show()

In [None]:
# Calculating the Exponentially Weighted Moving Average
# For Day dataset, binary classification of if price went up/down
acc_ewma_day_nobin = []
# Cast once: the boolean labels are averaged as 0.0 / 1.0
up_label_day = Baseline_Database_day['Up_Label'].astype(float)
for t in range(1,150):
    # Series.ewm(...).mean() replaces the removed top-level pd.ewma()
    Baseline_Database_day['EWMA'] = up_label_day.ewm(span=t).mean()
    # Threshold the average into an up (1) / down (0) prediction ...
    Baseline_Database_day['EWMA'] = np.where(Baseline_Database_day['EWMA'] >= 0.5,1,0)
    # ... and shift it one row so each row is predicted from earlier rows only
    Baseline_Database_day['EWMA'] = Baseline_Database_day['EWMA'].shift(1)
    Baseline_Database_day['EWMA_correct'] = np.where(Baseline_Database_day['Up_Label'] == Baseline_Database_day['EWMA'],1,0)
    acc_t = Baseline_Database_day['EWMA_correct'].sum()/Baseline_Database_day['EWMA_correct'].size*100
    acc_ewma_day_nobin.append(acc_t)

# plot performance of exponentially weighted moving average for daily dataset

print(acc_ewma_day_nobin)
plt.plot(range(1,150),acc_ewma_day_nobin, color = 'b',linewidth = 1.0)
plt.xlabel('Number of Days Considered',fontsize=12)
plt.ylabel('Accuracy',fontsize=12)
plt.show()

In [None]:
# For Min dataset, binned labelling of the data to see how much the price fluctuated
# Calculating the Exponentially Weighted Moving Average
acc_ewma_min = []
binner_settings = [-0.025, -0.01, -0.003, 0.003, 0.01, 0.025]
delta = Baseline_Database_min["Close"] - Baseline_Database_min["Open"]
Baseline_Database_min['percent_delta'] = np.divide(delta, Baseline_Database_min["Open"])
Y_data_min = generate_binned_labels_custom(Baseline_Database_min['percent_delta'], binner_settings = binner_settings)
for t in range(1,2):
    # Series.ewm(...).mean() replaces the removed top-level pd.ewma();
    # the one-row shift makes each prediction depend on earlier rows only
    percent_delta = Baseline_Database_min['percent_delta'].ewm(span=t).mean().shift(1)
    Y_data_t = generate_binned_labels_custom(percent_delta, binner_settings = binner_settings)
    # A prediction is correct when its one-hot row matches the true one-hot row exactly
    Baseline_Database_min['EWMA_correct'] = np.where(np.sum(np.abs(np.subtract(Y_data_min,Y_data_t)),axis=1)==0,1,0)
    # Score the minute frame that was just updated (this previously read the daily frame)
    acc_t = Baseline_Database_min['EWMA_correct'].sum()/Baseline_Database_min['EWMA_correct'].size*100
    acc_ewma_min.append(acc_t)

print(acc_ewma_min)
plt.plot(range(1,2),acc_ewma_min, color = 'b',linewidth = 1.0)
plt.show()

# Persist the labels and read them back.
# NOTE(review): absolute path on one author's machine; the other cells read from `directory`.
Y_data_min.to_csv("C:\\Users\\Dante\\Documents\\Stanford\\Coursework\\Winter 2018\\CS230\\Project\\Y_data.csv")
Y_data_min = pd.read_csv("C:\\Users\\Dante\\Documents\\Stanford\\Coursework\\Winter 2018\\CS230\\Project\\Y_data.csv")
# to_csv wrote the index as an unnamed first column; drop it again
Y_data_min = Y_data_min.drop(['Unnamed: 0'], axis =1)
# Share of rows falling into each bin
plt.bar(np.linspace(0, len(binner_settings), len(binner_settings)+1), np.sum(Y_data_min, axis = 0)/np.sum(np.sum(Y_data_min, axis = 0)))

In [None]:
# For Day dataset, binned labelling of the data to see how much the price fluctuated
# Calculating the Exponentially Weighted Moving Average
acc_ewma_day = []
binner_settings = [-0.025, -0.01, -0.003, 0.003, 0.01, 0.025]
delta = Baseline_Database_day["Close"] - Baseline_Database_day["Open"]
Baseline_Database_day['percent_delta'] = np.divide(delta, Baseline_Database_day["Open"])
Y_data = generate_binned_labels_custom(Baseline_Database_day['percent_delta'], binner_settings = binner_settings)
for t in range(1,16):
    # Series.ewm(...).mean() replaces the removed top-level pd.ewma();
    # the one-row shift makes each prediction depend on earlier rows only
    percent_delta = Baseline_Database_day['percent_delta'].ewm(span=t).mean().shift(1)
    Y_data_t = generate_binned_labels_custom(percent_delta, binner_settings = binner_settings)
    # A prediction is correct when its one-hot row matches the true one-hot row exactly
    Baseline_Database_day['EWMA_correct'] = np.where(np.sum(np.abs(np.subtract(Y_data,Y_data_t)),axis=1)==0,1,0)
    acc_t = Baseline_Database_day['EWMA_correct'].sum()/Baseline_Database_day['EWMA_correct'].size*100
    acc_ewma_day.append(acc_t)

# span=1 puts all weight on the previous row, i.e. the plain one-day lag model
acc_lag_day = np.repeat(acc_ewma_day[0],15)
print(acc_ewma_day, acc_lag_day)
plt.plot(range(1,16),acc_ewma_day, color = 'b',linewidth = 2.0)
plt.xlabel('Number of Days Considered',fontsize=12)
plt.ylabel('Accuracy',fontsize=12)
#plt.title('Baseline: Exponentially Weighted Moving Average',fontsize=16)
plt.show()

In [None]:
#Plotting accuracy vs. length of weighted average
# Each curve gets an x-axis built from its own length: acc_ewma_min holds a single
# point after the binned-minute cell above, while the lag baseline has 15.
plt.plot(np.arange(1, len(acc_ewma_min) + 1), acc_ewma_min, color = 'b', linewidth = 2.0)
plt.plot(np.arange(1, len(acc_lag_min) + 1), acc_lag_min, color = 'g', linewidth = 2.0)
plt.xlabel('Length of Span',fontsize=12)
plt.ylabel('Accuracy',fontsize=12)
plt.title('Exponentially Weighted Moving Average',fontsize=16)
plt.show()

In [None]:
# With a good understanding of the baseline performance, back to NN. 
# Generating binned data, to predict the amount price fluctuations will be
def generate_binned_labels_V2(X, number_of_bins, max_bin):
    """One-hot encode each row's open-to-close change into equal-width bins.

    The change is (Close - Open) / Open, a fraction (0.01 means 1%).
    Bin 0 collects changes below ``-max_bin``, the last bin collects changes
    at or above ``max_bin``, and the ``number_of_bins - 2`` bins in between
    split [-max_bin, max_bin) into equal widths; each interior bin includes
    its lower edge and excludes its upper edge. A NaN change produces an
    all-zero row.

    Parameters:
        X              -- object with "Open" and "Close" columns of equal length.
        number_of_bins -- total number of bins, at least 3.
        max_bin        -- positive fraction bounding the interior bins.

    Returns a DataFrame (default integer index) with one column per bin.
    NOTE(review): the column names end in "%" although the edges are
    fractions; the strings are kept as they are.

    Raises ValueError if ``number_of_bins`` is smaller than 3.
    """
    if number_of_bins < 3:
        raise ValueError("number_of_bins must be at least 3 (two open-ended bins plus one interior bin)")

    # Positional arrays: the result no longer depends on the index labels of X
    opens = np.asarray(X["Open"], dtype=float)
    closes = np.asarray(X["Close"], dtype=float)
    with np.errstate(divide="ignore", invalid="ignore"):
        percent_delta = (closes - opens) / opens

    bin_range = 2*max_bin/(number_of_bins-2)
    # (lower, upper) edge of every interior bin, in bin order
    interior_edges = [(-max_bin + bin_range*(j-1), -max_bin + bin_range*j)
                      for j in range(1, number_of_bins-1)]

    ## Generate Y as a one hot vector
    Y = np.zeros((len(percent_delta), number_of_bins))
    with np.errstate(invalid="ignore"):
        below = percent_delta < -max_bin
        above = ~below & (percent_delta >= max_bin)
        inside = ~(below | above)
        Y[below, 0] = 1
        Y[above, number_of_bins-1] = 1
        for j, (lower, upper) in enumerate(interior_edges, start=1):
            Y[inside & (percent_delta >= lower) & (percent_delta < upper), j] = 1

    columns = ["<" + str(-max_bin) + "%"]
    columns.extend(str(lower) + "% to " + str(upper) + "%" for lower, upper in interior_edges)
    columns.append(">" + str(max_bin) + "%")
    return pd.DataFrame(Y, columns = columns)

In [None]:
## 4 layer NN with softmax activation, used for 'binned' data to predict how much the price will fluctuate
def price_data_simple_model(timesteps, input_dim, n_a, number_of_bins):
    """Build and compile a three-LSTM softmax classifier.

    Layers, in order: two LSTMs of ``n_a`` units that return full sequences
    (the first one declares the (timesteps, input_dim) input shape), one LSTM
    of ``n_a`` units that returns its last output only, and a Dense softmax
    layer with ``number_of_bins`` units. The model is compiled with
    categorical cross-entropy, the rmsprop optimizer and an accuracy metric.

    Returns the compiled Sequential model.
    """
    # Keras is imported locally, as in the other model builders of this notebook
    from keras.models import Sequential
    from keras.layers import Dense, LSTM

    layer_stack = [
        LSTM(n_a, return_sequences = True, input_shape = (timesteps, input_dim)),
        LSTM(n_a, return_sequences = True),
        LSTM(n_a),
        Dense(number_of_bins, activation = 'softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'rmsprop', metrics = ['accuracy'])

    return model


In [None]:
# Get data file
# NOTE(review): this cell calls generate_binned_labels_custom and rearrange_training_data,
# which are defined in later cells; run those definitions first.
BTC_Price = import_data_file(directory, bitcoin_data_filename)

X_daily = generate_daily_dataset(BTC_Price, 1093)
# generate_binned_labels_custom expects the per-row fractional change, not the whole frame
delta = X_daily["Close"] - X_daily["Open"]
percent_delta = np.divide(delta, X_daily["Open"])
Y_daily = generate_binned_labels_custom(percent_delta)

# .values replaces DataFrame.as_matrix(), which no longer exists in current pandas
X = X_daily.values
Y = Y_daily.values

print(Y_daily.head()) # verify data looks ok

# Run model
timesteps = 20
X_width = 6 # feature columns after Timestamp: Open, High, Low, Close, Variance, Freq
# rearrange_training_data takes the full matrix and skips column 0 (Timestamp) itself
X = rearrange_training_data(X, timesteps, X_width)
X_train = X[0:999,:,:]
X_dev = X[1000:1049,:,:]

# NOTE(review): labels start at row 1+timesteps while window d covers rows d..d+timesteps-1,
# so each label lies two rows past the end of its window -- confirm the intended horizon.
Y_train = Y[1+timesteps:1000+timesteps,:]
Y_dev = Y[1001+timesteps:1050+timesteps,:]
# One output unit per label column, so the softmax width always matches Y
model = price_data_simple_model(timesteps, X_width, 32, Y.shape[1])
model.summary()
model.fit(X_train, Y_train, epochs = 10, validation_data=(X_dev, Y_dev))

In [None]:
# plot distribution of price fluctuations
# One bar per label column, in column order, so the plot works for any number of bins
plt.bar(np.arange(Y.shape[1]), Y.sum(axis = 0))

In [None]:
## Inputs ## Parameters to tune. These work reasonably well
timesteps = 8
bins = 5
max_bin = 0.1
n_a = 64
X_width = 6
batch_size = 32
loss = "categorical_crossentropy"

X_daily = generate_daily_dataset(BTC_Price, 1093)
Y_daily = generate_binned_labels_V2(X_daily, bins, max_bin)

# .values replaces DataFrame.as_matrix(), which no longer exists in current pandas
X = X_daily.values
Y = Y_daily.values

# The model's first LSTM is stateful with a fixed batch size, so the sample
# counts are rounded down to whole batches
m = (1000 // batch_size) * batch_size
n = (93 // batch_size) * batch_size

# rearrange_training_data takes the full matrix and skips column 0 (Timestamp) itself
X = rearrange_training_data(X, timesteps, X_width)
X_train = X[0:m,:,:]
X_dev = X[m+1:m+n+1,:,:]

# NOTE(review): labels start at row 1+timesteps while window d covers rows d..d+timesteps-1,
# so each label lies two rows past the end of its window -- confirm the intended horizon.
Y_train = Y[1+timesteps:1+m+timesteps,:]
Y_dev = Y[m+2+timesteps:m+2+n+timesteps,:]

model = price_data_binned_model(timesteps, X_width, n_a, bins, batch_size, loss)
model.summary()
model.fit(X_train, Y_train, epochs = 100, batch_size = batch_size, validation_data=(X_dev, Y_dev))


In [None]:
## Define Model
#  Add more layers to the model
def price_data_binned_model(timesteps, input_dim, n_a, number_of_bins, batch_size, loss = 'categorical_crossentropy'):
    """Build and compile a batch-normalised, three-LSTM classifier.

    Layers, in order: BatchNormalization declaring a fixed
    (batch_size, timesteps, input_dim) input, a stateful LSTM returning
    sequences, BatchNormalization, an LSTM returning sequences,
    BatchNormalization, an LSTM returning its last output, BatchNormalization
    and a Dense output layer of ``number_of_bins`` units.

    Parameters:
        timesteps      -- length of every input window.
        input_dim      -- number of features per timestep.
        n_a            -- number of units in each LSTM.
        number_of_bins -- number of output units.
        batch_size     -- fixed batch size; because the first LSTM is stateful,
                          arrays passed to fit must hold a whole number of batches.
        loss           -- Keras loss; defaults to categorical cross-entropy so
                          that callers passing five arguments keep working.

    Returns the compiled Sequential model (rmsprop optimizer, accuracy metric).
    """
    # Import required functions from Keras
    from keras.models import Sequential
    from keras.layers import Dense, BatchNormalization, LSTM

    # Softmax over a single unit always outputs 1.0, so a one-unit (binary)
    # head needs a sigmoid to produce a usable probability.
    output_activation = 'sigmoid' if number_of_bins == 1 else 'softmax'

    model = Sequential()
    model.add(BatchNormalization(batch_input_shape = (batch_size, timesteps, input_dim)))
    model.add(LSTM(n_a, return_sequences = True, stateful = True))
    model.add(BatchNormalization())
    model.add(LSTM(n_a, return_sequences = True))
    model.add(BatchNormalization())
    model.add(LSTM(n_a))
    model.add(BatchNormalization())
    model.add(Dense(number_of_bins, activation = output_activation))
    model.compile(loss = loss, optimizer = 'rmsprop', metrics = ['accuracy'])

    return model

In [None]:
# generate binned data to predict amount of price fluctuations. This was to test the ability to predict large fluctuations
def generate_binned_labels_log_spacing(X, number_of_bins, max_bin, bin_range):
    """One-hot encode each row's open-to-close change into log-spaced bins.

    The change is (Close - Open) / Open, a fraction. Bin 0 collects changes
    below ``-max_bin`` and the last bin collects changes at or above
    ``max_bin``. The interior edges are symmetric around zero: the positive
    ones are ``number_of_bins // 2 - 1`` points log-spaced from
    ``max_bin / 10**bin_range`` up to ``max_bin``, mirrored for the negative
    side, with 0 in the middle. Each interior bin includes its lower edge and
    excludes its upper edge. A NaN change produces an all-zero row.

    Parameters:
        X              -- object with "Open" and "Close" columns of equal length.
        number_of_bins -- total number of bins; must be even and at least 4.
        max_bin        -- positive fraction bounding the interior bins.
        bin_range      -- number of decades below ``max_bin`` covered by the
                          log-spaced edges.

    Returns a DataFrame (default integer index) with one column per bin.
    NOTE(review): the column names end in "%" although the edges are
    fractions; the strings are kept as they are.

    Raises ValueError for an odd ``number_of_bins`` or one smaller than 4.
    """
    if number_of_bins < 4 or number_of_bins % 2 != 0:
        raise ValueError("number_of_bins must be an even number >= 4")

    # Positional arrays: the result does not depend on the index labels of X
    opens = np.asarray(X["Open"], dtype=float)
    closes = np.asarray(X["Close"], dtype=float)
    with np.errstate(divide="ignore", invalid="ignore"):
        percent_delta = (closes - opens) / opens

    # np.logspace needs an integer count; bin_range is the caller's value and
    # is no longer overwritten with the linear bin width.
    edges_per_side = number_of_bins // 2 - 1
    positive = np.logspace(np.log10(max_bin) - bin_range, np.log10(max_bin), edges_per_side)
    # Pin the outermost edge to max_bin exactly, so no gap opens next to the
    # open-ended bins (also covers the single-edge case, where logspace
    # returns only its start value).
    positive[-1] = max_bin
    spacing = np.concatenate((-positive[::-1], [0.0], positive))

    ## Generate Y as a one hot vector
    Y = np.zeros((len(percent_delta), number_of_bins))
    with np.errstate(invalid="ignore"):
        below = percent_delta < -max_bin
        above = ~below & (percent_delta >= max_bin)
        inside = ~(below | above)
        Y[below, 0] = 1
        Y[above, number_of_bins-1] = 1
        for j in range(1, number_of_bins-1):
            Y[inside & (percent_delta >= spacing[j-1]) & (percent_delta < spacing[j]), j] = 1

    columns = ["<" + str(-max_bin) + "%"]
    for k in range(1, number_of_bins-1):
        columns.append(str(spacing[k-1]) + "% to " + str(spacing[k]) + "%")
    columns.append(">" + str(max_bin) + "%")
    return pd.DataFrame(Y, columns = columns)

In [None]:
# generate binned data. This is the one we stuck with that works best. This distrubution is even across the last 3 years of bitcon pricing data
def generate_binned_labels_custom(percent_delta, binner_settings = [-0.025, -0.01, -0.003, 0.003, 0.01, 0.025]):
    """One-hot encode fractional price changes into bins with explicit edges.

    With edges e0 < e1 < ... < ek there are k + 2 bins: bin 0 holds values
    below e0, bin j (1 <= j <= k) holds values in [e(j-1), ej), and the last
    bin holds values at or above ek. A NaN value produces an all-zero row.

    Parameters:
        percent_delta   -- 1-D sequence (list, array or Series) of changes.
                           Values are read by position, so the index labels
                           of a Series do not matter.
        binner_settings -- ascending bin edges; the default list is only read,
                           never modified.

    Returns a DataFrame (default integer index) with one column per bin.
    NOTE(review): the column names end in "%" although the edges are
    fractions; the strings are kept as they are.

    Raises ValueError if ``percent_delta`` is not one-dimensional (for
    example when a whole DataFrame is passed).
    """
    edges = list(binner_settings)
    number_of_bins = len(edges)+1

    values = np.asarray(percent_delta, dtype=float)
    if values.ndim != 1:
        raise ValueError("percent_delta must be one-dimensional, got shape %s" % (values.shape,))

    ## Generate Y as a one hot vector
    # Whole-column comparisons: one pass per bin instead of a Python loop per row
    Y = np.zeros((len(values), number_of_bins))
    with np.errstate(invalid="ignore"):
        below = values < edges[0]
        above = ~below & (values >= edges[number_of_bins-2])
        inside = ~(below | above)
        Y[below, 0] = 1
        Y[above, number_of_bins-1] = 1
        for j in range(1, number_of_bins-1):
            Y[inside & (values >= edges[j-1]) & (values < edges[j]), j] = 1

    columns = ["<" + str(edges[0]) + "%"]
    for k in range(1, number_of_bins-1):
        columns.append(str(edges[k-1]) + "% to " + str(edges[k]) + "%")
    columns.append(">" + str(edges[number_of_bins-2]) + "%")
    return pd.DataFrame(Y, columns = columns)

## 

In [None]:
# This cell runs the extended model with the daily dataset, where fluctuations are binned based on amount
# Overfits the data! Doesn't generalize to dev set
## Inputs ##
timesteps = 4
n_a = 64
X_width = 6
batch_size = 32
loss = "categorical_crossentropy"
#bin settings
binner_settings = [-0.025, -0.01, -0.003, 0.003, 0.01, 0.025]

# import_data_file takes the directory as its first argument
BTC_Price = import_data_file(directory, bitcoin_data_filename)

X_daily = generate_daily_dataset(BTC_Price, 1093)
delta = X_daily["Close"] - X_daily["Open"]
percent_delta = np.divide(delta, X_daily["Open"])
Y_daily = generate_binned_labels_custom(percent_delta, binner_settings)

# .values replaces DataFrame.as_matrix(), which no longer exists in current pandas
X = X_daily.values
Y = Y_daily.values

# The model's first LSTM is stateful with a fixed batch size, so the sample
# counts are rounded down to whole batches
m = (800 // batch_size) * batch_size
n = (200 // batch_size) * batch_size

# rearrange_training_data takes the full matrix and skips column 0 (Timestamp) itself
X = rearrange_training_data(X, timesteps, X_width)
X_train = X[0:m,:,:]
X_dev = X[m+1:m+n+1,:,:]

# NOTE(review): labels start at row 1+timesteps while window d covers rows d..d+timesteps-1,
# so each label lies two rows past the end of its window -- confirm the intended horizon.
Y_train = Y[1+timesteps:1+m+timesteps,:]
Y_dev = Y[m+2+timesteps:m+2+n+timesteps,:]
print(Y_train)
# Share of days falling into each bin
plt.bar(np.linspace(0, len(binner_settings), len(binner_settings)+1), np.sum(Y, axis = 0)/np.sum(np.sum(Y, axis = 0)))

model = price_data_binned_model(timesteps, X_width, n_a, len(binner_settings)+1, batch_size, loss)
model.summary()
#model.fit(X_train, Y_train, epochs = 500, batch_size = batch_size, validation_data=(X_dev, Y_dev))


In [None]:
# This cell runs the extended model with the daily dataset with up/down labels for price fluctuation (binary)
## Inputs ##
timesteps = 4
n_a = 64
X_width = 6
batch_size = 32
loss = "binary_crossentropy"

BTC_Price = import_data_file(directory, bitcoin_data_filename)

X_daily = generate_daily_dataset(BTC_Price, 1093)
Y_daily = (X_daily["Close"] > X_daily["Open"])

# .values replaces as_matrix(), which no longer exists in current pandas;
# the boolean labels are cast to 0.0 / 1.0 for the loss
X = X_daily.values
Y = Y_daily.values.astype(float)

# The model's first LSTM is stateful with a fixed batch size, so the sample
# counts are rounded down to whole batches
m = (800 // batch_size) * batch_size
n = (200 // batch_size) * batch_size

# rearrange_training_data takes the full matrix and skips column 0 (Timestamp) itself
X = rearrange_training_data(X, timesteps, X_width)
X_train = X[0:m,:,:]
X_dev = X[m+1:m+n+1,:,:]

# NOTE(review): labels start at row 1+timesteps while window d covers rows d..d+timesteps-1,
# so each label lies two rows past the end of its window -- confirm the intended horizon.
Y_train = Y[1+timesteps:1+m+timesteps]
Y_dev = Y[m+2+timesteps:m+2+n+timesteps]
print(Y_train)


# NOTE(review): this requests a single output unit; verify that the model's output
# activation is suitable for one unit (a softmax over one unit is constant).
model = price_data_binned_model(timesteps, X_width, n_a, 1, batch_size, loss)
model.summary()
training = model.fit(X_train, Y_train, epochs = 500, batch_size = batch_size, validation_data=(X_dev, Y_dev))

generate_plots(training)


In [None]:
# This cell runs the extended model with the hourly dataset with binned labels
## Inputs ##
timesteps = 4
n_a = 64
X_width = 6
batch_size = 32
binner_settings = [-0.025, -0.01, -0.003, 0.003, 0.01, 0.025]
loss = "categorical_crossentropy"

BTC_Price = import_data_file(directory, bitcoin_data_filename)

X_hourly = generate_hourly_dataset(BTC_Price, 26232)
delta = X_hourly["Close"] - X_hourly["Open"]
percent_delta = np.divide(delta, X_hourly["Open"])
Y_hourly = generate_binned_labels_custom(percent_delta, binner_settings)

# .values replaces DataFrame.as_matrix(), which no longer exists in current pandas
X = X_hourly.values
Y = Y_hourly.values

# The model's first LSTM is stateful with a fixed batch size, so the sample
# counts are rounded down to whole batches
m = (20000 // batch_size) * batch_size
n = (6000 // batch_size) * batch_size

# rearrange_training_data takes the full matrix and skips column 0 (Timestamp) itself
X = rearrange_training_data(X, timesteps, X_width)
X_train = X[0:m,:,:]
X_dev = X[m+1:m+n+1,:,:]

# NOTE(review): labels start at row 1+timesteps while window d covers rows d..d+timesteps-1,
# so each label lies two rows past the end of its window -- confirm the intended horizon.
Y_train = Y[1+timesteps:1+m+timesteps,:]
Y_dev = Y[m+2+timesteps:m+2+n+timesteps,:]

# Share of hours falling into each bin (a fraction, despite the axis label)
plt.bar(np.linspace(0, len(binner_settings), len(binner_settings)+1), np.sum(Y, axis = 0)/np.sum(np.sum(Y, axis = 0)))
plt.ylabel('Frequency (%)')
plt.xlabel('Bin Number')
plt.show()


model = price_data_binned_model_w_dropout(timesteps, X_width, n_a, len(binner_settings)+1, batch_size, loss, 0.5)
model.summary()
training = model.fit(X_train, Y_train, epochs = 500, batch_size = batch_size, validation_data=(X_dev, Y_dev))
generate_plots(training)

In [None]:
# plots to visualize model performance
def generate_plots(training):
    """Plot train/dev loss and accuracy per epoch from a Keras fit result.

    ``training`` is the object returned by ``model.fit``; its ``history``
    dict must contain 'loss' and 'val_loss' plus an accuracy metric and its
    'val_' counterpart. Two figures are shown, loss first and accuracy second.
    """
    history = training.history
    # Older Keras releases log the accuracy metric as 'acc' / 'val_acc',
    # newer ones as 'accuracy' / 'val_accuracy'; accept either.
    accuracy_key = 'acc' if 'acc' in history else 'accuracy'

    for metric, axis_label in (('loss', 'Loss'), (accuracy_key, 'Accuracy')):
        plt.plot(history[metric])
        plt.plot(history['val_' + metric])
        plt.ylabel(axis_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Dev'], loc='upper right')
        plt.show()

In [None]:
# Re-draw the loss/accuracy curves of the most recent `training` result
generate_plots(training)

In [None]:
## Define Model, wanted to test deeper network
# Add more layers to the model
def price_data_binned_model_w_dropout(timesteps, input_dim, n_a, number_of_bins, batch_size, loss, drop_rate):
    """Build and compile the three-LSTM classifier with recurrent dropout.

    Layers, in order: BatchNormalization declaring a fixed
    (batch_size, timesteps, input_dim) input, then three LSTMs of ``n_a``
    units, each followed by BatchNormalization, and a Dense softmax layer
    with ``number_of_bins`` units. The first LSTM is stateful; the first two
    return full sequences. Every LSTM uses ``drop_rate`` as its recurrent
    dropout. Compiled with ``loss``, the rmsprop optimizer and an accuracy metric.

    Returns the compiled Sequential model.
    """
    from keras.models import Sequential
    from keras.layers import Dense, BatchNormalization, LSTM

    layer_stack = [
        BatchNormalization(batch_input_shape = (batch_size, timesteps, input_dim)),
        LSTM(n_a, return_sequences = True, stateful = True, recurrent_dropout = drop_rate),
        BatchNormalization(),
        LSTM(n_a, return_sequences = True, recurrent_dropout = drop_rate),
        BatchNormalization(),
        LSTM(n_a, recurrent_dropout = drop_rate),
        BatchNormalization(),
        Dense(number_of_bins, activation = 'softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss = loss, optimizer = 'rmsprop', metrics = ['accuracy'])

    return model

In [None]:
## Define Model
#  Add more layers to the model
def price_data_binned_model_w_dropout_simple(timesteps, input_dim, n_a, number_of_bins, batch_size, loss, drop_rate):
    """Build and compile a single-LSTM classifier with a dense head.

    Layers, in order: BatchNormalization declaring a fixed
    (batch_size, timesteps, input_dim) input, one stateful LSTM of ``n_a``
    units returning its last output, Dropout(``drop_rate``),
    BatchNormalization, two ReLU Dense layers (``2 * n_a`` then ``n_a``
    units) and a Dense softmax layer with ``number_of_bins`` units.
    Compiled with ``loss``, the Adam optimizer and an accuracy metric.

    Returns the compiled Sequential model.
    """
    from keras.models import Sequential
    from keras.layers import Dense, BatchNormalization, LSTM, Dropout

    layer_stack = [
        BatchNormalization(batch_input_shape = (batch_size, timesteps, input_dim)),
        LSTM(n_a, return_sequences = False, stateful = True),
        Dropout(drop_rate),
        BatchNormalization(),
        Dense(n_a*2, activation = 'relu'),
        Dense(n_a, activation = 'relu'),
        Dense(number_of_bins, activation = 'softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss = loss, optimizer = 'Adam', metrics = ['accuracy'])

    return model

In [None]:
# Load the Google Trend data; the path is `directory + google_data_filename`
search_history = pd.read_csv(directory + google_data_filename) 

In [None]:
def rearrange_training_data(X, timesteps, X_width):
    """Stack sliding windows of ``X`` into a 3-D array of model inputs.

    Window ``d`` holds rows ``d .. d + timesteps - 1`` and columns
    ``1 .. X_width`` of ``X``; column 0 is skipped. ``len(X) - timesteps``
    windows are produced, so the window starting at the last possible row is
    not included.

    Parameters:
        X         -- 2-D array with at least ``X_width + 1`` columns.
        timesteps -- number of consecutive rows per window.
        X_width   -- number of feature columns copied into each window.

    Returns a float array of shape (len(X) - timesteps, timesteps, X_width).

    Raises ValueError if ``timesteps`` exceeds the number of rows.
    """
    days = len(X)
    if timesteps > days:
        raise ValueError("timesteps (%d) exceeds the number of rows (%d)" % (timesteps, days))

    # np.zeros is used explicitly so the function does not rely on `%pylab`
    # having injected a bare `zeros` into the namespace
    X_rearranged = np.zeros((days-timesteps, timesteps, X_width))
    for d in range(0, days-timesteps):
        X_rearranged[d] = X[d:d+timesteps, 1:X_width+1]

    return X_rearranged

In [None]:
# This cell runs the extended model with the daily dataset with binned labels and google search data
# To test if feeding the network google search data would improve dev performance
## Inputs ##
timesteps = 4
n_a = 64
X_width = 7 # Open, High, Low, Close, Variance, Freq + Google search frequency
batch_size = 32
binner_settings = [-0.025, -0.01, -0.003, 0.003, 0.01, 0.025]
loss = "categorical_crossentropy"

BTC_Price = import_data_file(directory, bitcoin_data_filename)

X_daily = generate_daily_dataset(BTC_Price, 1093)
delta = X_daily["Close"] - X_daily["Open"]
percent_delta = np.divide(delta, X_daily["Open"])
Y_daily = generate_binned_labels_custom(percent_delta, binner_settings)

# Add Google daily search history as an input. Only the 'Search History' series is
# attached (not the whole CSV frame).
# Assumes one CSV row per day, aligned with X_daily -- TODO confirm.
search_history = pd.read_csv(directory + google_data_filename)
X_daily['Google'] = search_history['Search History'].values

# .values replaces DataFrame.as_matrix(), which no longer exists in current pandas
X = X_daily.values
Y = Y_daily.values

X2 = rearrange_training_data(X, timesteps, X_width)
# 960 training and 96 dev windows: both are multiples of batch_size, which the
# stateful LSTM with a fixed batch size requires
X_train = X2[133-1-timesteps:1093-1-timesteps,:,:]
X_dev = X2[10-1-timesteps:106-1-timesteps,:,:]
print(len(X_dev))

# NOTE(review): window d covers rows d..d+timesteps-1 and is paired with label row
# d+timesteps+1 -- confirm the intended prediction horizon.
Y_train = Y[133:1093,:]
Y_dev = Y[10:106,:]
print(len(Y_dev))
# Share of days falling into each bin (a fraction, despite the axis label)
plt.bar(np.linspace(0, len(binner_settings), len(binner_settings)+1), np.sum(Y, axis = 0)/np.sum(np.sum(Y, axis = 0)))
plt.ylabel('Frequency (%)')
plt.xlabel('Bin Number')
plt.show()
model = price_data_binned_model_w_dropout(timesteps, X_width, n_a, len(binner_settings)+1, batch_size, loss, 0.5)
model.summary()
training = model.fit(X_train, Y_train, epochs = 1000, batch_size = batch_size, validation_data=(X_dev, Y_dev))
generate_plots(training)

In [None]:
# Overlay the two series after scaling each to sum to one, so their shapes are comparable.
# .values replaces Series.as_matrix(), which no longer exists in current pandas.
plt.plot(normalize(search_history['Search History'].values))
plt.plot(normalize(X_daily['Open'].values))
#plt.ylabel('Loss')
plt.xlabel('Time')
plt.legend(['Google Searches', 'BTC Price'], loc='upper right')
plt.show()

## We find that BTC price and google searches follow each other very closely. Not great as a predictive input

In [None]:
## Define Model
#  Add more layers to the model. Add regularization in an attempt to reduce variance
def price_data_binned_model_w_dropout_and_regularization(timesteps, input_dim, n_a, number_of_bins, batch_size, loss, drop_rate):
    """Build and compile the three-LSTM classifier with dropout and L2 activity regularization.

    Layers, in order: BatchNormalization declaring a fixed
    (batch_size, timesteps, input_dim) input, then three LSTMs of ``n_a``
    units, each followed by BatchNormalization, and a Dense softmax layer
    with ``number_of_bins`` units. The first LSTM is stateful; the first two
    return full sequences. Every LSTM uses ``drop_rate`` as recurrent
    dropout, and every LSTM and the Dense layer carry an L2(0.01) activity
    regularizer. Compiled with ``loss``, the rmsprop optimizer and an
    accuracy metric.

    Returns the compiled Sequential model.
    """
    from keras.models import Sequential
    from keras.layers import Dense, BatchNormalization, LSTM
    from keras import regularizers

    layer_stack = [
        BatchNormalization(batch_input_shape = (batch_size, timesteps, input_dim)),
        LSTM(n_a, return_sequences = True, stateful = True, recurrent_dropout = drop_rate, activity_regularizer = regularizers.l2(0.01)),
        BatchNormalization(),
        LSTM(n_a, return_sequences = True, recurrent_dropout = drop_rate, activity_regularizer = regularizers.l2(0.01)),
        BatchNormalization(),
        LSTM(n_a, recurrent_dropout = drop_rate, activity_regularizer = regularizers.l2(0.01)),
        BatchNormalization(),
        Dense(number_of_bins, activation = 'softmax', activity_regularizer = regularizers.l2(0.01)),
    ]
    model = Sequential(layer_stack)
    model.compile(loss = loss, optimizer = 'rmsprop', metrics = ['accuracy'])

    return model

In [None]:
## Define Model
#  Add more layers to the model. Regularizatio without dropout. Similar performance. Still big gap between training and dev performance
def price_data_binned_model_w_regularization(timesteps, input_dim, n_a, number_of_bins, batch_size, loss):
    """Build and compile the three-LSTM classifier with L1 activity regularization and no dropout.

    Layers, in order: BatchNormalization declaring a fixed
    (batch_size, timesteps, input_dim) input, then three LSTMs of ``n_a``
    units, each followed by BatchNormalization, and a Dense softmax layer
    with ``number_of_bins`` units. The first LSTM is stateful; the first two
    return full sequences. Every LSTM and the Dense layer carry an L1(0.01)
    activity regularizer. Compiled with ``loss``, the rmsprop optimizer and
    an accuracy metric.

    Returns the compiled Sequential model.
    """
    from keras.models import Sequential
    from keras.layers import Dense, BatchNormalization, LSTM
    from keras import regularizers

    layer_stack = [
        BatchNormalization(batch_input_shape = (batch_size, timesteps, input_dim)),
        LSTM(n_a, return_sequences = True, stateful = True, activity_regularizer = regularizers.l1(0.01)),
        BatchNormalization(),
        LSTM(n_a, return_sequences = True, activity_regularizer = regularizers.l1(0.01)),
        BatchNormalization(),
        LSTM(n_a, activity_regularizer = regularizers.l1(0.01)),
        BatchNormalization(),
        Dense(number_of_bins, activation = 'softmax', activity_regularizer = regularizers.l1(0.01)),
    ]
    model = Sequential(layer_stack)
    model.compile(loss = loss, optimizer = 'rmsprop', metrics = ['accuracy'])

    return model

In [None]:
# To test the effect of feeding our NN with sentiment analysis of twitter data
# Will use vaderSentiment to look at the sentiment of bitcoin related tweets
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Smoke test: score one sentence and print the full score dict and its "compound" entry
analyzer = SentimentIntensityAnalyzer()
vs = analyzer.polarity_scores("this is a good example")
print(str(vs))
print(str(vs["compound"]))

# NOTE(review): this rebinds `directory`, which earlier cells use for the price/Google
# input folder; later cells that call import_data_file(directory, ...) will look here instead.
# NOTE(review): in a raw string every "\\" stays two backslash characters -- confirm the
# resulting path is what was intended. The path is also machine-specific.
directory = r"C:\\Users\\Dante\Documents\\Stanford\\Coursework\\Winter 2018\\CS230\\Project\\Sentiment\\"

In [None]:
# run vaderSentiment of selected tweets. For this example, we chose tweets in November 2017
total = []
for t in range(1,29):
    # One ';'-delimited file per day, named 2017-11-<day>.csv (day not zero-padded)
    tweets = pd.read_csv(directory + "2017-11-" + str(t) + ".csv", delimiter = ";")
    # astype returns a new Series, so the converted text must be kept; otherwise
    # non-string entries (e.g. NaN for an empty field) reach the analyzer unchanged
    tweet_texts = tweets['text'].astype('str')
    scores = [analyzer.polarity_scores(tweet)["compound"] for tweet in tweet_texts]
    # Build the per-day frame once, after all tweets of the day are scored
    df = pd.DataFrame(np.array(scores),columns = ["compound"])
    # NOTE(review): the daily total is divided by a fixed 20 -- presumably the expected
    # number of tweets per file; confirm, or divide by len(scores) for a true mean.
    total_score = df["compound"].sum()/20
    total.append(total_score)

total_sentiment = pd.DataFrame(np.array(total), columns = ["compound"])
print(total_sentiment)

# NOTE(review): absolute path on one author's machine
total_sentiment.to_csv("C:\\Users\\Dante\\Documents\\Stanford\\Coursework\\Winter 2018\\CS230\\Project\\total_sentiment.csv")

# df still holds the per-tweet compound scores of the last day processed
print(df)

In [None]:
# plot distribution of sentiment
plt.bar(range(1,total_sentiment['compound'].size+1),total_sentiment['compound'], color = 'g', linewidth = 2.0)
plt.ylabel('Sentiment',fontsize=12)
plt.xlabel('Day in November',fontsize=12)
# plt.show must be called; the bare attribute reference did nothing
plt.show()

In [None]:
# This cell runs the model with Twitter Sentiment Analysis for 28 days of Tweets
# Didn't improve large gap between train/dev set. Larger amount of tweets could improve this
## Inputs ##
timesteps = 2
n_a = 64
X_width = 6
batch_size = 6
binner_settings = [-0.025, -0.01, -0.003, 0.003, 0.01, 0.025]
loss = "categorical_crossentropy"
n_days = 28
n_features = X_width + 1 # the six price features plus the daily sentiment score

# NOTE(review): `directory` was rebound to the Sentiment folder in the setup cell above,
# while total_sentiment.csv was written to the parent Project folder -- confirm both paths.
BTC_Price = import_data_file(directory, bitcoin_data_filename)
Twitter_sentiment = pd.read_csv(directory + "/total_sentiment.csv")

X_daily = generate_daily_dataset(BTC_Price, 1093)
delta = X_daily["Close"] - X_daily["Open"]
percent_delta = np.divide(delta, X_daily["Open"])
Y_daily = generate_binned_labels_custom(percent_delta, binner_settings)

# .values replaces DataFrame.as_matrix(), which no longer exists in current pandas
X = X_daily.values
Y = Y_daily.values

# Get one month to line up with Twitter Data: pick the daily rows whose timestamp
# lies inside the window, and keep the labels of those same rows
in_month = np.flatnonzero((X[:, 0] >= 1509504315) & (X[:, 0] <= 1511923515))[:n_days]
if len(in_month) != n_days:
    raise ValueError("expected %d daily rows in the timestamp window, found %d" % (n_days, len(in_month)))

# Columns: Timestamp, the six price features, then the sentiment score
X_one_month = np.zeros((n_days, X_width + 2))
X_one_month[:, 0:X_width+1] = X[in_month, :]
X_one_month[:, X_width+1] = Twitter_sentiment["compound"].values
Y_one_month = Y[in_month, :]

# Pass n_features so that the sentiment column is part of every window
# (rearrange_training_data copies columns 1..n_features and skips the timestamp)
X = rearrange_training_data(X_one_month, timesteps, n_features)
X_train = X[0:18,:,:]
X_dev = X[19:25,:,:]

# Labels come from the same month as the inputs.
# NOTE(review): window d covers rows d..d+timesteps-1 and is paired with label row
# d+timesteps+1 -- confirm the intended prediction horizon.
Y_train = Y_one_month[1+timesteps:1+timesteps+18,:]
Y_dev = Y_one_month[19+1+timesteps:25+1+timesteps,:]

print(X_train)

# Share of all daily rows falling into each bin (a fraction, despite the axis label)
plt.bar(np.linspace(0, len(binner_settings), len(binner_settings)+1), np.sum(Y, axis = 0)/np.sum(np.sum(Y, axis = 0)))
plt.ylabel('Frequency (%)')
plt.xlabel('Bin Number')
plt.show()

model = price_data_binned_model(timesteps, n_features, n_a, len(binner_settings)+1, batch_size, loss)
model.summary()
training = model.fit(X_train, Y_train, epochs = 1000, batch_size = batch_size, validation_data=(X_dev, Y_dev))
generate_plots(training)