In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from collections import deque
import random
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

### Test read some data

In [None]:
df = pd.read_csv("crypto_data/LTC-USD.csv", names=['time', 'low', 'high', 'open', 'close', 'volume'])

print(df.head())

### define some global vars

In [3]:
SEQ_LEN = 60 #num mintutes of input sequence
FUTURE_PERIOD_PREDICT = 3
RATIO_TO_PREDICT = "LTC-USD"
EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{SEQ_LEN}--SEQ--{FUTURE_PERIOD_PREDICT}--PRED--{int(time.time())}"

In [4]:
def classify(current,future):
    ####
    # create class labels
    ####
    if float(future) > float(current):
        #buy
        return 1
    else:
        #don't buy/sell
        return 0
    
def preprocess_df(df):
    #####
    # normalize data
    #####
    
    df = df.drop('future',1)
    for col in df.columns:
        if col != 'target':
            df[col] = df[col].pct_change() #convert prices to pct change from previous value
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(df[col].values) #normalize
            
    df.dropna(inplace=True)
    
    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)

    for row in df.values:
        targ = row[-1]
        feats = row[:-1]
        
        #-1 omits target col
        prev_days.append([col for col in feats])
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), targ])
            
    random.shuffle(sequential_data)
    
    buys = []
    sells = []
    
    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq,target])
        elif target == 1:
            buys.append([seq,target])
            
    #shuffle orderings
    random.shuffle(buys)
    random.shuffle(sells)
    
    lower = min(len(buys), len(sells))
    
    #balance training data with equal class populations
    buys=buys[:lower]
    sells = sells[:lower]
    
    sequential_data = buys+sells
    
    #shuffle so it's not [buy1,buy2,...,buyN,sells1,sells2,sells3,..sellsN]
    random.shuffle(sequential_data)
    
    X = []
    y = []
    
    for seq,target in sequential_data:
        X.append(seq)
        y.append(target)
        
    return np.array(X), y


In [5]:
main_df = pd.DataFrame() # begin empty

ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 files we want to consider
for ratio in ratios:  # begin iteration
    print(ratio)
    dataset = f'crypto_data/{ratio}.csv'  # get the full path to the file.
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])  # read in specific file

    # rename volume and close to include the ticker so we can still which close/volume is which:
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)

    df.set_index("time", inplace=True)  # set time as index so we can join them on this shared time
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore the other columns besides price and volume

    if len(main_df)==0:  # if the dataframe is empty
        main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        main_df = main_df.join(df)

main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)
print(main_df.head())  # how did we do??

BTC-USD
LTC-USD
BCH-USD
ETH-USD
            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  
time                                                                      
1528968720     870.859985       26.856577      486.01001       26.019083  
1528968780     870.099976        1.124300      486.00000        8.449400  
1528968840     870.789978        1.749862      485.75000       26.994646  
1528968900     870.000000        1.680500      486.00000    

In [6]:
#set future targets
main_df['future'] = main_df[f"{RATIO_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)

#sanity check to be sure our 'future' column (ML target), is indeed FUTURE_PERIOD_PREDICT time steps ahead in the sequence
print(main_df[[f"{RATIO_TO_PREDICT}_close","future"]].head())

#Use classify function to set explicit target classification column. 
#Target = 1 when we should buy ie future value FUTURE_PERIOD_PREDICT steps ahead is higher than current value, target = 0 when same or lower
main_df['target'] = list(map(classify, main_df[f"{RATIO_TO_PREDICT}_close"],main_df['future']))

print(main_df[[f"{RATIO_TO_PREDICT}_close","future","target"]].head(10))


            LTC-USD_close     future
time                                
1528968720      96.660004  96.389999
1528968780      96.570000  96.519997
1528968840      96.500000  96.440002
1528968900      96.389999  96.470001
1528968960      96.519997  96.400002
            LTC-USD_close     future  target
time                                        
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1
1528968960      96.519997  96.400002       0
1528969020      96.440002  96.400002       0
1528969080      96.470001  96.400002       0
1528969140      96.400002  96.400002       0
1528969200      96.400002  96.400002       0
1528969260      96.400002  96.449997       1


### Split train/test 

In [7]:
times = sorted(main_df.index.values)
last_5pct = times[-int(0.05*len(times))]

#split 
validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]

In [8]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"train data feat size: {len(train_x)} validation: {len(validation_x)}")
print(f"[training]:: Don't buys: {train_y.count(0)} buys: {train_y.count(1)}")
print(f"[testing]:: Don't buys: {validation_y.count(0)} buys: {validation_y.count(1)}")

train data feat size: 77922 validation: 3860
[training]:: Don't buys: 38961 buys: 38961
[testing]:: Don't buys: 1930 buys: 1930


In [13]:
print(type(train_x))
print(type(np.asarray(train_y)))
print(type(validation_x))
print(type(validation_y))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'list'>


## Build Model 

In [9]:
model = Sequential()
model.add(LSTM(128,input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32,activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2,activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.001,decay=1e-6)

model.compile(loss='sparse_categorical_crossentropy',
             optimizer=opt,
             metrics=['accuracy'])

In [15]:
tensorboard = TensorBoard(log_dir=f'logs/{NAME}')
filepath = "RNN_Final-{epoch:02d}-{val_accuracy:.3f}"
checkpoint = ModelCheckpoint("models/{}.model".format(filepath,monitor='val_accuracy',verbose=1,save_best_only=True,mode='max'))

history = model.fit(
    train_x, np.asarray(train_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, np.asarray(validation_y)),
    callbacks=[tensorboard,checkpoint],
)    

Epoch 1/10
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/RNN_Final-01-0.569.model/assets
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Score model
score = model.evaluate(validation_x, np.asarray(validation_y), verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))

Test loss: 0.6761615872383118
Test accuracy: 0.564507782459259
INFO:tensorflow:Assets written to: models/60--SEQ--3--PRED--1606761152/assets


### run tensorboard

In [17]:
!tensorboard --logdir logs

W1130 13:35:08.832581 123145366757376 plugin_event_accumulator.py:322] Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.4.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C
