# Predicting Crytocurrency Prices with RNNs

#### Import Packages

In [1]:
from sklearn import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf
import pandas as pd
import numpy as np
from collections import deque
import random
import os
import time
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#### Let's look at one of the datasets. There are 3 more files for BTC, BCH and ETH.

In [2]:
df = pd.read_csv("crypto_data/LTC-USD.csv", names=['time', 'low', 'high', 'open', 'close', 'volume'])
df.head()

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,96.279999,96.540001,96.5,96.389999,524.539978


#### Isolating close and volume data and clean data

In [3]:
main_df = pd.DataFrame()
ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]

for ratio in ratios:
    dataset = f'crypto_data/{ratio}.csv'
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
    
    # Each dataset have its unique col name for close and volume
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)

    df.set_index("time", inplace=True) # so we can join on time
    df = df.loc[:, [f"{ratio}_close", f"{ratio}_volume"]] # only keep price and volumne cols

    if len(main_df)==0: # account for first dataframe
        main_df = df
    else:
        main_df = main_df.join(df)

main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)

#### This is how our cleaned data looks like

In [4]:
print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  
time                                                                      
1528968720     870.859985       26.856577      486.01001       26.019083  
1528968780     870.099976        1.124300      486.00000        8.449400  
1528968840     870.789978        1.749862      485.75000       26.994646  
1528968900     870.000000        1.680500      486.00000       77.355759  
1528968960     86

#### Whats our target?
We will use a sequence length of 60, amd a future prediction out of 3.
We will perform binary classification, which means if the price goes up in 3 minutes, then we buy and if it goes down in 3 minutes, then we do not buy/sell.

In [5]:
SEQ_LEN = 60  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "LTC-USD"

In [6]:
# Create a column for 3rd future column
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
# Apply a mapping function to create our target column
classify = lambda current, future: 1 if float(future) > float(current) else 0
main_df['target'] = main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))

#### This is what we have processed so far

In [7]:
print(main_df.head(5))

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  \
time                                                                       
1528968720     870.859985       26.856577      486.01001       26.019083   
1528968780     870.099976        1.124300      486.00000        8.449400   
1528968840     870.789978        1.749862      485.75000       26.994646   
1528968900     870.000000        1.680500      486.00000       77.355759   
1528968960 

#### Taking a closer look at LTC-USD_close, future, target

In [8]:
print(main_df.loc[:, ["LTC-USD_close", "future", "target"]].head(10))

            LTC-USD_close     future  target
time                                        
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1
1528968960      96.519997  96.400002       0
1528969020      96.440002  96.400002       0
1528969080      96.470001  96.400002       0
1528969140      96.400002  96.400002       0
1528969200      96.400002  96.400002       0
1528969260      96.400002  96.449997       1


#### Obtaining training and validation sets
For time-series prediction, especially stock prices, we need to be careful in how we build our test set.

We will use the last 5% as validation set, first 95% as training set

In [9]:
times = sorted(main_df.index.values)
last_5pct = times[-int(0.05*len(times))] # get time at 95 percentile
validation_main_df = main_df[(main_df.index >= last_5pct)]  # make the validation data contain time > 95 percentile
main_df = main_df[(main_df.index < last_5pct)]  # maindf contain time < 95 percentile

#### Normalizing our data

In [10]:
def preprocess_df(df):
    df = df.drop("future", 1)  # discard this, only needed for calculating target

    for col in df.columns:
        if col != "target":
            df[col] = df[col].pct_change() # interested in relative movement
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(df[col].values) # values normalized to between 0 and 1

    df.dropna(inplace=True)
    sequential_data = []  # list of sequences
    prev_days = deque(maxlen=SEQ_LEN)  # Each individual sequence, deque ensure len < SEQ_LEN

    for i in df.values:  # df.values convert to list of lists
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # deque is full and we have a sequence
            sequential_data.append([np.array(prev_days), i[-1]])

    random.shuffle(sequential_data)  # remove any semblance of bias
    
    # We need to ensure there is no class imbalance as that can skew our results
    buys = [[seq, target] for seq, target in sequential_data if target==0]
    sells = [[seq, target] for seq, target in sequential_data if target!=0]
    
    # Discard excess sequences if there are too many buy/sell sequences
    lower = min(len(buys), len(sells))
    buys = buys[:lower]
    sells = sells[:lower]

    sequential_data = buys+sells
    random.shuffle(sequential_data)  # so we don't get a chuck of buy sequences then a chuck of sell sequences
    
    return np.array([seq for seq, target in sequential_data]), [target for seq, target in sequential_data]

train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

#### Verify the counts

In [11]:
print(f"num train seqs: {len(train_x)}, num validation seqs: {len(validation_x)}")
print("\ntrain:")
print(f"dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print("\nvalidation:")
print(f"dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

num train seqs: 77922, num validation seqs: 3860

train:
dont buys: 38961, buys: 38961

validation:
dont buys: 1930, buys: 1930


#### Lets build the model

In [12]:
EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}" # model name

In [13]:
model = Sequential()
model.add(CuDNNLSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())  #normalizes activation outputs, same reason you want to normalize your input data.

model.add(CuDNNLSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

W0826 21:03:18.205071  3860 deprecation.py:506] From C:\Users\AJL\Anaconda3\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [14]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

In [15]:
filepath = "RNN_Final"
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1,
                                                      save_best_only=True, mode='max'))
# Train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[checkpoint],
)

Train on 77922 samples, validate on 3860 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Valid loss:', score[0])
print('Valid accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))

Test loss: 0.6767151699164988
Test accuracy: 0.5676166
