In [12]:
import sys
import os
import pandas as pd
import requests
import json
import pickle
import warnings
import numpy as np
import tensorflow as tf
import seaborn as sns
import pandas as pd
from tqdm import tqdm
import dateutil.parser
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from plotly.offline import init_notebook_mode, iplot
from pandas.plotting import register_matplotlib_converters
# Register pandas' date converters with matplotlib.
register_matplotlib_converters()

# Silence every warning unless the interpreter was started with explicit -W options.
if not sys.warnoptions:
    warnings.simplefilter('ignore')
# Apply seaborn's default plot theme.
sns.set()
# Graph-level TensorFlow seed.
# NOTE(review): forecast() calls tf.compat.v1.reset_default_graph() before building
# each model, so this seed may not apply to the graphs that are actually trained — confirm.
tf.compat.v1.random.set_random_seed(1234)
# Enable plotly's offline rendering inside the notebook.
init_notebook_mode(connected=True)

## Cryptocurrency Price Prediction 

I use a long short-term memory (LSTM) neural network to forecast cryptocurrency prices. The price history is obtained from the CoinGecko API. The results are saved and analyzed. The same model could be used to forecast any time-series data (e.g. stock prices, weather data, etc.).

In [3]:
# Get list of coins from api.coingecko.com
url = "https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd"
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP errors (e.g. rate limiting) here instead of
# as a confusing TypeError when the error payload is indexed below.
response = requests.get(url, timeout=30)
response.raise_for_status()
coins = response.json()

# Ids of the first 25 coins returned by the endpoint.
id_array = [x['id'] for x in coins][0:25]

# All-time-low date of every returned coin, as a naive datetime truncated to
# midnight (time of day and timezone information are dropped).
atl_array = []
for x in coins:
    d = dateutil.parser.parse(x['atl_date'])
    atl_array.append(datetime(d.year, d.month, d.day))

In [9]:
# Set model parameters
simulation_size = 3  # number of times forecast() is run per coin
num_layers = 1       # number of LSTM cells stacked in the MultiRNNCell
size_layer = 128     # size passed to each LSTMCell
timestamp = 5        # number of consecutive rows fed to the network per sess.run call
epoch = 300          # number of passes over df_train in the training loop
# Passed to Model as `forget_bias`, which is used as DropoutWrapper's output_keep_prob:
# despite the name, this is the probability of KEEPING an output.
dropout_rate = 0.8
test_size = 30 # Determines how many days ahead to forecast
learning_rate = 0.01  # learning rate handed to the Adam optimizer

class Model:
    """Stacked-LSTM regression graph built with TensorFlow 1.x style APIs.

    The constructor creates placeholders, the recurrent network, a dense output
    layer, a mean-squared-error cost and an Adam training op, and exposes them
    as attributes (X, Y, hidden_layer, outputs, last_state, logits, cost,
    optimizer) for use with ``sess.run``.

    NOTE(review): tf.placeholder, tf.contrib and tf.layers are TensorFlow 1.x
    APIs — confirm the TensorFlow version this notebook is meant to run on.
    """

    def __init__(
        self,
        learning_rate,
        num_layers,
        size,
        size_layer,
        output_size,
        forget_bias = 0.1,
    ):
        """Build the graph in the current default graph.

        Args:
            learning_rate: learning rate given to the Adam optimizer.
            num_layers: number of LSTM cells stacked in the MultiRNNCell.
            size: size of the last axis of the input placeholder ``X``.
            size_layer: size passed to every LSTMCell.
            output_size: width of the dense output layer and of ``Y``.
            forget_bias: despite its name, this value is used as the
                DropoutWrapper's ``output_keep_prob``.
        """
        def lstm_cell(size_layer):
            return tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple = False)
        rnn_cells = tf.contrib.rnn.MultiRNNCell(
            [lstm_cell(size_layer) for _ in range(num_layers)],
            state_is_tuple = False,
        )
        # Input sequence and regression target placeholders.
        self.X = tf.placeholder(tf.float32, (None, None, size))
        self.Y = tf.placeholder(tf.float32, (None, output_size))
        # NOTE(review): the keep probability is a fixed Python value, so the same
        # dropout appears to be active whenever the graph is run, including the
        # prediction passes in forecast() — confirm this is intended.
        drop = tf.contrib.rnn.DropoutWrapper(
            rnn_cells, output_keep_prob = forget_bias
        )
        # Recurrent state fed in by the caller so it can be carried from one
        # sess.run call to the next (presumably the factor 2 covers the LSTM
        # cell and hidden state stored side by side — TODO confirm).
        self.hidden_layer = tf.placeholder(
            tf.float32, (None, num_layers * 2 * size_layer)
        )
        self.outputs, self.last_state = tf.nn.dynamic_rnn(
            drop, self.X, initial_state = self.hidden_layer, dtype = tf.float32
        )
        # outputs[-1] selects the last entry along the FIRST axis of `outputs`.
        # Callers in this file feed a single sequence (np.expand_dims(..., axis = 0)),
        # so this yields one output row per time step of that sequence.
        self.logits = tf.layers.dense(self.outputs[-1], output_size)
        # Mean squared error between targets and predictions.
        self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost
        )
        
def calculate_accuracy(real, predict):
    """Return ``100 * (1 - RMS relative error)`` between two sequences.

    Both inputs are shifted by +1 before the relative error is computed,
    and the error is taken relative to the shifted ``real`` values.
    """
    shifted_real = np.array(real) + 1
    shifted_predict = np.array(predict) + 1
    relative_error = (shifted_real - shifted_predict) / shifted_real
    rms_error = np.sqrt(np.mean(np.square(relative_error)))
    return (1 - rms_error) * 100

def anchor(signal, weight):
    """Exponentially smooth ``signal``.

    Each output value is ``previous_output * weight + (1 - weight) * value``,
    where the "previous output" for the first element is the element itself.

    Args:
        signal: indexable sequence of numbers (list or 1-D array).
        weight: share of the previous smoothed value carried forward.

    Returns:
        A list with one smoothed value per input element; an empty list when
        ``signal`` is empty.
    """
    # len() rather than truthiness so numpy arrays are handled as well.
    if len(signal) == 0:
        return []

    smoothed = []
    previous = signal[0]
    for value in signal:
        previous = previous * weight + (1 - weight) * value
        smoothed.append(previous)
    return smoothed

def forecast():
    """Train a fresh Model on ``df_train`` and return a smoothed price curve.

    The function takes no arguments and reads module-level state instead:
    ``learning_rate``, ``num_layers``, ``size_layer``, ``dropout_rate``,
    ``epoch``, ``timestamp``, ``test_size``, ``df_log``, ``df_train`` and the
    fitted scaler ``minmax``.

    Steps:
      1. Reset the default graph, build a Model and train it for ``epoch``
         passes, feeding ``timestamp`` rows at a time and carrying the
         recurrent state from one chunk to the next.
      2. Run the trained network over ``df_train`` to fill one-step-ahead
         predictions for the known range.
      3. Feed the network its own last ``timestamp`` predictions repeatedly to
         fill the remaining future rows.
      4. Undo the scaling and smooth the first column with ``anchor``.

    Returns:
        list: ``df_train.shape[0] + test_size`` smoothed values.
    """
    tf.compat.v1.reset_default_graph()
    modelnn = Model(
        learning_rate, num_layers, df_log.shape[1], size_layer, df_log.shape[1], dropout_rate
    )
    sess = tf.compat.v1.InteractiveSession()
    # The session is closed in `finally` so repeated calls (one per simulation
    # per coin) do not accumulate open sessions, even if training fails.
    try:
        sess.run(tf.compat.v1.global_variables_initializer())

        # ---- training ----
        pbar = tqdm(range(epoch), desc = 'train loop')
        for _ in pbar:
            # Every pass starts from a zero recurrent state.
            init_value = np.zeros((1, num_layers * 2 * size_layer))
            total_loss, total_acc = [], []
            for k in range(0, df_train.shape[0] - 1, timestamp):
                index = min(k + timestamp, df_train.shape[0] - 1)
                batch_x = np.expand_dims(
                    df_train.iloc[k : index, :].values, axis = 0
                )
                # Targets are the inputs shifted one row ahead.
                batch_y = df_train.iloc[k + 1 : index + 1, :].values
                logits, last_state, _, loss = sess.run(
                    [modelnn.logits, modelnn.last_state, modelnn.optimizer, modelnn.cost],
                    feed_dict = {
                        modelnn.X: batch_x,
                        modelnn.Y: batch_y,
                        modelnn.hidden_layer: init_value,
                    },
                )
                init_value = last_state
                total_loss.append(loss)
                total_acc.append(calculate_accuracy(batch_y[:, 0], logits[:, 0]))
            pbar.set_postfix(cost = np.mean(total_loss), acc = np.mean(total_acc))

        # ---- predictions over the known range ----
        future_day = test_size

        output_predict = np.zeros((df_train.shape[0] + future_day, df_train.shape[1]))
        output_predict[0] = df_train.iloc[0]
        # Largest multiple of `timestamp` that fits in the training data.
        upper_b = (df_train.shape[0] // timestamp) * timestamp
        init_value = np.zeros((1, num_layers * 2 * size_layer))

        for k in range(0, upper_b, timestamp):
            out_logits, last_state = sess.run(
                [modelnn.logits, modelnn.last_state],
                feed_dict = {
                    modelnn.X: np.expand_dims(
                        df_train.iloc[k : k + timestamp], axis = 0
                    ),
                    modelnn.hidden_layer: init_value,
                },
            )
            init_value = last_state
            output_predict[k + 1 : k + timestamp + 1] = out_logits

        # Left-over rows that did not fill a complete chunk.
        if upper_b != df_train.shape[0]:
            out_logits, last_state = sess.run(
                [modelnn.logits, modelnn.last_state],
                feed_dict = {
                    modelnn.X: np.expand_dims(df_train.iloc[upper_b:], axis = 0),
                    modelnn.hidden_layer: init_value,
                },
            )
            output_predict[upper_b + 1 : df_train.shape[0] + 1] = out_logits
            # This pass already produced the first future row.
            future_day -= 1

        init_value = last_state

        # ---- roll forward on the model's own predictions ----
        for i in range(future_day):
            o = output_predict[-future_day - timestamp + i:-future_day + i]
            out_logits, last_state = sess.run(
                [modelnn.logits, modelnn.last_state],
                feed_dict = {
                    modelnn.X: np.expand_dims(o, axis = 0),
                    modelnn.hidden_layer: init_value,
                },
            )
            init_value = last_state
            output_predict[-future_day + i] = out_logits[-1]
    finally:
        sess.close()

    # Back to the original price scale, then smooth the first column.
    output_predict = minmax.inverse_transform(output_predict)
    deep_future = anchor(output_predict[:, 0], 0.4)

    return deep_future

# Returns sell date to maximize profits
def get_sell_date(forecast):
    """Return the index of the first occurrence of the maximum of ``forecast``.

    ``forecast`` is expected to be a numpy array; the result is a 0-based
    position along its first axis.
    """
    peak = forecast.max()
    peak_positions = np.nonzero(forecast == peak)[0]
    return peak_positions[0]

# Returns percent max profit
def get_max_profit(current_price, forecast):
    """Return the best-case percentage gain over ``current_price``.

    The gain is measured at the maximum of ``forecast`` and rounded to two
    decimals. The value is returned wrapped in a 1-tuple, so callers read it
    with ``[0]``.
    """
    peak = forecast.max()
    gain_ratio = (peak - current_price) / current_price
    percent_gain = round(100 * gain_ratio, 2)
    return (percent_gain,)

# Returns percent profit per day
def get_max_per_day(mx_profit, forecast):
    """Return ``mx_profit`` spread over the days until the forecast peak.

    Args:
        mx_profit: percentage profit at the forecast maximum.
        forecast: numpy array of forecast prices, passed to ``get_sell_date``.

    Returns:
        ``mx_profit`` divided by the peak's 0-based index, rounded to two
        decimals. A peak at index 0 is treated as one day so the result stays
        finite.
    """
    # get_sell_date() returns a 0-based index; without the floor of 1 a peak
    # on the very first forecast step would divide by zero.
    holding_days = max(get_sell_date(forecast), 1)
    return round((mx_profit / holding_days), 2)
    
# Open data computed in 'data_collection.ipynb' and returns array of data objects for each coin
def open_data(pathname, sort_key = 'daily_profit'):
    """Load every pickled result object in ``pathname``.

    Args:
        pathname: directory whose entries are all pickle files.
        sort_key: key of the loaded objects used to order the result.

    Returns:
        list: the unpickled objects sorted ascending by ``obj[sort_key]``.
    """
    arr = []
    for file in os.listdir(pathname):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # result files produced by this project.
        with open(os.path.join(pathname, file), "rb") as handle:
            arr.append(pickle.load(handle))
    return sorted(arr, key=lambda k: k[sort_key])

# Return a list of all coin names (top 25 coins)
def coin_list():
    """Return the ids of the first 25 coins listed by the CoinGecko markets endpoint.

    Raises:
        requests.exceptions.RequestException: on a network failure, a timeout
            or a non-success HTTP status.
    """
    url = "https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd"
    # Without a timeout a stalled connection blocks forever; raise_for_status()
    # reports HTTP errors directly instead of failing later on the error payload.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return [x['id'] for x in response.json()][0:25]

# Get up to date timeseries data for a specified coin
def get_present_data(coin, max_date = 365*3):
    """Download the USD price history of ``coin`` from CoinGecko.

    Args:
        coin: coin id inserted into the request URL.
        max_date: value of the ``days`` query parameter.

    Returns:
        dict: first element of every entry in the response's ``prices`` list
        mapped to its second element (elsewhere in this file the keys are
        treated as millisecond timestamps).

    Raises:
        requests.exceptions.RequestException: on a network failure, a timeout
            or a non-success HTTP status.
    """
    url = f'https://api.coingecko.com/api/v3/coins/{coin}/market_chart?vs_currency=usd&days={max_date}'
    # Bound the wait and surface HTTP errors instead of a KeyError on 'prices'.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    prices = response.json()['prices']
    return {price[0]: price[1] for price in prices}

# Get computed coin data for a specific coin
def get_coin_data(arr, coin = 'bitcoin'):
    """Return the first object in ``arr`` whose ``"name"`` equals ``coin``.

    Args:
        arr: iterable of result objects (dicts with a ``"name"`` key).
        coin: coin id to look up.

    Raises:
        IndexError: if no object in ``arr`` matches ``coin``.
    """
    # Stop at the first match instead of filtering the whole list.
    for entry in arr:
        if entry["name"] == coin:
            return entry
    # IndexError is kept for backward compatibility; the message names the coin.
    raise IndexError(f"no computed data found for coin {coin!r}")
    
# Plots top 25 coins on XY plot where X is the estimated profit and Y the accuracy    
def plot_best_data(arr, attribute = 'max_profit'):
    """Build a scatter figure with one marker per result object.

    Each object in ``arr`` contributes a single point at
    ``(obj[attribute], obj['accuracy'])`` labelled with ``obj['name']``.
    The figure is returned, not shown.
    """
    label_font = dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f",
    )

    fig = go.Figure()
    for coin in arr:
        point = go.Scatter(
            x=[coin[attribute]],
            y=[coin['accuracy']],
            mode='markers',
            opacity=0.85,
            name=coin['name'],
            marker=dict(size=[20]),
        )
        fig.add_trace(point)

    fig.update_layout(
        autosize=False,
        width=800,
        height=650,
        title='Top 25 Coins Profit Margin',
        xaxis_title="Profit (%)",
        yaxis_title="Accuracy (%)",
        font=label_font,
    )

    return fig
    
# Plots up-to-date coin price timeseries, as well as predicted price
# present = up to date price
# true = real price at date of simulation
# simulation = computed price from neural network
# prediction = predicted price in next N days, where N is the test_size
def pretty_plot(data, coin, days_before = 120):
    """Show stored results for ``coin`` next to freshly downloaded prices.

    Args:
        data: list of result objects, searched with ``get_coin_data``.
        coin: coin id to plot; also used to download current prices.
        days_before: number of trailing days of history to include.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    record = get_coin_data(data, coin)
    meta = record['meta_data']

    # Days elapsed between the last stored sample and today.
    day_diff = (datetime.today().date() - record['X_true'][-1].date()).days
    prediction_days = meta['test_size']
    created_day = meta['created_at'][0:10]
    simulation_date = datetime.strptime(created_day, '%Y-%m-%d')

    # Series to draw, each trimmed to the requested trailing window.
    Y_real = record['Y_true'][-days_before:]
    Y_pred = record['Y_pred'][-prediction_days - days_before:]
    present_prices = get_present_data(coin, max_date = 365*3 + day_diff -1)
    Y_new = list(present_prices.values())[-day_diff-days_before:]

    T_real = np.arange(len(Y_real))
    T_pred = np.arange(len(Y_pred))
    T_new = np.arange(len(Y_new))

    # Date fragments shared by the legend labels.
    # NOTE(review): the window start is derived from today's date for every
    # trace, including the ones that end at the simulation date — confirm
    # that this is the intended label.
    now = datetime.today()
    window_start = (now - timedelta(days=min(days_before, 365*3))).strftime("%Y/%m/%d")
    forecast_end = (simulation_date + timedelta(days=prediction_days)).strftime("%Y/%m/%d)")

    def line_trace(x, y, color, label, **extra):
        # One line trace with a fixed colour and legend label.
        return go.Scatter(x=x, y=y, mode='lines', line=dict(color=color), name=label, **extra)

    fig = go.Figure()

    fig.add_trace(line_trace(
        T_new, Y_new, "red",
        'present    (' + window_start + '-' + now.strftime("%Y/%m/%d)"),
    ))

    fig.add_trace(line_trace(
        T_real, Y_real, "blue",
        'true       (' + window_start + '-' + created_day.replace('-', '/') + ')',
    ))

    # The full simulated curve is hidden until enabled from the legend.
    fig.add_trace(line_trace(
        T_pred, Y_pred, "green",
        'simulation (' + window_start + '-' + forecast_end,
        visible='legendonly',
    ))

    fig.add_trace(line_trace(
        T_pred[-prediction_days:], Y_pred[-prediction_days:], "black",
        'prediction (' + simulation_date.strftime("%Y/%m/%d") + '-' + forecast_end,
    ))

    label_font = dict(family="Courier New, monospace", size=18, color="#7f7f7f")
    legend_box = dict(yanchor="top", y=-0.25, xanchor="left", x=0.01)
    fig.update_layout(
        autosize=False,
        width=800,
        height=550,
        title=record['name'],
        xaxis_title="Time Step (Days)",
        yaxis_title="Value (US Dollars)",
        font=label_font,
        legend=legend_box,
    )

    fig.show()

### Model Training

In [None]:
data_directory = os.listdir('./data')
today_date = datetime.today().strftime('%m-%d-%y')

# Remove empty result folders left behind by earlier runs.
for directory in data_directory:
    candidate = './data/' + directory
    if os.path.isdir(candidate) and not os.listdir(candidate):
        os.rmdir(candidate)

data_directory = os.listdir('./data')
path = './data/' + today_date

# Results already stored for today are loaded so their coins can be skipped.
computed = []
if today_date in data_directory:
    for file in os.listdir(path):
        # NOTE: pickle.load can execute arbitrary code; only load files
        # produced by this project.
        with open(os.path.join(path, file), "rb") as f:
            computed.append(pickle.load(f))
else:
    try:
        os.mkdir(path)
    except OSError:
        print ("Creation of the directory %s failed" % path)
    else:
        print ("Successfully created the directory %s " % path)

skip = [x['name'] for x in computed]

# NOTE(review): finished coins are looked up under ./data/<today> above, while
# results are written to ./best below — confirm which directory is intended.
# The output directory is created up front so a finished simulation cannot be
# lost to a missing folder.
os.makedirs('./best', exist_ok=True)

for idx, item_id in enumerate(id_array):
    if item_id in skip:
        continue

    # Use at most three years of history, and never reach back before the
    # coin's all-time-low date.
    max_date = ((datetime.today() - max(datetime.today() - timedelta(days= 365 * 3), atl_array[idx])).days)
    daily = get_present_data(item_id, max_date)

    # X: sample datetimes (keys are millisecond timestamps), Y: prices,
    # T: 0-based time step used as the frame index.
    X = [datetime.fromtimestamp(int(key) / 1000.0) for key in daily]
    Y = [float(value) for value in daily.values()]
    T = [float(step) for step in range(len(daily))]

    df = pd.DataFrame(dict(price=Y), index=T, columns=['price'])
    Y_true = df["price"].values
    current_price = Y_true[-1]
    print("Item: " + item_id + '(' + str(idx) + ')')
    print("Price: " + str(current_price))
    X_true = np.arange(len(Y_true))

    # forecast() reads minmax, df_log and df_train from module scope.
    minmax = MinMaxScaler().fit(df.iloc[:, 0:1].astype('float32')) # Close index
    df_log = minmax.transform(df.iloc[:, 0:1].astype('float32')) # Close index
    df_log = pd.DataFrame(df_log)

    df_train = df_log

    results = []
    for i in range(simulation_size):
        print('simulation %d'%(i + 1))
        results.append(forecast())

    # Filter out outliers if model has unstable gradient: reject a run whose
    # forecast tail drops below the historical minimum or exceeds twice the
    # historical maximum.
    price_floor = np.min(df['price'])
    price_ceiling = np.max(df['price']) * 2
    accepted_results = []
    for r in results:
        tail = np.array(r[-test_size:])
        if not (tail < price_floor).any() and not (tail > price_ceiling).any():
            accepted_results.append(r)

    if accepted_results:
        accuracies = [calculate_accuracy(df['price'].values, r[:-test_size]) for r in accepted_results]

        # Element-wise average of the accepted runs.
        Y_pred = np.mean(np.array(accepted_results), axis=0)
        X_pred = np.arange(len(Y_pred))

        # Forecast values past the end of the known prices (last value excluded).
        future = Y_pred[len(Y_true):-1]

        mx_profit = get_max_profit(current_price, future)[0]
        daily_profit = get_max_per_day(mx_profit, future)

        # Create object with prediction data and save it
        obj = ({"name": item_id,
                "max_profit": mx_profit,
                "daily_profit": daily_profit,
                "sell_date": get_sell_date(future),
                "accuracy": round(sum(accuracies) / len(accuracies), 2),
                "X_true": X,
                "Y_true": Y_true,
                "Y_pred": Y_pred,
                "Y_future": future,
                "meta_data": {'simulation_size': simulation_size,
                              'num_layers': num_layers,
                              'size_layer': size_layer,
                              'timestamp': timestamp,
                              'num_epoch': epoch,
                              'dropout_rate': dropout_rate,
                              'test_size': test_size,
                              'learning_rate': learning_rate,
                              'created_at': datetime.today().strftime('%Y-%m-%d-%H:%M:%S')
                             }
                })
        print("Simulation Result: ")
        print(obj['accuracy'])
        print(obj['max_profit'])
        with open(f'./best/{item_id}.pkl', 'wb') as f:
            pickle.dump(obj, f)

### Data Visualisation

In [13]:
# Load the results stored under ./data/08-20-20, ordered by coin name.
data = open_data('./data/08-20-20', sort_key = 'name')
# plot_best_data returns the figure; as the cell's last expression it is displayed.
plot_best_data(data)

In [14]:
# Compare the stored monero results with current prices over the last 180 days.
pretty_plot(data, 'monero', days_before = 180 )