### Import libraries

In [3]:
import os
import numpy as np 
import pandas as pd 
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, LSTM
from sklearn.metrics import mean_absolute_error
from tensorflow.keras import layers

from datetime import datetime
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

### Import data

In [4]:
crypto_df = pd.read_csv("../input/g-research-crypto-forecasting/train.csv") 

In [5]:
crypto_df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [6]:
asset_details = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')

In [7]:
asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


In [8]:
# Select Asset_ID = 6 for Ethereum
crypto_df = crypto_df[crypto_df["Asset_ID"]==6] 
crypto_df.info(show_counts =True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956200 entries, 5 to 24236799
Data columns (total 10 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   timestamp  1956200 non-null  int64  
 1   Asset_ID   1956200 non-null  int64  
 2   Count      1956200 non-null  float64
 3   Open       1956200 non-null  float64
 4   High       1956200 non-null  float64
 5   Low        1956200 non-null  float64
 6   Close      1956200 non-null  float64
 7   Volume     1956200 non-null  float64
 8   VWAP       1956200 non-null  float64
 9   Target     1955860 non-null  float64
dtypes: float64(8), int64(2)
memory usage: 164.2 MB


### Preprocess data

In [9]:
df = crypto_df.copy()

In [10]:
# fill missing values 
df = df.reindex(range(df.index[0],df.index[-1]+60,60),method='pad')
df = df.fillna(0)

In [11]:
# rename column timestamp to Date 
df.rename({'timestamp': 'Date'}, axis=1, inplace=True)

# rename Close to Price
df.rename(columns={'Close': 'Price'}, inplace=True)

In [12]:
# timestamp conversion
df.Date = df.Date.apply(lambda d: datetime.fromtimestamp(int(d)).strftime('%Y-%m-%d'))

In [13]:
# set index
df.set_index('Date', inplace=True)

In [14]:
# Convert to date array
timesteps = df.index.to_numpy()
prices = df['Price'].to_numpy()

timesteps[:10], prices[:10]

(array(['2018-01-01', '2018-01-01', '2018-01-01', '2018-01-01',
        '2018-01-01', '2018-01-01', '2018-01-01', '2018-01-01',
        '2018-01-01', '2018-01-01'], dtype=object),
 array([738.5075, 735.09  , 734.8025, 731.82  , 732.9325, 732.3425,
        731.8225, 732.1325, 732.605 , 729.415 ]))

### Functions

In [15]:
# Create a function to implement a ModelCheckpoint callback with a specific filename 
def create_model_checkpoint(model_name, save_path="model_experiments"):
  return tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(save_path, model_name), 
                                            verbose=0, 
                                            save_best_only=True) 

In [16]:
# Create a function which uses a list of trained models to make and return a list of predictions
def make_ensemble_preds(ensemble_models, data):
  ensemble_preds = []
  for model in ensemble_models:
    preds = model.predict(data) # make predictions with current ensemble model
    ensemble_preds.append(preds)
  return tf.constant(tf.squeeze(ensemble_preds))

In [17]:
# Function for evaluation
def evaluate_preds(y_true, y_pred):
  # Make sure float32 (for metric calculations)
  y_true = tf.cast(y_true, dtype=tf.float32)
  y_pred = tf.cast(y_pred, dtype=tf.float32)

  # Calculate metrics
  mae = tf.keras.metrics.mean_absolute_error(y_true, y_pred)
  mse = tf.keras.metrics.mean_squared_error(y_true, y_pred) 
  rmse = tf.sqrt(mse)
  mape = tf.keras.metrics.mean_absolute_percentage_error(y_true, y_pred)
  
  return {"mae": mae.numpy(),
          "mse": mse.numpy(),
          "rmse": rmse.numpy(),
          "mape": mape.numpy()}

### Modeling: Ensemble Algorithm

In [18]:
HORIZON = 1
WINDOW_SIZE = 7 

In [19]:
# Add windowed columns
ethereum_prices = df.copy()

for i in range(WINDOW_SIZE):
  ethereum_prices[f"Price+{i+1}"] = ethereum_prices["Price"].shift(periods=i+1)
ethereum_prices.drop(['Asset_ID', 'Count', 'Open', 'High', 'Low', 'Volume', 'VWAP', 'Target'], axis=1, inplace=True)
ethereum_prices.dropna().head()

Unnamed: 0_level_0,Price,Price+1,Price+2,Price+3,Price+4,Price+5,Price+6,Price+7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01,732.1325,731.8225,732.3425,732.9325,731.82,734.8025,735.09,738.5075
2018-01-01,732.605,732.1325,731.8225,732.3425,732.9325,731.82,734.8025,735.09
2018-01-01,729.415,732.605,732.1325,731.8225,732.3425,732.9325,731.82,734.8025
2018-01-01,731.32,729.415,732.605,732.1325,731.8225,732.3425,732.9325,731.82
2018-01-01,733.5625,731.32,729.415,732.605,732.1325,731.8225,732.3425,732.9325


In [20]:
# Create features and labels
X = ethereum_prices.dropna().drop("Price", axis=1)
y = ethereum_prices.dropna()["Price"]

# Make train and test sets
split_size = int(len(X) * 0.8)
X_train, y_train = X[:split_size], y[:split_size]
X_test, y_test = X[split_size:], y[split_size:]
len(X_train), len(y_train), len(X_test), len(y_test)

(323152, 323152, 80789, 80789)

In [22]:
# Turn train and test arrays into tensor Datasets
train_features_dataset = tf.data.Dataset.from_tensor_slices(X_train)
train_labels_dataset = tf.data.Dataset.from_tensor_slices(y_train)

test_features_dataset = tf.data.Dataset.from_tensor_slices(X_test)
test_labels_dataset = tf.data.Dataset.from_tensor_slices(y_test)

# Combine features & labels
train_dataset = tf.data.Dataset.zip((train_features_dataset, train_labels_dataset))
test_dataset = tf.data.Dataset.zip((test_features_dataset, test_labels_dataset))

# Batch and prefetch for optimal performance
BATCH_SIZE = 1024 
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

train_dataset, test_dataset

(<PrefetchDataset shapes: ((None, 7), (None,)), types: (tf.float64, tf.float64)>,
 <PrefetchDataset shapes: ((None, 7), (None,)), types: (tf.float64, tf.float64)>)

In [23]:
def get_ensemble_models(horizon=HORIZON, 
                        train_data=train_dataset,
                        test_data=test_dataset,
                        num_iter=10, 
                        num_epochs=100, 
                        loss_fns=["mae", "mse", "mape"]):
  """
  Returns a list of num_iter models each trained on MAE, MSE and MAPE loss.

  """
  # Create empty for trained ensemble models
  ensemble_models = []

  # Create num_iter number of models per loss function
  for i in range(num_iter):
    
    # Build and fit a new model with a different loss function
    for loss_function in loss_fns:
      print(f"Optimizing model by reducing: {loss_function} for {num_epochs} epochs, model number: {i}")

      # Construct a simple model 
      model = tf.keras.Sequential([
        layers.Dense(128, kernel_initializer="he_normal", activation="relu"), 
        layers.Dense(128, kernel_initializer="he_normal", activation="relu"),
        layers.Dense(HORIZON)                                 
      ])

      # Compile simple model with current loss function
      model.compile(loss=loss_function,
                    optimizer=tf.keras.optimizers.Adam(),
                    metrics=["mae", "mse"])
      
      # Fit model
      model.fit(train_data,
                epochs=num_epochs,
                verbose=0,
                validation_data=test_data,
                # callbacks
                callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                            patience=200,
                                                            restore_best_weights=True),
                           tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss",
                                                                patience=100,
                                                                verbose=1)])
      
      # Append fitted model to list of ensemble models
      ensemble_models.append(model)

  return ensemble_models 

In [25]:
%%time
# Get list of trained ensemble models
ensemble_models = get_ensemble_models(num_iter=5,
                                      num_epochs=1000)

Optimizing model by reducing: mae for 1000 epochs, model number: 0

Epoch 00420: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 00520: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Optimizing model by reducing: mse for 1000 epochs, model number: 0
Optimizing model by reducing: mape for 1000 epochs, model number: 0

Epoch 00292: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 00392: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Optimizing model by reducing: mae for 1000 epochs, model number: 1

Epoch 00287: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 00387: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Optimizing model by reducing: mse for 1000 epochs, model number: 1
Optimizing model by reducing: mape for 1000 epochs, model number: 1

Epoch 00232: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 00483: ReduceLR

### Prediction inference

In [26]:
# Create a list of ensemble predictions
ensemble_preds = make_ensemble_preds(ensemble_models=ensemble_models,
                                     data=test_dataset)
ensemble_preds

<tf.Tensor: shape=(15, 80789), dtype=float32, numpy=
array([[1226.9666, 1231.521 , 1229.311 , ..., 2970.6096, 2974.3499,
        2963.5574],
       [1227.8707, 1231.7515, 1230.0686, ..., 2969.5212, 2975.2832,
        2961.4155],
       [1226.419 , 1232.5073, 1229.5392, ..., 2969.4329, 2977.9768,
        2961.433 ],
       ...,
       [1226.9608, 1232.4658, 1229.1375, ..., 2970.5618, 2977.236 ,
        2961.487 ],
       [1227.7007, 1232.9047, 1229.5829, ..., 2970.9543, 2976.2964,
        2962.1074],
       [1226.7943, 1231.0964, 1229.446 , ..., 2969.2341, 2974.8362,
        2962.9832]], dtype=float32)>

### Evaluation

In [27]:
# Evaluate ensemble model(s) predictions
ensemble_results = evaluate_preds(y_true=y_test,
                                  y_pred=np.median(ensemble_preds, axis=0)) 
ensemble_results

{'mae': 5.289649, 'mse': 76.13567, 'rmse': 8.725575, 'mape': 0.22080597}