## Predictive Modeling of Cryptocurrency Prices
STA 208 - Spring 2021  
Group: Noah Perry

In [4]:
import math
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection, ensemble

# Directory that holds the project's pickled data. Set the STA208_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the original author's local path, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    "STA208_DATA_DIR",
    "C:/Users/noahj/Documents/UCD/2021 Spring Classes/STA 208/Project/STA208_FinalProject_NoahPerry/Data",
)

# The working directory is still switched so that any other relative path used
# in the notebook keeps resolving against the data folder.
os.chdir(DATA_DIR)

# NOTE: unpickling can execute arbitrary code -- only load main.pkl from a trusted source.
main = pd.read_pickle("main.pkl")

**Model: Random Forest**  
The second kind of model I test is random forest regression. I use the same nested cross validation method and performance metric as used for vector autoregression. Each model includes lags 1, 2, and 7 of the log differenced Bitcoin and Ethereum prices, as well as the same lags of the log differenced price of the target cryptocurrency (either Cardano or Monero).

As shown below, the random forest models using the weekly and biweekly log differenced prices are the best performing of all the models tested for this project, correctly predicting the sign of future price movements over 80 percent of the time for both Cardano and Monero.

*Note:* Each random forest model takes several minutes to run.

In [30]:
def _mae_criterion_name():
    """Return the split-criterion name for mean absolute error in the installed scikit-learn."""
    import re
    import sklearn

    # "mae" was renamed to "absolute_error" in scikit-learn 1.0 and removed in 1.2.
    found = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
    if found is None:
        return "absolute_error"
    version = (int(found.group(1)), int(found.group(2)))
    return "absolute_error" if version >= (1, 0) else "mae"


def rand_forest(features, target, lag_list, data = None, random_state = None):
    """
    Back-test a random forest regression of `target` on lagged values of the
    features and of the target itself, and score sign predictions.

    features: list of feature column names
    target: name of the target column
    lag_list: lags (in rows) of the feature and target columns to include in the model
    data: DataFrame holding the feature and target columns; defaults to the
        notebook-level `main` frame when None
    random_state: seed forwarded to RandomForestRegressor; None (the default)
        leaves the forest unseeded, so repeated runs can differ

    Returns a DataFrame with one row per out-of-sample observation, containing
    the columns "pred_<target>", "<target>", "pred_sign_<target>",
    "actual_sign_<target>" and "same_sign_<target>" (1 when the predicted and
    actual signs agree, 0 otherwise).

    Raises ValueError when fewer than 28 complete rows remain after lagging,
    because at least two cross-validation splits are needed.

    NOTE(review): if the target is a multi-row log difference (presumably the
    case for the logdif7_* / logdif14_* columns -- confirm), its lag-1 value
    covers most of the same window as the target, so sign accuracy will look
    much better than for a non-overlapping forecast.
    """
    source = main if data is None else data
    series = source[list(features) + [target]]

    # Lagged design matrix: for each lag, the target's lag followed by every feature's lag
    lagged = {}
    for L in lag_list:
        lagged[target + "_lag" + str(L)] = series[target].shift(L)
        for f in features:
            lagged[f + "_lag" + str(L)] = series[f].shift(L)

    # The target is kept alongside its lags only so that NA rows are dropped jointly
    lagged[target] = series[target]

    # Restrict to rows where every lag and the target are observed, then use a
    # 0..n-1 index so positions and labels agree
    frame = pd.DataFrame(lagged).dropna(axis = 0).reset_index(drop = True)

    # Define y and X (X holds only the lags of target and features)
    y = frame[target]
    X = frame.drop(columns = target)

    # Roughly one split per 14 rows of data
    splits = X.shape[0] // 14
    if splits < 2:
        raise ValueError(
            "rand_forest needs at least 28 complete rows after lagging; got "
            + str(X.shape[0])
        )

    model = ensemble.RandomForestRegressor(
        criterion = _mae_criterion_name(),
        random_state = random_state,
    )
        # using Mean Absolute Error due to use case of investing -> mae is closer to a profitability measure than mse
        # using defaults otherwise

    # Back testing / cross validation
    fold_results = []
    tscv = model_selection.TimeSeriesSplit(n_splits = splits)
    for train_index, test_index in tscv.split(X):
        # Construct training and test sets (positional indexing)
        X_tr, X_te = X.iloc[train_index], X.iloc[test_index]
        y_tr, y_te = y.iloc[train_index], y.iloc[test_index]

        # Fit random forest model
        model.fit(X_tr, y_tr)

        # Make predictions
        pred = model.predict(X_te)

        # Performance metric: percent of predicted signs that match actual sign
        comparison = pd.DataFrame(
            {"pred_" + target: pred, target: y_te.values},
            index = y_te.index,
        )
        comparison["pred_sign_" + target] = np.where(comparison["pred_" + target] >= 0, "positive", "negative")
        comparison["actual_sign_" + target] = np.where(comparison[target] >= 0, "positive", "negative")
        comparison["same_sign_" + target] = (comparison["pred_sign_" + target] == comparison["actual_sign_" + target]).astype(int)

        fold_results.append(comparison)

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and was quadratic in a loop
    return pd.concat(fold_results)

**Monero Models**

In [32]:
# Random forest on the daily log differenced Monero price, using lags 1, 2, and 7
# of each feature and of the target variable.
rf_xmr1 = rand_forest(["logdif_XBT", "logdif_ETH"], "logdif_XMR", lag_list = [1,2,7])
# Label matches the other model cells and the recorded output of this cell.
print("Proportion Correct: ",(rf_xmr1["same_sign_logdif_XMR"] == 1).sum() / rf_xmr1.shape[0])

Proportion Correct:  0.5048951048951049


In [33]:
# Weekly log differenced Monero price as target; lags 1, 2, and 7 of every
# feature and of the target enter the forest.
rf_xmr7 = rand_forest(
    features = ["logdif7_XBT", "logdif7_ETH"],
    target = "logdif7_XMR",
    lag_list = [1, 2, 7],
)
# Share of out-of-sample rows whose predicted sign equals the actual sign
print(
    "Proportion Correct: ",
    rf_xmr7["same_sign_logdif7_XMR"].eq(1).sum() / len(rf_xmr7),
)

Proportion Correct:  0.8108680310515173


In [34]:
# Biweekly log differenced Monero price as target; lags 1, 2, and 7 of every
# feature and of the target enter the forest.
rf_xmr14 = rand_forest(
    features = ["logdif14_XBT", "logdif14_ETH"],
    target = "logdif14_XMR",
    lag_list = [1, 2, 7],
)
# Share of out-of-sample rows whose predicted sign equals the actual sign
print(
    "Proportion Correct: ",
    rf_xmr14["same_sign_logdif14_XMR"].eq(1).sum() / len(rf_xmr14),
)

Proportion Correct:  0.8750882145377559


**Cardano Models**

In [35]:
# Daily log differenced Cardano price as target; lags 1, 2, and 7 of every
# feature and of the target enter the forest.
rf_ada1 = rand_forest(
    features = ["logdif_XBT", "logdif_ETH"],
    target = "logdif_ADA",
    lag_list = [1, 2, 7],
)
# Share of out-of-sample rows whose predicted sign equals the actual sign
print(
    "Proportion Correct: ",
    rf_ada1["same_sign_logdif_ADA"].eq(1).sum() / len(rf_ada1),
)

Proportion Correct:  0.5144230769230769


In [36]:
# Weekly log differenced Cardano price as target; lags 1, 2, and 7 of every
# feature and of the target enter the forest.
rf_ada7 = rand_forest(
    features = ["logdif7_XBT", "logdif7_ETH"],
    target = "logdif7_ADA",
    lag_list = [1, 2, 7],
)
# Share of out-of-sample rows whose predicted sign equals the actual sign
print(
    "Proportion Correct: ",
    rf_ada7["same_sign_logdif7_ADA"].eq(1).sum() / len(rf_ada7),
)

Proportion Correct:  0.8016826923076923


In [39]:
# Biweekly log differenced Cardano price as target; lags 1, 2, and 7 of every
# feature and of the target enter the forest.
rf_ada14 = rand_forest(
    features = ["logdif14_XBT", "logdif14_ETH"],
    target = "logdif14_ADA",
    lag_list = [1, 2, 7],
)
# Share of out-of-sample rows whose predicted sign equals the actual sign
print(
    "Proportion Correct: ",
    rf_ada14["same_sign_logdif14_ADA"].eq(1).sum() / len(rf_ada14),
)

Proportion Correct:  0.873015873015873
