# Cryptocurrency Trading via Multivariate AR (VAR)

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import itertools
import os

In [2]:
data_directory = r"C:\Users\sb013698\Desktop\github\Machine Learning in Finance\Datasets"

- In this question, you model the price dynamics of multiple cryptocurrencies jointly via multivariate AR (Vector AR). 
- We will focus on the following assets only: BTC, ETH, LTC, SOL, AVAX, BNB.
- Since cryptocurrency markets are 7/24, we will use the latest data in UTC time as the close price.
- Our training period 2021-2022 and test period will be 2022-2023. Once predicted after training, our portfolio will be allocated to the top two cryptocurrencies with the highest return estimation.
- We will report the best model in terms of Information Ratio by considering the parameters p and q in ranges [0, 1, 5].

> **The Vector Autoregressive (VAR) model** is a statistical model used to capture the linear interdependencies among multiple time series. It extends the univariate autoregressive model to a multivariate framework.

In [3]:
# Define the list of cryptocurrencies and the date range
cryptos = ['BTC-USD', 'ETH-USD', 'LTC-USD', 'SOL-USD', 'AVAX-USD', 'BNB-USD']
start_date = "2021-01-01"
end_date = "2023-12-31"

# Initialize an empty dictionary to store 'Close' prices
crypto_close_prices = {}

# Fetch data for each cryptocurrency
for crypto in cryptos:
    try:
        # Download the historical data
        data = yf.download(crypto, start=start_date, end=end_date)
        if not data.empty:
            # Extract 'Close' prices and store in the dictionary
            crypto_close_prices[crypto] = data[['Close']]
            print(f"Fetched {len(data)} rows for {crypto}")
        else:
            print(f"No data available for {crypto}")
    except Exception as e:
        print(f"Failed to fetch data for {crypto}: {e}")

# Check if the dictionary is empty
if not crypto_close_prices:
    print("No data was fetched for any cryptocurrency.")
else:
    # Extract the time index (Date) from one of the DataFrames
    time_index = crypto_close_prices["BTC-USD"].index

    # Initialize an empty DataFrame to hold the final close prices
    close_prices_df = pd.DataFrame(index=time_index)

    # Add each cryptocurrency's Close prices to the DataFrame
    for crypto, df in crypto_close_prices.items():
        close_prices_df[crypto] = df['Close']

# Save the resulting DataFrame to a CSV file
close_prices_df.to_csv(os.path.join(data_directory, "crypto_close_prices.csv"))

print(close_prices_df.shape)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetched 1094 rows for BTC-USD
Fetched 1094 rows for ETH-USD



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Fetched 1094 rows for LTC-USD
Fetched 1094 rows for SOL-USD


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetched 1094 rows for AVAX-USD
Fetched 1094 rows for BNB-USD
(1094, 6)





In [4]:
close_prices_df.head(10)

Unnamed: 0_level_0,BTC-USD,ETH-USD,LTC-USD,SOL-USD,AVAX-USD,BNB-USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-01 00:00:00+00:00,29374.152344,730.367554,126.230347,1.842084,3.664823,37.90501
2021-01-02 00:00:00+00:00,32127.267578,774.534973,136.944885,1.799275,3.49494,38.241592
2021-01-03 00:00:00+00:00,32782.023438,975.50769,160.190582,2.161752,3.472944,41.148979
2021-01-04 00:00:00+00:00,31971.914062,1040.233032,154.807327,2.485097,3.590243,40.926353
2021-01-05 00:00:00+00:00,33992.429688,1100.006104,158.594772,2.157217,4.237412,41.7346
2021-01-06 00:00:00+00:00,36824.363281,1207.112183,169.016922,1.929217,4.620357,42.165955
2021-01-07 00:00:00+00:00,39371.042969,1225.678101,169.615952,2.372745,4.476613,43.44949
2021-01-08 00:00:00+00:00,40797.609375,1224.197144,173.279877,3.219887,5.79711,42.395317
2021-01-09 00:00:00+00:00,40254.546875,1281.077271,177.483932,3.385382,7.218531,43.932854
2021-01-10 00:00:00+00:00,38356.441406,1262.246704,171.114838,3.466111,6.586496,42.448475


In [5]:
# Convert Date index to datetime (if not already)
close_prices_df.index = pd.to_datetime(close_prices_df.index)

# Split data into training (2021-2022) and testing (2022-2023) periods
train_data = close_prices_df['2021-01-01':'2022-12-31']
test_data = close_prices_df['2023-01-01':'2023-12-31']

# Set the frequency explicitly (e.g., 'D' for daily, 'M' for monthly)
train_data = train_data.asfreq('D')
test_data = test_data.asfreq('D')

print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

Train shape: (730, 6)
Test shape: (364, 6)


In [6]:
# Create combinations from model parameters
p_values = [1, 5]
q_values = [0, 1, 5]
combinations = list(itertools.product(p_values, q_values))
print(f"Number of combinations: {len(combinations)}")
print(combinations)

Number of combinations: 6
[(1, 0), (1, 1), (1, 5), (5, 0), (5, 1), (5, 5)]


In [7]:
# Store scenario errors
error_list = list()

for p, q in combinations:

    # Fit the VAR model on the training data
    model = VAR(train_data)
    fitted_model = model.fit(maxlags=p, ic='aic')  # Fit the model using AIC to select optimal lags

    # Make predictions for the test data
    # Use the most recent data to generate future forecasts
    forecast = fitted_model.forecast(
        # fitted_model.k_ar : attribute of the VAR model representing the number of lags the model has been trained on
        train_data.values[-fitted_model.k_ar:],  
        steps=len(test_data), # number of future time steps for which the VAR will generate forecasts
    )
    forecast_df = pd.DataFrame(forecast, index=test_data.index, columns=test_data.columns)

    # Evaluate test performance
    mse = mean_squared_error(forecast_df, test_data)
    mape = mean_absolute_percentage_error(forecast_df, test_data)

    # Store the error and the corresponding p, q values
    error_list.append({
        'p': p,
        'q': q,
        'MSE': mse,
        'MAPE': mape,
    })

scenario_df = pd.DataFrame(error_list)

In [8]:
scenario_df.sort_values(by="MAPE", ascending=True)

Unnamed: 0,p,q,MSE,MAPE
3,5,0,4069594.0,1.190033
4,5,1,4069594.0,1.190033
5,5,5,4069594.0,1.190033
0,1,0,3333650.0,2.731085
1,1,1,3333650.0,2.731085
2,1,5,3333650.0,2.731085


**Discussion:** The results from the VAR model evaluation indicate that it struggles to effectively capture the complex dynamics of the cryptocurrency trading time series. Despite varying p and q parameters, the model consistently produces high errors, as reflected in both MSE and MAPE metrics. This suggests that the linear assumptions underlying the VAR model are insufficient for handling cryptocurrency data's inherent volatility, nonlinearity, and noise. Alternative modeling approaches like nonlinear methods like LSTM or GRU may better capture the intricate patterns and dependencies in such time series.

# END