In [24]:
import pandas as pd
from ml import predict_expected_returns, preprocess, train_expected_returns
from importlib import reload
import config
from pypfopt import risk_models
from tqdm import tqdm
import os
import joblib
import utils
from rpy2.robjects import pandas2ri, r
from tqdm import tqdm
import numpy as np

# Re-execute the local `ml` submodules so that edits made to their source files
# are picked up by this kernel without restarting it.
reload(preprocess)
reload(train_expected_returns)
reload(predict_expected_returns)

<module 'ml.predict_expected_returns' from '/home/sagemaker-user/portfolio_optimization/ml/predict_expected_returns.py'>

In [27]:
def forecast_ra_cov(data_window, days_in_month):
    """Sample-covariance forecast scaled to a horizon of ``days_in_month`` days.

    ``risk_models.sample_cov`` is called on ``data_window`` as returns data; its
    result is divided by 252 and then multiplied by ``days_in_month``.
    """
    annualized = risk_models.sample_cov(data_window, returns_data=True)
    per_day = annualized / 252
    return per_day * days_in_month


def forecast_shrinkage_cov(data_window, days_in_month):
    """Ledoit-Wolf shrinkage covariance forecast for ``days_in_month`` days.

    The shrinkage estimate from ``risk_models.CovarianceShrinkage`` (fed with
    returns data) is divided by 252 and then multiplied by ``days_in_month``.
    """
    shrinker = risk_models.CovarianceShrinkage(data_window, returns_data=True)
    per_day = shrinker.ledoit_wolf() / 252
    return per_day * days_in_month


def forecast_ewma_cov(data_window, days_in_month, span=180):
    """Exponentially weighted covariance forecast for ``days_in_month`` days.

    ``span`` is forwarded to ``risk_models.exp_cov``; the estimate is divided by
    252 and then multiplied by ``days_in_month``.
    """
    ewma_estimate = risk_models.exp_cov(data_window, returns_data=True, span=span)
    per_day = ewma_estimate / 252
    return per_day * days_in_month


# Turn on rpy2's pandas conversion for the whole session; the forecast wrappers
# below hand pandas DataFrames to R functions.
pandas2ri.activate()


# R source defining forecast_dcc_garch_cov(data_window, days_in_month).
# It fits a DCC(1,1) model with one sGARCH(1,1) spec per column (every column
# except `date`), returns NULL with a warning when dccfit hands back a
# uGARCHmultifit object (its non-convergence result), and otherwise sums the
# forecast covariance matrices over the days_in_month-step horizon.
r_code_dcc_garch = """
library(rmgarch)
library(dplyr)

forecast_dcc_garch_cov <- function(data_window, days_in_month) {
    # Define the GARCH specification
    spec <- ugarchspec(
      variance.model = list(model = 'sGARCH', garchOrder = c(1, 1)),
      mean.model = list(armaOrder = c(0, 0))
    )

    # Create the multivariate GARCH specification
    num_columns <- ncol(data_window) - 1
    uspec <- multispec(replicate(num_columns, spec))
    dcc_spec <- dccspec(uspec, dccOrder = c(1, 1), distribution = 'mvnorm')

    # Fit the DCC-GARCH model
    dcc_fit <- dccfit(dcc_spec, data = data_window %>% select(-date))

    if (inherits(dcc_fit, 'uGARCHmultifit')) {
        # Handle non-convergence
        warning('DCC-GARCH fit did not converge. Returning NULL.')
        return(NULL)
    }

    # Forecast the DCC-GARCH model for days_in_month days
    n_ahead <- days_in_month
    dcc_forecast <- dccforecast(dcc_fit, n.ahead = n_ahead)
    dcc_cov_matrix <- rcov(dcc_forecast)[[1]]
    dcc_cov_matrix <- apply(dcc_cov_matrix, c(1, 2), sum)

    return(dcc_cov_matrix)
}
"""

# R source defining forecast_go_garch_cov(data_window, days_in_month).
# It fits a GO-GARCH model (mvnorm distribution) on every column except `date`,
# returns NULL with a warning when the fit object is a uGARCHmultifit, and
# otherwise sums the forecast covariance matrices over the days_in_month-step
# horizon.
r_code_go_garch = """
library(rmgarch)
library(dplyr)

forecast_go_garch_cov <- function(data_window, days_in_month) {
    spec <- ugarchspec(
        variance.model = list(model = 'sGARCH', garchOrder = c(1, 1)),
        mean.model = list(armaOrder = c(0, 0))
    )

    # Create multispec for GO-GARCH
    num_columns <- ncol(data_window) - 1
    uspec <- multispec(replicate(num_columns, spec))
    # Specify the GO-GARCH model
    garch_spec <- gogarchspec(mean.model = 'constant',
                      variance.model = 'goGARCH',
                      distribution.model = 'mvnorm',
                      umodel = uspec)    
    # Fit the GO-GARCH model
    fit <- gogarchfit(spec = garch_spec, data = data_window %>% select(-date))

    if (inherits(fit, 'uGARCHmultifit')) {
        # Handle non-convergence
        warning('GO-GARCH fit did not converge. Returning NULL.')
        return(NULL)
    }

    # Forecast the GO-GARCH model
    n_ahead <- days_in_month
    gogarch_forecast <- gogarchforecast(fit, n.ahead = n_ahead)
    gogarch_cov_matrix <- rcov(gogarch_forecast)[[1]]
    gogarch_cov_matrix <- apply(gogarch_cov_matrix, c(1, 2), sum)

    return(gogarch_cov_matrix)
}
"""
# Evaluate both R snippets in the embedded R session so that
# forecast_dcc_garch_cov and forecast_go_garch_cov exist there and can be looked
# up later through r["..."].
r(r_code_dcc_garch)
r(r_code_go_garch)


def forecast_dcc_garch_cov(data_window, days_in_month):
    """Forecast next-month covariance with the DCC-GARCH model defined in R.

    Args:
        data_window: pandas DataFrame with one column per series. Its index is
            turned into a column by ``reset_index()``; the R function drops a
            column named ``date``, so the index is expected to carry that name.
        days_in_month: Forecast horizon, passed to the R function unchanged.

    Returns:
        A square float ``numpy`` array with one row/column per column of
        ``data_window``. The array is filled with NaN whenever R does not return
        a matrix of that shape (for example when the fit did not converge and
        the R function returned NULL).
    """
    n_assets = data_window.shape[1]

    # Convert the pandas DataFrame to R DataFrame
    r_data_window = pandas2ri.py2rpy(data_window.reset_index())
    dcc_cov_matrix = r["forecast_dcc_garch_cov"](r_data_window, days_in_month)

    # An R NULL reaches Python as an rpy2 NULL object rather than None, so an
    # `is None` test alone does not detect non-convergence. Validating the shape
    # of the converted array covers None, NULL and any other malformed result.
    try:
        cov = np.array(dcc_cov_matrix, dtype=float)
    except (TypeError, ValueError):
        cov = None

    if cov is None or cov.shape != (n_assets, n_assets):
        return np.full((n_assets, n_assets), np.nan)
    return cov


def forecast_go_garch_cov(data_window, days_in_month):
    """Forecast next-month covariance with the GO-GARCH model defined in R.

    Args:
        data_window: pandas DataFrame with one column per series. Its index is
            turned into a column by ``reset_index()``; the R function drops a
            column named ``date``, so the index is expected to carry that name.
        days_in_month: Forecast horizon, passed to the R function unchanged.

    Returns:
        A square float ``numpy`` array with one row/column per column of
        ``data_window``. The array is filled with NaN whenever R does not return
        a matrix of that shape (for example when the R function returned NULL).
    """
    n_assets = data_window.shape[1]

    # Convert the pandas DataFrame to R DataFrame
    r_data_window = pandas2ri.py2rpy(data_window.reset_index())
    go_garch_cov_matrix = r["forecast_go_garch_cov"](r_data_window, days_in_month)

    # An R NULL reaches Python as an rpy2 NULL object rather than None, so an
    # `is None` test alone does not detect a failed fit. Validating the shape of
    # the converted array covers None, NULL and any other malformed result.
    try:
        cov = np.array(go_garch_cov_matrix, dtype=float)
    except (TypeError, ValueError):
        cov = None

    if cov is None or cov.shape != (n_assets, n_assets):
        return np.full((n_assets, n_assets), np.nan)
    return cov

In [28]:
def _forecast_cov(df, current_date, lookback_months=120):
    """Build the realised covariance and every model forecast for the next month.

    Args:
        df: DataFrame with a DatetimeIndex (``.month`` / ``.year`` are read from
            the index). Values are cast to float and divided by 100 before use;
            presumably they are percent returns -- confirm against the data
            source.
        current_date: Timestamp ending the estimation window (inclusive). The
            calendar month after it is the forecast target.
        lookback_months: Length of the estimation window in calendar months.
            Defaults to 120.

    Returns:
        dict with ``date`` (``current_date``), ``true_cov`` (sample covariance of
        the target month, scaled to that month's number of rows) and one
        forecast per model under ``cov_ra``, ``cov_lw_shrinkage``, ``cov_ewma``,
        ``cov_dcc`` and ``cov_gogarch``.

    Raises:
        ValueError: If ``df`` has no rows in the month after ``current_date``.
    """
    # Period to predict
    next_date = current_date + pd.DateOffset(months=1)
    in_next_month = (df.index.month == next_date.month) & (
        df.index.year == next_date.year
    )

    # True value
    true_df = df.loc[in_next_month].astype(float) / 100
    days_in_next_month = len(true_df)
    if days_in_next_month == 0:
        # Without target-month rows the horizon would be zero and the realised
        # covariance undefined; fail before starting the expensive model fits.
        raise ValueError(
            f"No observations found for the month following {current_date}; "
            "cannot compute the realised covariance or the forecast horizon."
        )
    true_cov = (
        risk_models.sample_cov(true_df, returns_data=True) / 252 * days_in_next_month
    )

    # This gets the data for the lookback period
    start_date_window = current_date - pd.DateOffset(months=lookback_months)
    in_window = (df.index >= start_date_window) & (df.index <= current_date)
    window_df = df.loc[in_window].astype(float) / 100

    # Get the forecast for next month
    return {
        "date": current_date,
        "true_cov": true_cov,
        "cov_ra": forecast_ra_cov(window_df, days_in_next_month),
        "cov_lw_shrinkage": forecast_shrinkage_cov(window_df, days_in_next_month),
        "cov_ewma": forecast_ewma_cov(window_df, days_in_next_month),
        "cov_dcc": forecast_dcc_garch_cov(window_df, days_in_next_month),
        "cov_gogarch": forecast_go_garch_cov(window_df, days_in_next_month),
    }


def forecast_cov(df):
    """Run ``_forecast_cov`` for the last available date of every month in range.

    ``df`` must have a ``date`` column; it becomes a DatetimeIndex. Months are
    taken from the slice ``config.START_DATE : config.END_DATE`` and the latest
    index value in each (year, month) group is used as the forecast origin.
    Returns the list of per-date results in chronological group order.
    """
    indexed = df.set_index("date")
    indexed.index = pd.to_datetime(indexed.index)

    # Last observed date of each calendar month inside the configured range
    in_range = indexed.loc[config.START_DATE : config.END_DATE]
    month_keys = [in_range.index.year, in_range.index.month]
    month_ends = (
        in_range.groupby(month_keys)
        .apply(lambda month: month.index.max())
        .reset_index(drop=True)
        .tolist()
    )

    # The full (unsliced) frame is passed on so each forecast can look back
    # before START_DATE and forward into the following month.
    return [_forecast_cov(indexed, month_end) for month_end in tqdm(month_ends)]

In [34]:
# The original line was an unfinished date filter (`df[pd.to_datetime(df.date) ==]`),
# which is a syntax error. Show the frame itself; to filter on a day, compare
# against an explicit date, e.g. df[pd.to_datetime(df.date) == "YYYY-MM-DD"].
df

Unnamed: 0,date,lo_prior,prior_2,prior_3,prior_4,prior_5,prior_6,prior_7,prior_8,prior_9,hi_prior
0,1926-11-03,-0.12,0.60,-0.09,0.30,-0.51,-0.22,-0.12,0.50,0.13,1.28
1,1926-11-04,0.65,1.82,1.34,0.61,1.01,0.64,0.82,0.44,0.48,0.40
2,1926-11-05,-0.84,-0.77,-0.22,-0.15,-0.02,-0.02,-0.07,0.36,0.20,0.08
3,1926-11-06,1.03,0.28,0.24,0.40,0.19,0.64,0.10,0.10,0.39,-0.68
4,1926-11-08,-0.06,0.11,1.78,0.28,0.36,0.23,0.30,1.17,0.58,-0.18
...,...,...,...,...,...,...,...,...,...,...,...
25648,2024-05-24,0.83,0.58,0.94,0.06,0.18,0.35,0.46,1.03,0.54,2.02
25649,2024-05-28,-0.38,-0.94,-0.45,-0.78,-0.48,-0.49,-0.67,-0.40,0.19,2.51
25650,2024-05-29,-1.86,-1.44,-0.37,-1.52,-1.02,-0.48,-0.95,-0.89,-0.88,-0.15
25651,2024-05-30,1.12,0.91,0.55,-0.07,0.11,-1.24,-1.04,-0.09,-0.68,-1.83


In [38]:
# Compute covariance forecasts for every file listed under the "clean/" prefix
# of the configured bucket and store one joblib file per input under "output/".
files = utils.list_s3_files(prefix="clean/", bucket_name=config.BUCKET_NAME)
for f in files:
    df = utils.read_s3_file(f)
    results = forecast_cov(df)
    # Reduce the key to the bare dataset name, e.g. "clean/btm.csv" -> "btm".
    clean_f = f.replace("clean/", "").replace(".csv", "")
    utils.write_s3_joblib(results, f"output/cov_forecasts_{clean_f}.pkl")

clean/btm.csv btm
clean/industry.csv industry
clean/momentum.csv momentum
clean/size.csv size
clean/size_ltr.csv size_ltr
clean/size_str.csv size_str
clean/sizebtm.csv sizebtm
clean/sizemomentum.csv sizemomentum


In [30]:
# Single-dataset run: forecast covariances for the momentum file only and store
# the result list under "output/".
df = utils.read_s3_file("clean/momentum.csv")
results = forecast_cov(df)
utils.write_s3_joblib(results, "output/cov_forecasts_momentum.pkl")

 10%|█         | 35/348 [27:22<4:06:26, 47.24s/it]


Non-Converged:
[1]  1  2  3  5  6  7  8  9 10

dccfit-->error: convergence problem in univariate fit...
...returning uGARCHmultifit object instead...check and resubmit...

 12%|█▏        | 42/348 [32:14<3:50:15, 45.15s/it]


Non-Converged:
[1]  1  2  3  5  6  7  8  9 10

dccfit-->error: convergence problem in univariate fit...
...returning uGARCHmultifit object instead...check and resubmit...

100%|██████████| 348/348 [4:05:46<00:00, 42.38s/it]  


In [33]:
# Write the in-memory `results` to the momentum output key without recomputing
# the forecasts.
utils.write_s3_joblib(results, "output/cov_forecasts_momentum.pkl")

In [32]:
# Inspect the first forecast record: a dict holding the date, the realised
# covariance and one covariance matrix per forecasting model.
results[0]

{'date': Timestamp('1995-01-31 00:00:00'),
 'true_cov':           lo_prior   prior_2   prior_3   prior_4   prior_5   prior_6  \
 lo_prior  0.000728  0.000608  0.000435  0.000439  0.000506  0.000373   
 prior_2   0.000608  0.000762  0.000491  0.000520  0.000601  0.000480   
 prior_3   0.000435  0.000491  0.000437  0.000391  0.000478  0.000373   
 prior_4   0.000439  0.000520  0.000391  0.000521  0.000556  0.000403   
 prior_5   0.000506  0.000601  0.000478  0.000556  0.000715  0.000527   
 prior_6   0.000373  0.000480  0.000373  0.000403  0.000527  0.000489   
 prior_7   0.000375  0.000439  0.000342  0.000427  0.000500  0.000402   
 prior_8   0.000384  0.000441  0.000325  0.000397  0.000505  0.000383   
 prior_9   0.000327  0.000419  0.000256  0.000339  0.000380  0.000314   
 hi_prior  0.000486  0.000526  0.000367  0.000378  0.000443  0.000357   
 
            prior_7   prior_8   prior_9  hi_prior  
 lo_prior  0.000375  0.000384  0.000327  0.000486  
 prior_2   0.000439  0.000441  0.000

In [None]:
# Scratch check of the month-end date selection, run on the momentum file.
df = utils.read_s3_file("clean/momentum.csv")
# The dates live in the "date" column (forecast_cov indexes by that column), so
# build the DatetimeIndex from it. Converting the default index instead would
# not produce the observation dates, and the date slice below would miss them.
df.index = pd.to_datetime(df["date"])
# Get list of dates to use
df_filtered = df.loc[config.START_DATE : config.END_DATE]
dates = (
    df_filtered.groupby([df_filtered.index.year, df_filtered.index.month])
    .apply(lambda x: x.index.max())
    .reset_index(drop=True)
    .tolist()
)

In [None]:
# Forecast covariances for every CSV in the local clean-data folder and store
# one joblib file per input.
# os.listdir also returns sub-directories and hidden entries (for example
# notebook checkpoint folders) that read_csv cannot parse, so keep only *.csv
# files; sorting makes the processing order deterministic.
file_list = sorted(
    name for name in os.listdir("../data/clean/") if name.endswith(".csv")
)

for f in tqdm(file_list):
    df = pd.read_csv(f"../data/clean/{f}")
    results = forecast_cov(df)
    # Drop only the trailing extension to get the dataset name.
    clean_f = os.path.splitext(f)[0]
    joblib.dump(results, f"../data/output/{clean_f}_cov_forecasts.joblib")