In [1]:
import pandas as pd
import utils, config
from ml import preprocess, train_expected_returns, predict_expected_returns
from importlib import reload

# Re-execute the already-imported project modules so that the names used below
# refer to their current source without restarting the kernel.
reload(preprocess)
reload(train_expected_returns)
# This is the last expression in the cell, so the reloaded module object is
# what gets echoed as the cell output.
reload(predict_expected_returns)

<module 'ml.predict_expected_returns' from '/home/sagemaker-user/portfolio_optimization/ml/predict_expected_returns.py'>

In [6]:
# Month-end timestamps spanning the configured start/end window.
dates = pd.date_range(start=config.START_DATE, end=config.END_DATE, freq="ME")

# Entries listed under the "clean/" prefix of the configured bucket; only the
# first one is used in this notebook.
files = utils.list_s3_files(bucket_name=config.BUCKET_NAME, prefix="clean/")
f = files[0]

# Read the selected file and run it through the training-data preparation step.
df = preprocess.prepare_training_data(utils.read_s3_file(f))

# Read in pre-trained model

In [7]:
from ml.train_expected_returns import ModelType

# Build the model artifact name from the data file name: every ".csv" becomes
# "_er_models.pkl" and every "clean/" is removed.
model = f.replace(".csv", "_er_models.pkl")
model = model.replace("clean/", "")

# NOTE(review): presumably this deserializes a joblib/pickle artifact — only
# load objects from a trusted bucket.
model_dict_original = utils.read_s3_joblib(f"return_models/{model}")

# Train model

In [3]:
# Train the expected-return models on the prepared frame `df`.
# NOTE(review): this cell's execution count (3) is lower than that of the cells
# above it (6, 7), so it was not run in top-to-bottom order — confirm the
# notebook still works under Restart Kernel -> Run All.
model_dict_w_poly = train_expected_returns.train_expected_returns(df)

  0%|          | 0/28 [00:00<?, ?it/s]INFO:ml.train_expected_returns:Training Model --- Rolling Average
INFO:ml.train_expected_returns:Training Model --- EWMA
INFO:ml.train_expected_returns:Training Model --- ARIMA
INFO:ml.train_expected_returns:Training Model --- Random Forest
INFO:ml.train_expected_returns:Training Model --- XGBoost
INFO:ml.train_expected_returns:Training Model --- Linear Regression
INFO:ml.train_expected_returns:Training Model --- Ridge Regression
INFO:ml.train_expected_returns:Training Model --- Lasso Regression
INFO:ml.train_expected_returns:Training Model --- SVR Model
INFO:ml.train_expected_returns:Training Model --- Gradient Boosting Model
  4%|▎         | 1/28 [02:32<1:08:50, 152.97s/it]INFO:ml.train_expected_returns:Training Model --- Rolling Average
INFO:ml.train_expected_returns:Training Model --- EWMA
INFO:ml.train_expected_returns:Training Model --- ARIMA
INFO:ml.train_expected_returns:Training Model --- Random Forest
INFO:ml.train_expected_returns:Traini

# Train model with standard scaler

In [31]:
# Rebuild the prepared frame from the same source file `f` before training.
df = preprocess.prepare_training_data(utils.read_s3_file(f))

# NOTE(review): this call is identical to the earlier training call; any
# standard-scaler difference presumably lives in the reloaded module source
# rather than in this notebook — confirm.
model_dict_w_ss = train_expected_returns.train_expected_returns(df)

  0%|          | 0/28 [00:00<?, ?it/s]INFO:ml.train_expected_returns:Training Model --- Rolling Average
INFO:ml.train_expected_returns:Training Model --- EWMA
INFO:ml.train_expected_returns:Training Model --- ARIMA
INFO:ml.train_expected_returns:Training Model --- Random Forest
INFO:ml.train_expected_returns:Training Model --- XGBoost
INFO:ml.train_expected_returns:Training Model --- Linear Regression
INFO:ml.train_expected_returns:Training Model --- Ridge Regression
INFO:ml.train_expected_returns:Training Model --- Lasso Regression
INFO:ml.train_expected_returns:Training Model --- SVR Model
INFO:ml.train_expected_returns:Training Model --- Gradient Boosting Model
  4%|▎         | 1/28 [02:38<1:11:19, 158.51s/it]INFO:ml.train_expected_returns:Training Model --- Rolling Average
INFO:ml.train_expected_returns:Training Model --- EWMA
INFO:ml.train_expected_returns:Training Model --- ARIMA
INFO:ml.train_expected_returns:Training Model --- Random Forest
INFO:ml.train_expected_returns:Traini

# Results from Predictions 

In [8]:
# Produce forecasts from the pre-trained model dict over the month-end dates.
results = predict_expected_returns.expected_return_forecasts(
    df,
    dates=dates,
    model_dict=model_dict_original,
)
# Convert the column labels to plain strings so they can be looked up by
# names such as "ModelType.EWMA".
results.columns = list(map(str, results.columns))

In [9]:
def calculate_msfe(df, model_columns=None, target_column="true_value"):
    """Compute the mean squared forecast error (MSFE) of each model column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one forecast column per model plus the target column.
    model_columns : iterable of str, optional
        Names of the forecast columns to score. When omitted, the nine
        ``"ModelType.*"`` columns listed below are used.
    target_column : str, default "true_value"
        Name of the column the forecasts are compared against.

    Returns
    -------
    dict
        Maps each model column name to a one-element list ``[msfe]``, in the
        order the columns were given. The list wrapper lets the result be
        passed directly to ``pd.DataFrame(data=...)``.

    Raises
    ------
    KeyError
        If ``target_column`` or any requested model column is not in ``df``.
    """
    if model_columns is None:
        # Default set of forecast columns scored by this notebook.
        model_columns = (
            "ModelType.ROLLING_AVERAGE",
            "ModelType.EWMA",
            "ModelType.RANDOM_FOREST",
            "ModelType.XGBOOST",
            "ModelType.LINEAR_REGRESSION",
            "ModelType.RIDGE_REGRESSION",
            "ModelType.LASSO_REGRESSION",
            "ModelType.SVR",
            "ModelType.GRADIENT_BOOSTING",
        )

    # Look the target up once instead of once per model.
    actual = df[target_column]

    # pandas' mean() skips NaN by default, so rows with a missing forecast or
    # target do not contribute to that model's average.
    return {
        column: [((df[column] - actual) ** 2).mean()]
        for column in model_columns
    }

# All Models - Hyperparameter Tuning - LR with Poly 

In [10]:
# Tabulate the MSFE with one row per model, sorted ascending so the
# lowest-error model is listed first.
pd.DataFrame(data=calculate_msfe(results)).T.sort_values(0)

Unnamed: 0,0
ModelType.ROLLING_AVERAGE,0.002634
ModelType.LASSO_REGRESSION,0.002672
ModelType.GRADIENT_BOOSTING,0.002759
ModelType.XGBOOST,0.002766
ModelType.EWMA,0.002856
ModelType.RANDOM_FOREST,0.003001
ModelType.SVR,0.003013
ModelType.RIDGE_REGRESSION,0.005864
ModelType.LINEAR_REGRESSION,0.009099


# All Models - All with Poly - No HP Tuning

In [22]:
# Tabulate the MSFE with one row per model, sorted ascending.
# NOTE(review): the saved prediction cell builds `results` from
# `model_dict_original`; presumably `results` was regenerated with a different
# model dict before this run — confirm which models these numbers reflect.
pd.DataFrame(data=calculate_msfe(results)).T.sort_values(0)

Unnamed: 0,0
ModelType.ROLLING_AVERAGE,0.002634
ModelType.LASSO_REGRESSION,0.002679
ModelType.EWMA,0.002856
ModelType.RANDOM_FOREST,0.003052
ModelType.XGBOOST,0.003129
ModelType.RIDGE_REGRESSION,0.003137
ModelType.GRADIENT_BOOSTING,0.003189
ModelType.SVR,0.003197
ModelType.LINEAR_REGRESSION,0.009101


In [33]:
# Tabulate the MSFE with one row per model, sorted ascending.
# NOTE(review): this cell has no heading of its own and the saved prediction
# cell builds `results` from `model_dict_original`; presumably this run scored
# a different model dict — confirm and label the experiment.
pd.DataFrame(data=calculate_msfe(results)).T.sort_values(0)

Unnamed: 0,0
ModelType.ROLLING_AVERAGE,0.002634
ModelType.LASSO_REGRESSION,0.002662
ModelType.EWMA,0.002856
ModelType.SVR,0.002997
ModelType.RANDOM_FOREST,0.003052
ModelType.XGBOOST,0.003132
ModelType.GRADIENT_BOOSTING,0.003171
ModelType.RIDGE_REGRESSION,0.005864
ModelType.LINEAR_REGRESSION,0.009099


In [None]:
import numpy as np