In [111]:
import pandas as pd
from ml import preprocess
from importlib import reload
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# Re-import ml.preprocess so edits made to the module on disk are picked up
# without restarting the kernel.
reload(preprocess)

<module 'ml.preprocess' from '/Users/noahforougi/research/portfolio_optimization/ml/preprocess.py'>

Create monthly dataset

In [112]:
# Preprocess data and construct monthly dataset
returns = preprocess.create_returns_df("../data/prices.csv")
df = preprocess.calculate_monthly_features(returns)
df = preprocess.calculate_momentum_features(df)
df = preprocess.calculate_moving_average_features(df)

# `month` holds monthly periods; to_timestamp() already returns datetimes, so
# no string parsing is needed. MonthEnd(0) rolls each one to its month-end.
df["date"] = df["month"].dt.to_timestamp() + pd.offsets.MonthEnd(0)
df = df.drop(columns=["month"])
df = preprocess.z_score_by_period(df)

# Target: each ticker's return in the following row. The last row of every
# ticker therefore has a missing target.
# NOTE(review): assumes rows are in chronological order within each ticker,
# one row per month - confirm against ml.preprocess.
df["forward_return_1m"] = df.groupby("ticker")["monthly_return"].shift(-1)
df = df.set_index("date")

1. Create Individual RF models 

In [None]:
def train_random_forest(train_df, n_estimators=100, random_state=42):
    """Train a random forest regressor on the training data.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows. Must contain the target column ``forward_return_1m``
        and a ``ticker`` column; every other column is used as a feature.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed passed to the regressor so repeated fits are reproducible.

    Returns
    -------
    RandomForestRegressor
        The fitted model.

    Raises
    ------
    ValueError
        Raised by scikit-learn when no row with a non-missing target remains.
    """
    # Rows without a target cannot be learned from, and scikit-learn refuses
    # to fit when y contains NaN, so drop them before building X and y.
    labelled_df = train_df.dropna(subset=["forward_return_1m"])
    X_train = labelled_df.drop(columns=["forward_return_1m", "ticker"])
    y_train = labelled_df["forward_return_1m"]
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X_train, y_train)
    return rf


def predict_with_model(model, test_df):
    """Return ``model``'s predictions for the feature columns of ``test_df``.

    The target column ``forward_return_1m`` and the ``ticker`` identifier are
    removed first; all remaining columns are handed to ``model.predict``.
    ``test_df`` itself is left unmodified.
    """
    non_feature_columns = ["forward_return_1m", "ticker"]
    features = test_df.drop(columns=non_feature_columns)
    predictions = model.predict(features)
    return predictions


def split_data(df, ticker, start_date, end_date):
    """Return ``(train_df, test_df)`` for one ticker, split on the index dates.

    Training rows satisfy ``start_date <= index < end_date``; test rows are
    every row of that ticker with ``index >= end_date``.
    """
    rows_for_ticker = df.loc[df["ticker"] == ticker]
    dates = rows_for_ticker.index

    in_train_window = (dates >= start_date) & (dates < end_date)
    in_test_window = dates >= end_date

    return rows_for_ticker.loc[in_train_window], rows_for_ticker.loc[in_test_window]


def run_individual_models(df):
    """Run individual models for each stock to predict expected returns.

    The first five years of ``df``, measured from its earliest index value,
    form the training window. One random forest is fitted per ticker on that
    window and then used to predict ``forward_return_1m`` for that ticker's
    rows in the following two years.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by date, with a ``ticker`` column, the target column
        ``forward_return_1m`` and the feature columns.

    Returns
    -------
    pandas.DataFrame
        The rows of the prediction window for every ticker (all original
        columns, same index) plus a ``predicted_return`` column. Each input
        row appears at most once.

    Raises
    ------
    ValueError
        If no ticker has both training rows and rows in the prediction window.
    """
    training_start = df.index.min()
    training_end = training_start + pd.DateOffset(years=5)
    prediction_end = training_end + pd.DateOffset(years=2)

    all_predictions = []
    for ticker in tqdm(df["ticker"].unique()):
        train_df, test_df = split_data(df, ticker, training_start, training_end)
        if len(train_df) == 0:
            continue
        rf_model = train_random_forest(train_df)

        # The model is fitted once per ticker, so the whole prediction window
        # is scored in a single call. Looping over month-ends with a
        # cumulative window would re-predict and re-append the same rows.
        # NOTE(review): rows dated exactly at `training_end` are excluded from
        # training (index < end) and from prediction (index > end) - confirm
        # that this one-period gap is intended.
        in_window = (test_df.index > training_end) & (test_df.index <= prediction_end)
        window_df = test_df[in_window].copy()
        if window_df.empty:
            continue
        window_df["predicted_return"] = predict_with_model(rf_model, window_df)
        all_predictions.append(window_df)

    if not all_predictions:
        # pd.concat([]) would raise the same exception type with a message
        # that says nothing about the cause.
        raise ValueError(
            "No predictions were produced: no ticker has both training rows before "
            f"{training_end} and rows in the prediction window ending {prediction_end}."
        )
    return pd.concat(all_predictions)


# Fit the per-ticker models, order the predictions by date then ticker, and
# drop any row whose column values repeat an earlier row.
all_predictions_df = run_individual_models(df)
all_predictions_df = all_predictions_df.sort_values(by=["date", "ticker"])
all_predictions_df = all_predictions_df.drop_duplicates()

In [None]:
# Benchmark forecast: rolling mean of each ticker's monthly return over a
# window of up to 24 rows (min_periods=1, so early rows average over fewer).
# NOTE(review): the window counts rows, so it spans 24 months only if every
# ticker has one row per month in chronological order - confirm.
historical_avg_predictions = (
    df.groupby("ticker", as_index=False)["monthly_return"]
    .rolling(window=24, min_periods=1)
    .mean()
    .rename(columns={"monthly_return": "historical_avg_prediction"})
)

# Drop a benchmark column left behind by an earlier run of this cell.
# Otherwise the merge would suffix the overlapping columns with _x/_y and
# `historical_avg_prediction` would no longer exist downstream.
all_predictions_df = all_predictions_df.drop(
    columns=["historical_avg_prediction"], errors="ignore"
).merge(historical_avg_predictions, on=["date", "ticker"], how="inner")

In [None]:
# Score both forecasts on the same rows. Rows with a missing realised return
# or a missing forecast are excluded, because mean_squared_error raises on NaN.
scored_predictions = all_predictions_df.dropna(
    subset=["forward_return_1m", "predicted_return", "historical_avg_prediction"]
)

msfe_rf = mean_squared_error(
    scored_predictions["forward_return_1m"], scored_predictions["predicted_return"]
)
print(f"MSFE for Random Forest predictions: {msfe_rf}")

# Stored under its own name so `msfe_rf` keeps the random-forest score.
msfe_hist_avg = mean_squared_error(
    scored_predictions["forward_return_1m"],
    scored_predictions["historical_avg_prediction"],
)
print(f"MSFE for Historical Avg predictions: {msfe_hist_avg}")

To Do 
- Create model during training period Y1/Y2
- Run model from Y3/Y4 to predict 1M forward returns
- Calculate MSFE for each month for Random Forest Regression 
- Calculate historical average over 24M lookback period 
- Calculate MSFE for each month using historical average over 24M lookback period 

RF Model Improvements 
- Z-score for feature construction and for returns? 
- Read literature on what features should be included 
- Add in volume data
- Hyperparameter tuning 
- Expand dataset assets
- Fundamental data
- Test out version of training individual models 
