In [1]:
import sys
import os

# Ensure Python finds config.py
sys.path.append(os.path.abspath(".."))  # Moves up one level to find config.py

In [2]:
import os
import gym
import numpy as np
import pandas as pd
import pickle
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback

# --- Import local config ---
from config import MODELS_DIR, PROCESSED_DATA_DIR, LOGS_DIR

# Define the CentralBankEnv environment
class CentralBankEnv(gym.Env):
    def __init__(self, varmax_model_path, train_data_path, optimal_lag, episode_length=60):
        super().__init__()

        with open(varmax_model_path, "rb") as f:
            self.varmax_model = pickle.load(f)

        self.train_data = pd.read_csv(train_data_path, index_col="date", parse_dates=True)

        self.exogenous_vars = ["FEDFUNDS"]
        self.endogenous_vars = [col for col in self.train_data.columns if col not in self.exogenous_vars]

        if len(self.endogenous_vars) < 1:
            raise ValueError("Dataset must contain at least one endogenous variable!")

        self.optimal_lag = optimal_lag
        self.episode_length = episode_length

        num_features = len(self.endogenous_vars) + len(self.exogenous_vars)
        self.expected_shape = (self.optimal_lag, num_features)

        self.action_mapping = np.array([-0.50, -0.25, 0, 0.25, 0.50])
        self.action_space = spaces.Discrete(len(self.action_mapping))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=self.expected_shape, dtype=np.float32)

        self.ffr_min, self.ffr_max = self.train_data["FEDFUNDS"].min(), self.train_data["FEDFUNDS"].max()

        if "CPIAUCSL" in self.endogenous_vars:
            self.inflation_idx = self.endogenous_vars.index("CPIAUCSL")
        else:
            raise KeyError("CPICPIAUCSLnot found in dataset!")

        self.steps = 0

        # Reward logging
        self.episode_rewards = []  # List to store total rewards per episode
        self.current_episode_rewards = 0

    def reset(self):
        """Resets the environment and logs the previous episode rewards."""
        # Log previous episode if not the very first call to reset
        if self.steps > 0:
            self.episode_rewards.append(self.current_episode_rewards)
            # Note: The callback below will also print these, so you can remove this print if you prefer.
            print(f"Episode {len(self.episode_rewards)} completed. Total Reward: {self.current_episode_rewards:.3f}")

        self.steps = 0
        self.current_episode_rewards = 0

        max_start_idx = len(self.train_data) - self.optimal_lag - 1
        if max_start_idx <= 0:
            raise ValueError(
                f"Not enough train data! Dataset size: {len(self.train_data)}, required minimum: {self.optimal_lag + 1}"
            )

        start_idx = np.random.randint(0, max_start_idx)
        self.history = self.train_data.iloc[start_idx : start_idx + self.optimal_lag][
            self.endogenous_vars + self.exogenous_vars
        ].to_numpy(copy=False)
        self.last_ffr = self.history[-1, -1]

        return self.history

    def step(self, action):
        """Executes an action, updates the state, logs rewards, and returns the new state."""
        ffr_change = self.action_mapping[action]
        new_ffr = np.clip(self.last_ffr + ffr_change, self.ffr_min, self.ffr_max)

        # Forecast one step using the new FFR as the exogenous input
        next_state_df = self.varmax_model.forecast(steps=1, exog=np.array([[new_ffr]]))
        next_state = next_state_df[self.endogenous_vars].iloc[0].values.reshape(1, -1)

        new_row = np.hstack([next_state, np.array([[new_ffr]])])
        self.history = np.vstack([self.history[1:], new_row])
        self.last_ffr = new_ffr

        if self.history.shape != self.expected_shape:
            raise ValueError(
                f"History shape mismatch: Expected {self.expected_shape}, got {self.history.shape}"
            )

        reward = self._reward(next_state.flatten(), action)
        self.current_episode_rewards += reward

        self.steps += 1
        done = self.steps >= self.episode_length

        return self.history, reward, done, {}

    def _reward(self, next_state, action):
        """Computes reward based on macroeconomic stability and policy smoothness."""
        inflation = next_state[self.inflation_idx]
        pi_target = 0.02
        alpha, lamb = 0.5, 0.1
        return - (alpha * (inflation - pi_target) ** 2 + lamb * (action ** 2))

    def get_episode_rewards(self):
        """Returns the list of total rewards for completed episodes."""
        return self.episode_rewards

# Define a custom callback to print episode rewards as training progresses
class EpisodeRewardLoggingCallback(BaseCallback):
    def __init__(self, verbose=1):
        super(EpisodeRewardLoggingCallback, self).__init__(verbose)
        self.last_logged = 0

    def _on_step(self) -> bool:
        # Retrieve episode rewards from the training environment (list per env)
        episode_rewards = self.training_env.get_attr("episode_rewards")
        if episode_rewards and episode_rewards[0]:
            rewards = episode_rewards[0]
            # If there are new episodes completed, print their rewards
            while self.last_logged < len(rewards):
                print(f"Episode {self.last_logged + 1} completed with total reward: {rewards[self.last_logged]:.3f}")
                self.last_logged += 1
        return True





In [5]:
# Build file paths using config values
varmax_model_path = os.path.join(MODELS_DIR, "varmax_model.pkl")
rl_train_data_path = os.path.join(PROCESSED_DATA_DIR, "rl_train_data.csv")
model_path = os.path.join(MODELS_DIR, "ppo_central_bank")
reward_log_path = os.path.join(LOGS_DIR, "rl_rewards.csv")

def make_env():
    return CentralBankEnv(
        varmax_model_path=varmax_model_path,
        train_data_path=rl_train_data_path,
        optimal_lag=4,
        episode_length=60
    )

# Wrap the environment in DummyVecEnv 
env = DummyVecEnv([make_env])

# Create the callback instance for logging episode rewards
episode_reward_callback = EpisodeRewardLoggingCallback(verbose=1)

# Define the RL model (PPO) and start training with the callback
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=os.path.join(LOGS_DIR, "ppo_central_bank"))
model.learn(total_timesteps=100_000, callback=episode_reward_callback)

# Save the trained model
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)
print(f"RL Model saved at: {model_path}")

# Save the logged episode rewards to a CSV file
env_rewards = env.get_attr("episode_rewards")[0]  # get_attr returns a list per env
pd.DataFrame(env_rewards, columns=["Total Reward"]).to_csv(reward_log_path, index=False)
print(f"Episode rewards logged at: {reward_log_path}")


Using cpu device
Logging to /Users/nim/Documents/Research/central_bank_rl/logs/ppo_central_bank/PPO_4




Episode 1 completed. Total Reward: -49.055
Episode 1 completed with total reward: -49.055
Episode 2 completed. Total Reward: -56.608
Episode 2 completed with total reward: -56.608
Episode 3 completed. Total Reward: -51.237
Episode 3 completed with total reward: -51.237
Episode 4 completed. Total Reward: -44.141
Episode 4 completed with total reward: -44.141
Episode 5 completed. Total Reward: -53.230
Episode 5 completed with total reward: -53.230
Episode 6 completed. Total Reward: -56.099
Episode 6 completed with total reward: -56.099
Episode 7 completed. Total Reward: -55.834
Episode 7 completed with total reward: -55.834
Episode 8 completed. Total Reward: -49.855
Episode 8 completed with total reward: -49.855
Episode 9 completed. Total Reward: -55.698
Episode 9 completed with total reward: -55.698
Episode 10 completed. Total Reward: -54.038
Episode 10 completed with total reward: -54.038
Episode 11 completed. Total Reward: -48.540
Episode 11 completed with total reward: -48.540
Episod