In [1]:
import sys
import os

def get_dir_n_levels_up(path, n):
    """
    Return the ancestor of ``path`` obtained by stripping its last component ``n`` times.

    Args:
        path (str): Starting file or directory path.
        n (int): Number of levels to climb; values <= 0 return ``path`` unchanged.

    Returns:
        str: Result of applying ``os.path.dirname`` to ``path`` ``n`` times.
    """
    ancestor = path
    for _level in range(n):
        # Each pass drops the trailing path component.
        ancestor = os.path.dirname(ancestor)
    return ancestor

# Put the project root on sys.path so the project-local imports that follow can resolve.
# NOTE: "__file__" is deliberately a *string literal* here, not the `__file__` variable
# (which a notebook kernel normally does not define). abspath() therefore resolves the
# placeholder name against the current working directory (<cwd>/__file__), and climbing
# 4 levels from that placeholder lands 3 directories above the cwd.
# NOTE(review): this depends on the kernel's cwd being the notebook's folder — confirm.
proj_root = get_dir_n_levels_up(os.path.abspath("__file__"), 4)
if proj_root not in sys.path:
    # Guard keeps repeated executions of this cell from stacking duplicate entries.
    sys.path.append(proj_root)

import yaml
import torch
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from dqn.opinion_dynamics.opinion_dqn import AgentDQN
from dqn.opinion_dynamics.utils.my_logging import setup_logger
from dqn.opinion_dynamics.utils.experiment import build_environment, process_experiment

from dynamic_programming.opinion_dynamics.common.viz import plot_opinions_over_time, visualize_policy_from_env


def instantiate_agent(exp_subdir_path: str, train_env, validation_env) -> AgentDQN:
    """
    Build an AgentDQN from the configuration stored in an experiment subdirectory.

    The configuration is read from ``cfg.yaml`` inside ``exp_subdir_path``. The same
    directory is handed to the agent as ``resume_training_path`` so that it can pick up
    its previously saved state from there.

    Args:
        exp_subdir_path (str): Path to the experiment subdirectory containing ``cfg.yaml``
            and the saved checkpoint files.
        train_env (gym.Env): The training environment instance.
        validation_env (gym.Env): The validation environment instance.

    Returns:
        AgentDQN: Agent constructed from the experiment configuration.

    Raises:
        FileNotFoundError: If ``cfg.yaml`` does not exist in ``exp_subdir_path``.
        KeyError: If the loaded configuration has no ``"experiment"`` entry.
    """
    # Local import so this block does not depend on a notebook-level import.
    import logging

    # The experiment configuration is stored as 'cfg.yaml' in the experiment folder.
    config_path = os.path.join(exp_subdir_path, "cfg.yaml")
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Config file not found at {config_path}")

    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows) can
    # mis-decode or reject non-ASCII characters in the YAML file.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # Reuse the "dqn" logger once it has handlers instead of calling setup_logger() for
    # every agent; repeated setup was observed to make each log line appear several times.
    # NOTE(review): assumes setup_logger(name) configures logging.getLogger(name) — confirm.
    logger = logging.getLogger("dqn")
    if not logger.handlers:
        logger = setup_logger("dqn")

    # resume_training_path points at the experiment folder so the agent loads the
    # weights/stats saved there.
    agent = AgentDQN(
        train_env=train_env,
        validation_env=validation_env,
        resume_training_path=exp_subdir_path,
        experiment_name=config["experiment"],
        config=config,
        save_checkpoints=False,  # evaluation only; flip if checkpoints should be written
        logger=logger,
    )

    return agent

def run_policy_agent(env, agent, max_steps=1000):
    """
    Roll out the agent's policy in ``env`` without exploration and record the trajectory.

    Args:
        env: Environment exposing ``reset()`` -> ``(state, info)``, a ``tau`` attribute and
            ``step(action, tau)`` -> ``(next_state, reward, done, truncated, info)``.
        agent: An already-trained AgentDQN instance; only ``select_action`` is used.
        max_steps: Maximum number of steps to run.

    Returns:
        opinions_over_time: Array of states, starting with the reset state, so it holds
            one more entry than the number of executed steps.
        time_points: Array with the start time of each executed step (0, tau, 2*tau, ...).
        rewards_over_time: Array of rewards collected at each step.
        actions_over_time: Array of actions taken at each step.
    """
    time_points = []
    rewards_over_time = []
    actions_over_time = []
    opinions_over_time = []

    current_time = 0.0
    # Reset environment and store the initial state as the first opinion snapshot.
    state, _ = env.reset()
    opinions_over_time.append(state.copy())

    for step in range(max_steps):
        # Convert state to a batched tensor (batch size = 1).
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        # Exploitation only (epsilon=0, random_action=False). select_action returns a
        # 4-tuple here; only its first element, the action, is used.
        action, _, _, _ = agent.select_action(state_tensor, epsilon=0.0, random_action=False)
        # Drop only the leading batch axis. A blanket np.squeeze would also collapse a
        # single-agent action of shape (1, 1) to a 0-d scalar instead of (n_agents,).
        action = np.asarray(action)
        if action.ndim > 1 and action.shape[0] == 1:
            action = action[0]
        actions_over_time.append(action.copy())

        # Apply the action in the environment for one interval of length env.tau.
        next_state, reward, done, truncated, _ = env.step(action, env.tau)
        opinions_over_time.append(next_state.copy())
        rewards_over_time.append(reward)
        # Time stamp is recorded before advancing the clock, i.e. the step's start time.
        time_points.append(current_time)

        current_time += env.tau
        state = next_state

        if done or truncated:
            print(f"Simulation ended at step {step}: done={done}, truncated={truncated}")
            break

    return (np.array(opinions_over_time),
            np.array(time_points),
            np.array(rewards_over_time),
            np.array(actions_over_time))

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# NOTE: machine-specific absolute path; point this at your own results folder before running.
EXPERIMENTS_ROOT = r"D:\Work\repos\RL\phd-rl-algos\dqn\opinion_dynamics\experiments\results\2025Apr22-220944_configs"
GAMMA = 0.9  # discount factor applied when computing the discounted return of each rollout

# Step 1: Get metadata (seed, noise, etc.) from your existing util.
# The loop below reads the "sub_experiment_path", "seed" and
# "sub_exp_cfg_action_w_noise_amplitude" columns of the returned frame.
metadata_df = process_experiment(EXPERIMENTS_ROOT)

# Step 2: Run agents and gather reward trajectories (one record per time step per run).
all_runs = []

for _, row in metadata_df.iterrows():
    subdir = row["sub_experiment_path"]
    noise = float(row.get("sub_exp_cfg_action_w_noise_amplitude", 0))
    seed = row.get("seed", "unknown")

    try:
        # A fresh environment per run; it serves as both training and validation env
        # because the agent is only evaluated here.
        env = build_environment()
        agent = instantiate_agent(subdir, env, env)
        opinions, times, rewards, actions = run_policy_agent(env, agent, max_steps=100)

        # Discount by step index: sum_t GAMMA^t * r_t over the executed steps.
        discounted_return = sum((GAMMA ** t) * r for t, r in enumerate(rewards))

        for t, r in zip(times, rewards):
            all_runs.append({
                "noise_amplitude": noise,
                "seed": seed,
                "time": t,
                "reward": r,
                "discounted_return": discounted_return,
                "experiment": os.path.basename(subdir),
            })

    except Exception as e:
        # Best effort: report the failing sub-experiment and keep going with the rest.
        print(f"❌ Failed to run experiment at {subdir}: {e}")
        continue

# Step 3: Plot
df = pd.DataFrame(all_runs)
if df.empty:
    # Without this guard an all-failed sweep yields a column-less frame and the
    # df["seed"] lookup below dies with an uninformative KeyError.
    print("No successful runs to plot; check the failure messages above.")
else:
    # Cast facet keys to strings so they are treated as discrete labels.
    df["seed"] = df["seed"].astype(str)
    df["noise_amplitude"] = df["noise_amplitude"].astype(str)

    # One panel per (seed, noise amplitude) pair showing reward against time.
    g = sns.FacetGrid(df, row="seed", col="noise_amplitude", margin_titles=True, sharey=False)
    g.map_dataframe(sns.lineplot, x="time", y="reward")
    g.set_axis_labels("Time", "Reward")
    g.set_titles(row_template="Seed: {row_name}", col_template="Noise: {col_name}")
    plt.tight_layout()
    plt.show()

2025-04-23 22:49:07,666 - dqn - INFO - opinion_dqn.py:226 - Loaded configuration settings.


  checkpoint = torch.load(train_stats_file)


2025-04-23 22:49:08,927 - dqn - INFO - opinion_dqn.py:285 - Initialized newtworks and optimizer.


  checkpoint = torch.load(training_stats_file)


2025-04-23 22:49:10,995 - dqn - INFO - opinion_dqn.py:171 - Loaded previous training status from the following files: {'replay_buffer_file': 'D:\\Work\\repos\\RL\\phd-rl-algos\\dqn\\opinion_dynamics\\experiments\\results\\2025Apr22-220944_configs\\0000_agent_params.args_.action_w_noise_amplitude_0\\0\\opinion_agent_dqn_replay_buffer', 'train_stats_file': 'D:\\Work\\repos\\RL\\phd-rl-algos\\dqn\\opinion_dynamics\\experiments\\results\\2025Apr22-220944_configs\\0000_agent_params.args_.action_w_noise_amplitude_0\\0\\opinion_agent_dqn_train_stats', 'checkpoint_model_file': 'D:\\Work\\repos\\RL\\phd-rl-algos\\dqn\\opinion_dynamics\\experiments\\results\\2025Apr22-220944_configs\\0000_agent_params.args_.action_w_noise_amplitude_0\\0\\model_checkpoints\\mck_20'}
Simulation ended at step 99: done=False, truncated=True
2025-04-23 22:49:11,058 - dqn - INFO - opinion_dqn.py:226 - Loaded configuration settings.
2025-04-23 22:49:11,058 - dqn - INFO - opinion_dqn.py:226 - Loaded configuration settin

  checkpoint = torch.load(models_load_file)


2025-04-23 22:49:12,748 - dqn - INFO - opinion_dqn.py:171 - Loaded previous training status from the following files: {'replay_buffer_file': 'D:\\Work\\repos\\RL\\phd-rl-algos\\dqn\\opinion_dynamics\\experiments\\results\\2025Apr22-220944_configs\\0000_agent_params.args_.action_w_noise_amplitude_0\\0\\opinion_agent_dqn_replay_buffer', 'train_stats_file': 'D:\\Work\\repos\\RL\\phd-rl-algos\\dqn\\opinion_dynamics\\experiments\\results\\2025Apr22-220944_configs\\0000_agent_params.args_.action_w_noise_amplitude_0\\0\\opinion_agent_dqn_train_stats', 'checkpoint_model_file': 'D:\\Work\\repos\\RL\\phd-rl-algos\\dqn\\opinion_dynamics\\experiments\\results\\2025Apr22-220944_configs\\0000_agent_params.args_.action_w_noise_amplitude_0\\0\\model_checkpoints\\mck_20'}
2025-04-23 22:49:12,748 - dqn - INFO - opinion_dqn.py:171 - Loaded previous training status from the following files: {'replay_buffer_file': 'D:\\Work\\repos\\RL\\phd-rl-algos\\dqn\\opinion_dynamics\\experiments\\results\\2025Apr22-22