In [1]:
# Enable IPython's autoreload extension in mode 2, so edited project modules are
# re-imported before each cell runs and the kernel need not be restarted.
%load_ext autoreload
%autoreload 2

In [2]:

import os, sys

def get_dir_n_levels_up(path, n):
    """Return the directory found by stripping the last component of ``path`` ``n`` times.

    Args:
        path: Starting filesystem path (string).
        n: Number of ``os.path.dirname`` applications to perform; ``0`` (or a
            negative value) leaves ``path`` untouched.

    Returns:
        The resulting ancestor path.
    """
    ancestor = path
    for _level in range(n):
        # Each pass drops one trailing path component.
        ancestor = os.path.dirname(ancestor)
    return ancestor


# Make the project root importable from this notebook.
# A notebook kernel has no usable `__file__`, so the root is located relative to
# the kernel's working directory: it is the parent of that directory. (The former
# `os.path.abspath("__file__")` used a dummy file name inside the working
# directory and went up two levels, which resolves to the same location.)
proj_root = get_dir_n_levels_up(os.path.abspath(os.curdir), 1)
# Guard the append so re-running this cell does not add duplicate entries.
if proj_root not in sys.path:
    sys.path.append(proj_root)

from pathlib import Path
import traceback
from typing import Dict
import logging

from liftoff import parse_opts

from opinion_dqn import AgentDQN
from utils import my_logging
from utils.experiment import seed_everything, create_path_to_experiment_folder, build_environment
from utils.generic import convert_namespace_to_dict
import yaml

In [3]:
# Softmax in the last layer, because it scales with N instead of 2^N, where N is the number of agents.

# Might need to use a policy-gradient method.

# Next step: Q-iteration with an action representation.

In [4]:
# Build an environment only to inspect it: the bare last expression makes the
# notebook display the first entry of the action-space shape.
# NOTE(review): presumably this equals the number of agents — confirm.
train_env = build_environment()
train_env.action_space.shape[0]

4

In [5]:
### Load the experiment configuration ###
# The config lives at <results root>/<experiment>/<run title>/<seed>/cfg.yaml.
# NOTE(review): the results root is an absolute, machine-specific path; consider
# deriving it from `proj_root` or a config-cell constant so the notebook runs elsewhere.
experiment_yaml = "2025Feb03-102156_configs"
yaml_path = Path(
    r"D:\Work\repos\RL\phd-rl-algos\dqn\opinion_dynamics\experiments\results"
) / experiment_yaml / "0000_estimator.args_.lin_hidden_out_size_32" / "0" / "cfg.yaml"

# Read with an explicit encoding so the result does not depend on the platform's
# default locale encoding.
with open(yaml_path, 'r', encoding="utf-8") as file:
    config = yaml.safe_load(file)

# The seed is the last component of the run's output directory. `Path.name`
# (unlike `os.path.basename`) still yields it when the path ends with a separator.
seed = int(Path(config["out_dir"]).name)

seed_everything(seed)

# Log-file location for this run; currently unused because `log_file` is
# commented out in the logger setup below.
logs_file = os.path.join(config["out_dir"], "experiment_log.log")

logger = my_logging.setup_logger(
    name=config["experiment"],
    # log_file=logs_file,
    level=logging.INFO,
)

# Everything after the logger exists runs under try/finally so the logger's
# handlers are cleaned up even when agent construction or training raises
# (otherwise handlers would leak and pile up when the cell is re-run).
try:
    logger.info(f"Starting experiment: {config['full_title']}")

    ### Setup environments ###
    train_env = build_environment()
    validation_env = build_environment()

    ### Setup output and loading paths ###

    # Only resume from earlier outputs when the config asks for it.
    path_previous_experiments_outputs = None
    if "restart_training_timestamp" in config:
        path_previous_experiments_outputs = create_path_to_experiment_folder(
            config,
            config["out_dir"],
            config["restart_training_timestamp"],
        )

    experiment_agent = AgentDQN(
        train_env=train_env,
        validation_env=validation_env,
        experiment_output_folder=config["out_dir"],
        experiment_name=config["experiment"],
        resume_training_path=path_previous_experiments_outputs,
        save_checkpoints=True,
        logger=logger,
        config=config
    )

    logger.info(
        f'Initialized agent with models: {experiment_agent.policy_model}'
    )

    experiment_agent.train(train_epochs=config["epochs_to_train"])

    logger.info(
        f'Finished training experiment: {config["full_title"]}, run_id: {config["run_id"]}'
    )
finally:
    # Always hand the logger back for cleanup, whether or not training succeeded.
    my_logging.cleanup_file_handlers(experiment_logger=logger)


2025-02-08 13:46:05,769 - opinion_agent_dqn - INFO - 1719336276.py:21 - Starting experiment: 2025Feb03-102156_configs_estimator.args_.lin_hidden_out_size=32
2025-02-08 13:46:05,771 - opinion_agent_dqn - INFO - opinion_dqn.py:222 - Loaded configuration settings.
2025-02-08 13:46:06,616 - opinion_agent_dqn - INFO - opinion_dqn.py:281 - Initialized newtworks and optimizer.
2025-02-08 13:46:06,617 - opinion_agent_dqn - INFO - 1719336276.py:48 - Initialized agent with models: OpinionNet(
  (fc): Sequential(
    (0): Linear(in_features=4, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=32, bias=True)
    (3): ReLU()
  )
  (predict_A_b_c): Linear(in_features=32, out_features=18, bias=True)
)
2025-02-08 13:46:06,617 - opinion_agent_dqn - INFO - opinion_dqn.py:464 - Starting training session at: 0
2025-02-08 13:46:06,617 - opinion_agent_dqn - INFO - opinion_dqn.py:505 - Starting training epoch at t = 0
2025-02-08 13:48:32,834 - opinion_agent_dqn - INFO -

  torch.tensor(s, dtype=torch.float32), epsilon=self.validation_epsilon


2025-02-08 13:49:17,411 - opinion_agent_dqn - INFO - opinion_dqn.py:822 - VALIDATION STATS | Max reward: -23.19952324072193 | Avg reward: -23.203239310665236 | Avg frames (episode): 38.0 | Avg max Q: -0.6860038215977795 | Validation epoch time: 0:00:44.557295
2025-02-08 13:49:17,411 - opinion_agent_dqn - INFO - opinion_dqn.py:310 - Saving checkpoint at t = 200000 ...
2025-02-08 13:49:18,248 - opinion_agent_dqn - INFO - opinion_dqn.py:314 - Checkpoint saved at t = 200000
2025-02-08 13:49:18,249 - opinion_agent_dqn - INFO - opinion_dqn.py:490 - Epoch 0 completed in 0:03:11.632195
2025-02-08 13:49:18,250 - opinion_agent_dqn - INFO - opinion_dqn.py:491 - 

2025-02-08 13:49:18,250 - opinion_agent_dqn - INFO - opinion_dqn.py:505 - Starting training epoch at t = 200000
2025-02-08 13:51:52,651 - opinion_agent_dqn - INFO - opinion_dqn.py:642 - TRAINING STATS | Frames seen: 400000 | Episode: 11053 | Max reward: -21.696735465068684 | Avg reward: -22.693724133273868 | Avg frames (episode): 36.7455

In [6]:
# import torch 

# nr_betas = 2
# nr_agents = 3
# batch_size = 2

# # Create a sample A_b_c_net tensor of shape (batch_size, nr_betas, 2 * nr_agents + 1) with unique integer values for visualization
# A_b_c_net = torch.tensor([
#     [[10, 11, 12, 13, 14, 15, 16], 
#      [20, 21, 22, 23, 24, 25, 26]],  # Batch 1
#     [[30, 31, 32, 33, 34, 35, 36], 
#      [40, 41, 42, 43, 44, 45, 46]]   # Batch 2
# ], dtype=torch.float32)

# # Extract components
# c = A_b_c_net[:, :, 0]  # Free term (first column)
# A_diag = torch.exp(A_b_c_net[:, :, 1 : nr_agents + 1])  # Positive definite diagonal (next `nr_agents` columns)
# b = A_b_c_net[:, :, nr_agents + 1 :]  # Bias term (remaining columns)

# # Print extracted values for visualization
# print(nr_betas * (2 * nr_agents + 1))
# print("A_b_c_net:\n", A_b_c_net)
# print("\nExtracted c (free term):\n", c)
# print("\nExtracted A_diag (exponentiated for positivity):\n", A_diag)
# print("\nExtracted b (bias term):\n", b)