In [1]:
import sys
from pathlib import Path
# Parent directory of the current working directory; it is appended to the
# import search path (presumably so the `graph_irl` imports below resolve
# when the notebook is run from a sub-directory of the repository).
p = Path('.').absolute().parent
# Use a membership test instead of comparing only against sys.path[-1]:
# if any other entry was appended after `p`, re-running this cell would
# otherwise add a duplicate entry every time.
if str(p) not in sys.path:
    sys.path.append(str(p))

from graph_irl.buffer_v2 import GraphBuffer
from graph_irl.policy import GaussPolicy, TwoStageGaussPolicy, GCN, Qfunc
from graph_irl.graph_rl_utils import GraphEnv
from graph_irl.sac import SACAgentGraph, TEST_OUTPUTS_PATH
from graph_irl.reward import GraphReward

import numpy as np
import torch
# Fix both global RNGs before any random tensor or module is created, so the
# node features and the network initialisations below are repeatable.
np.random.seed(0)
torch.manual_seed(0)


# Problem size: `n_nodes` nodes, each with a 5-dimensional random feature row.
n_nodes = 10
encoder_hiddens = [3, 7]
nodes = torch.randn((n_nodes, 5))


def _new_gcn():
    """Return a fresh GCN built with the shared settings used by every encoder.

    The first argument (5) equals the feature width of `nodes`; presumably it
    is the encoder's input dimension -- confirm against `GCN`.
    """
    return GCN(5, encoder_hiddens, with_layer_norm=True, final_tanh=False)


# Six independent encoder instances, built in this exact order so that the
# seeded RNG is consumed in the same sequence: one for the policy, one per
# Q-function, one per target Q-function, and one for the reward network.
(
    encoder,
    encoderq1,
    encoderq2,
    encoderq1t,
    encoderq2t,
    encoder_reward,
) = [_new_gcn() for _ in range(6)]

# Reward model on top of its own encoder; its second argument is the last
# encoder hidden size and [23, 23] are the hidden sizes passed to GraphReward.
reward_fn = GraphReward(
    encoder_reward,
    encoder_hiddens[-1],
    [23, 23],
    with_layer_norm=True,
)

# Width of the encoder output; used for both observation and action sizes.
# Keyword arguments for GaussPolicy (the policy constructor used by the agent).
gauss_policy_kwargs = {
    'obs_dim': encoder_hiddens[-1],
    'action_dim': encoder_hiddens[-1],
    'hiddens': [13, 13],
    'with_layer_norm': False,
    'encoder': encoder,
    'two_action_vectors': True,
}

# Alternative keyword arguments for TwoStageGaussPolicy. Not referenced by
# the configuration in this notebook; note it shares the same `encoder`
# instance as `gauss_policy_kwargs`.
tsg_policy_kwargs = {
    'obs_dim': encoder_hiddens[-1],
    'action_dim': encoder_hiddens[-1],
    'hiddens1': [13, 13],
    'hiddens2': [11, 11],
    'encoder': encoder,
    'with_layer_norm': True,
}

# Template shared by all four Q-functions; only the encoder differs.
# NOTE(review): the factor 3 presumably covers one observation embedding plus
# two action vectors (two_action_vectors=True above) -- confirm against Qfunc.
qfunc_kwargs = {
    'obs_action_dim': encoder_hiddens[-1] * 3,
    'hiddens': [17, 17],
    'with_layer_norm': True,
    'encoder': None,
}

# Shallow copies of the template, each with its own encoder instance
# (Q1/Q2 and their target counterparts Q1t/Q2t).
Q1_kwargs = {**qfunc_kwargs, 'encoder': encoderq1}
Q2_kwargs = {**qfunc_kwargs, 'encoder': encoderq2}
Q1t_kwargs = {**qfunc_kwargs, 'encoder': encoderq1t}
Q2t_kwargs = {**qfunc_kwargs, 'encoder': encoderq2t}

# Agent-level settings passed to SACAgentGraph: which classes to instantiate,
# which optimisers to use, and the scalar hyper-parameters.
agent_kwargs = {
    'name': 'SACAgentGraph',
    # Constructors (classes, not instances); the agent receives the matching
    # *_kwargs separately through `config`.
    'policy_constructor': GaussPolicy,
    'qfunc_constructor': Qfunc,
    'env_constructor': GraphEnv,
    'buffer_constructor': GraphBuffer,
    # Adam for every optimised component.
    'optimiser_constructors': {
        'policy_optim': torch.optim.Adam,
        'temperature_optim': torch.optim.Adam,
        'Q1_optim': torch.optim.Adam,
        'Q2_optim': torch.optim.Adam,
    },
    # NOTE(review): set to the (positive) encoder output width; many SAC
    # implementations use a negative target entropy -- confirm the sign
    # convention SACAgentGraph expects.
    'entropy_lb': encoder_hiddens[-1],
    'policy_lr': 1e-3,
    'temperature_lr': 1e-3,
    'qfunc_lr': 1e-3,
    'tau': 0.005,
    'discount': 1.,
    'save_to': TEST_OUTPUTS_PATH,
    'cache_best_policy': False,
    'clip_grads': False,
    'UT_trick': False,
    'with_entropy': False,
}

# Per-component configuration handed to SACAgentGraph alongside
# `agent_kwargs`: training loop, Q-functions, policy, buffer and environment.
config = {
    'training_kwargs': {
        'seed': 0,
        'num_iters': 50,
        'num_steps_to_sample': 200,
        'num_grad_steps': 1,
        'batch_size': 100,
        # Evaluation samples as many steps as there are nodes.
        'num_eval_steps_to_sample': n_nodes,
        'min_steps_to_presample': 100,
    },
    'Q1_kwargs': Q1_kwargs,
    'Q2_kwargs': Q2_kwargs,
    'Q1t_kwargs': Q1t_kwargs,
    'Q2t_kwargs': Q2t_kwargs,
    'policy_kwargs': gauss_policy_kwargs,
    'buffer_kwargs': {
        'max_size': 10_000,
        'nodes': nodes,
        'seed': 0,
        'drop_repeats_or_self_loops': True,
        'get_batch_reward': True,
        'graphs_per_batch': 100,
        'action_is_index': True,
        'per_decision_imp_sample': False,
    },
    'env_kwargs': {
        # The environment works on the same node-feature tensor as the buffer.
        'x': nodes,
        'reward_fn': reward_fn,
        # Episode length and expert step count are both tied to the node count.
        'max_episode_steps': n_nodes,
        'num_expert_steps': n_nodes,
        # Integer thirds of the node count (3 when n_nodes == 10).
        'max_repeats': n_nodes // 3,
        'max_self_loops': n_nodes // 3,
        'drop_repeats_or_self_loops': True,
        'id': None,
        'reward_fn_termination': False,
        'calculate_reward': False,
    },
}

# Instantiate the agent from the two keyword dictionaries. Both are unpacked
# directly into the call, so a key present in both would raise a TypeError.
agent = SACAgentGraph(**agent_kwargs, **config)

# Disabled manual check: print the buffer indices, collect one batch of
# paths with the agent, then print the indices, path-length statistics and
# the stored rewards. Uncomment to run.
# print(agent.buffer.idx, agent.buffer.reward_idx)
# agent.buffer.collect_path(agent.env, agent, agent.num_steps_to_sample)
# print(agent.buffer.idx, agent.buffer.reward_idx, 
#       np.mean(agent.buffer.path_lens), 
#       np.max(agent.buffer.path_lens), 
#       agent.buffer.path_lens,
#       agent.buffer.reward_t[:agent.buffer.reward_idx],
#       sep='\n')

pygame 2.5.0 (SDL 2.28.0, Python 3.8.17)
Hello from the pygame community. https://www.pygame.org/contribute.html
/home/mario/coding/urban-nets-style-transfer/tests


In [2]:
# Bare expression: the notebook displays whatever the call returns (the
# recorded output is a 4-tuple of two tensors and two ints).
# NOTE(review): judging by the method name this evaluates one episode with the
# current agent and returns its reward and an importance weight -- confirm
# against GraphBuffer. The recorded second element is ~3e19; if that is an
# importance weight, check it for numerical stability.
agent.buffer.get_single_ep_rewards_and_weights(agent.env, agent)

(tensor(-6.6551, grad_fn=<AddBackward0>), tensor(3.0458e+19), 0, 5)

In [3]:
# Display the buffer's two index attributes as a tuple. The recorded output is
# (0, 0); presumably these are write positions, which would mean the previous
# cell did not store anything in the buffer -- confirm against GraphBuffer.
agent.buffer.idx, agent.buffer.reward_idx

(0, 0)

In [4]:
from graph_irl.irl_trainer import IRLGraphTrainer

# Source node of every expert edge: 0, then each of 1..n_nodes-1 twice, then 0
# again, i.e. [0, 1, 1, 2, 2, ..., n_nodes-1, n_nodes-1, 0].
a = [0] + torch.arange(1, n_nodes).repeat_interleave(2).tolist() + [0]

# Target node = source +1 / -1 alternately, wrapped modulo n_nodes. Together
# with the sources above this gives both directions of every edge of a ring
# over the nodes (0->1, 1->0, 1->2, 2->1, ..., n_nodes-1->0, 0->n_nodes-1).
ring_src = torch.tensor(a, dtype=torch.long)
ring_step = torch.tensor([1, -1]).repeat(len(a) // 2)
ring_dst = (ring_src + ring_step) % n_nodes

# Two rows (sources on top, targets below), one column per edge, dtype long.
expert_edge_index = torch.stack([ring_src, ring_dst])

# Adam over the reward network's parameters with a fixed learning rate.
reward_optim = torch.optim.Adam(reward_fn.parameters(), lr=1e-3)

# Reuse the buffer settings so the trainer and the agent's buffer agree on
# batch size and on the importance-sampling mode.
irl_buffer_kwargs = config['buffer_kwargs']

# Trainer that ties together the reward network, its optimiser, the agent
# and the expert graph (given by `nodes` and `expert_edge_index`).
irl_trainer = IRLGraphTrainer(
    reward_fn=reward_fn,
    reward_optim=reward_optim,
    agent=agent,
    nodes=nodes,
    expert_edge_index=expert_edge_index,
    num_expert_traj=5,
    num_generated_traj=5,
    num_graphs_in_batch=irl_buffer_kwargs['graphs_per_batch'],
    reward_optim_lr_scheduler=None,
    reward_grad_clip=False,
    per_decision_imp_sample=irl_buffer_kwargs['per_decision_imp_sample'],
    match_expert_step_count=True,
    add_expert_to_generated=False,
)

In [5]:
# Train the policy through the IRL trainer with k=1. The recorded progress
# bars show one outer step wrapping 50 inner iterations, which matches
# num_iters=50 in the training config (roughly 27 s in the recorded run).
irl_trainer.train_policy_k_epochs(k=1)

100%|██████████| 50/50 [00:27<00:00,  1.83it/s]
100%|██████████| 1/1 [00:27<00:00, 27.46s/it]


In [5]:
# Display the trainer's average return over generated trajectories. The
# recorded value is a scalar tensor that still carries a grad_fn.
# NOTE(review): presumably computed with the current reward network and the
# policy trained in the previous cell -- confirm against IRLGraphTrainer.
irl_trainer.get_avg_generated_returns()

tensor(-11.5431, grad_fn=<SumBackward0>)