# Automated Graph of Thoughts - Simple PPO Approach
As a first approach with Deep Reinforcement Learning (DRL), a simple PPO agent is trained on lists of fixed cardinality.
The goal of this first DRL approach is to verify that a complex Reinforcement Learning agent is able to learn a task for a given cardinality.


## Provide Required Components with Parameters
Factory functions for the required components are provided.
The experiment is conducted with the following parameters:
- maximum graph depth: $8$
- maximum graph breadth: $8$
- divergence cutoff factor: $0.5$

The model is trained solely on lists of cardinality $16$.
The complexity equals the list cardinality.


In [1]:
from auto_graph_of_thoughts.env import GraphObservationComponent, GraphStepRewardVersion
from auto_graph_of_thoughts.experiment import ExperimentConfiguration, LanguageModelSimulationType, Experiment
from auto_graph_of_thoughts.tasks.sum_list import sum_list_task

# Seed shared by the experiment configuration, the vectorized environments and the PPO model.
SEED = 0

# Switches that gate the training cell and the agent evaluation cell.
ENABLE_TRAINING = True
ENABLE_EVALUATION = True

# Number of environments in the vectorized training environment.
N_VEC_ENVS = 8

# List cardinalities (complexities) used for both training and evaluation.
COMPLEXITIES = [16]

# Number of episodes used when evaluating the policy and the agent.
EVAL_N_EPISODES = 100

config = ExperimentConfiguration(
    seed=SEED,
    task=sum_list_task,
    max_steps=20,
    # Components of the graph observation that are exposed to the agent.
    observation_filter={
        GraphObservationComponent.DEPTH,
        GraphObservationComponent.BREADTH,
        GraphObservationComponent.COMPLEXITY,
        GraphObservationComponent.PREV_ACTIONS,
        GraphObservationComponent.GRAPH_OPERATIONS,
        GraphObservationComponent.LOCAL_COMPLEXITY,
        GraphObservationComponent.PREV_SCORE,
    },
    max_depth=8,
    max_breadth=8,
    divergence_cutoff_factor=0.5,
    train_complexities=COMPLEXITIES,
    eval_complexities=COMPLEXITIES,
    # The maximum complexity is derived from the configured complexities.
    max_complexity=max(COMPLEXITIES),
    max_operations=32,
    lm_simulation_type=LanguageModelSimulationType.REALISTIC,
    reward_version=GraphStepRewardVersion.V5,
)
experiment = Experiment(config)

In [2]:
from stable_baselines3.common.utils import set_random_seed
# Seed the random generators; the second argument is passed by keyword to make the flag explicit.
set_random_seed(SEED, using_cuda=True)

## Utilities

In [3]:
from pure_graph_of_thoughts.api.schema import JsonSchemaEncoder
import json
import os
from auto_graph_of_thoughts.experiment.agent_evaluation_summary import AgentEvaluationSummary

# Directory (relative to the working directory) in which evaluation summaries are stored as JSON files.
results_directory = './artifacts/results/agent_evaluations'

def store_evaluation_summary(evaluation_summary: AgentEvaluationSummary) -> None:
    """
    Writes an evaluation summary as a JSON file into the results directory.
    The file is named after the summary; missing parent directories are created.
    :param evaluation_summary: evaluation summary to store
    """
    target_path = f'{results_directory}/{evaluation_summary.name}.json'
    target_directory = os.path.dirname(target_path)
    os.makedirs(target_directory, exist_ok=True)
    with open(target_path, 'w', encoding='utf-8') as summary_file:
        json.dump(
            evaluation_summary,
            summary_file,
            cls=JsonSchemaEncoder,
            ensure_ascii=False,
            indent=2,
        )

def load_evaluation_summary(name: str) -> AgentEvaluationSummary:
    """
    Reads an evaluation summary from its JSON file in the results directory.
    :param name: name of the evaluation summary (file name without the ``.json`` extension)
    :return: loaded evaluation summary
    """
    source_path = f'{results_directory}/{name}.json'
    with open(source_path, 'r', encoding='utf-8') as summary_file:
        raw_summary = json.load(summary_file)
    return AgentEvaluationSummary.from_dict(raw_summary)


## Train PPO Model
The default PPO model is trained with a vectorized environment (number of environments: `8`).
The number of total time steps is set to $2^{18}$ ($262'144$).

In [4]:
# Name of the PPO run; reused as TensorBoard log name, model file name and evaluation summary name.
model_ppo_name = 'ppo_r5_64x64_c16_t2xx18_lrfix'

In [5]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

if ENABLE_TRAINING:
    # Vectorized training environment built from the experiment's filtered train environment factory.
    vec_env = make_vec_env(
        experiment.create_filtered_train_env,
        n_envs=N_VEC_ENVS,
        seed=SEED,
    )
    model_ppo = PPO(
        'MultiInputPolicy',
        vec_env,
        seed=SEED,
        verbose=1,
        tensorboard_log='./artifacts/tensorboard',
    )
    # Train for 2^18 total time steps, logging under the model name.
    model_ppo.learn(total_timesteps=2 ** 18, tb_log_name=model_ppo_name)

    # Quick sanity check of the trained policy on the model's own environment.
    mean_reward, std_reward = evaluate_policy(
        model_ppo,
        model_ppo.get_env(),
        n_eval_episodes=EVAL_N_EPISODES,
    )
    print(f"Mean reward: {mean_reward} +/- {std_reward}")

    # Persist the model so that later cells can load it without retraining.
    model_ppo.save(f'./artifacts/models/{model_ppo_name}')

Using cuda device
Logging to ./artifacts/tensorboard\ppo_r5_64x64_c16_t2xx18_lrfix_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5.37     |
|    ep_rew_mean     | -0.194   |
| time/              |          |
|    fps             | 624      |
|    iterations      | 1        |
|    time_elapsed    | 26       |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.69        |
|    ep_rew_mean          | -0.104      |
| time/                   |             |
|    fps                  | 405         |
|    iterations           | 2           |
|    time_elapsed         | 80          |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.021467399 |
|    clip_fraction        | 0.335       |
|    clip_range           | 0.2         |
|    entropy_loss         | -

## Evaluate PPO Model
The trained PPO model is evaluated on $100$ time steps.

### Run Episodes for PPO

In [6]:
unwrapped_env, filtered_env = experiment.created_eval_env_tuple()
# Load the stored model so that this cell also works when training is disabled.
model_ppo = PPO.load(f'./artifacts/models/{model_ppo_name}')
obs, info = filtered_env.reset(seed=SEED)
for step_index in range(100):
    action, _states = model_ppo.predict(obs)
    # Decode before stepping so the printed action matches the one that is applied.
    decoded_action = filtered_env.decode_action(action)
    obs, rewards, terminated, truncated, info = filtered_env.step(action)

    operation = decoded_action.operation
    operation_name = None if operation is None else operation.name
    print(f'action: {decoded_action.type.name}-{operation_name} = {float(rewards)}')

    if not (terminated or truncated):
        continue
    # The episode is over: start a new one and mark the boundary in the output.
    obs, info = filtered_env.reset()
    print('Episode end')

action: APPEND_OPERATION-split = 0.0875
action: APPEND_OPERATION-sum = 0.075
action: APPEND_OPERATION-sum = 0.0625
action: APPEND_OPERATION-merge = 0.05
action: APPEND_OPERATION-sum = 0.0375
action: APPEND_OPERATION-sum = 0.025
action: STOP-None = 1.0
Episode end
action: APPEND_OPERATION-split = 0.0875
action: APPEND_OPERATION-sum = 0.075
action: APPEND_OPERATION-sum = 0.0625
action: APPEND_OPERATION-sum = 0.05
action: APPEND_OPERATION-merge = 0.0375
action: APPEND_OPERATION-sum = 0.025
action: APPEND_OPERATION-sum = 0.0125
action: APPEND_OPERATION-sum = 0.0
action: STOP-None = 1.0
Episode end
action: APPEND_OPERATION-split = 0.0875
action: APPEND_OPERATION-sum = 0.075
action: APPEND_OPERATION-sum = 0.0625
action: APPEND_OPERATION-sum = 0.05
action: APPEND_OPERATION-merge = 0.0375
action: APPEND_OPERATION-sum = 0.025
action: APPEND_OPERATION-sum = 0.0125
action: STOP-None = 1.0
Episode end
action: APPEND_OPERATION-split = 0.0875
action: APPEND_OPERATION-sum = 0.075
action: APPEND_OPERA

### Run Agent Evaluation for PPO

In [7]:
from auto_graph_of_thoughts.experiment.evaluate_agent import evaluate_agent

if ENABLE_EVALUATION:
    def _predict_ppo_action(obs):
        """Returns the action predicted by the PPO model for the given observation."""
        predicted_action, _state = model_ppo.predict(obs)
        return predicted_action

    evaluation_ppo = evaluate_agent(
        experiment,
        model_ppo_name,
        EVAL_N_EPISODES,
        _predict_ppo_action,
    )
    store_evaluation_summary(evaluation_ppo.summary)

# Always read the summary back from file, so stored results are shown even when evaluation is disabled.
evaluation_ppo_summary = load_evaluation_summary(model_ppo_name)
(
    evaluation_ppo_summary.solved_rate_train_complexities,
    evaluation_ppo_summary.solved_rate_eval_complexities,
)

(1.0, 1.0)

In [8]:
# Display the `avg_n_operations_per_complexity` attribute of the loaded summary as the cell output.
evaluation_ppo_summary.avg_n_operations_per_complexity

{16: 8.68}

The agent is able to solve the task.