# Automated Graph of Thoughts - A2C Model Experiments

In [1]:
from auto_graph_of_thoughts.env import GraphObservationComponent, GraphStepRewardVersion
from auto_graph_of_thoughts.experiment import ExperimentConfiguration, LanguageModelSimulationType, Experiment
from auto_graph_of_thoughts.tasks.sum_list import sum_list_task

# Seed handed to the experiment configuration.
SEED = 0

# Switches gating the training and the agent evaluation steps of this notebook.
ENABLE_TRAINING, ENABLE_EVALUATION = True, True

# Number of environments requested from make_vec_env for training.
N_VEC_ENVS = 8

# Complexities 1..32 are used for training, 1..64 for evaluation (both inclusive).
TRAIN_COMPLEXITIES = list(range(1, 32 + 1))
EVAL_COMPLEXITIES = list(range(1, 64 + 1))

# Episode count passed to evaluate_policy and evaluate_agent.
EVAL_N_EPISODES = 100

# Shared experiment setup for all A2C variants in this notebook.
config = ExperimentConfiguration(
    # Task and reproducibility
    task=sum_list_task,
    seed=SEED,
    # Simulation and reward
    lm_simulation_type=LanguageModelSimulationType.REALISTIC,
    reward_version=GraphStepRewardVersion.V4,
    # Limits
    max_steps=20,
    max_depth=8,
    max_breadth=8,
    max_operations=32,
    divergence_cutoff_factor=0.5,
    # Complexities; the maximum is derived from the evaluation range
    train_complexities=TRAIN_COMPLEXITIES,
    eval_complexities=EVAL_COMPLEXITIES,
    max_complexity=max(EVAL_COMPLEXITIES),
    # Observation components passed as the observation filter
    observation_filter={
        GraphObservationComponent.depth,
        GraphObservationComponent.breadth,
        GraphObservationComponent.complexity,
        GraphObservationComponent.local_complexity,
        GraphObservationComponent.graph_operations,
        GraphObservationComponent.prev_actions,
        GraphObservationComponent.prev_score,
    },
)
experiment = Experiment(config)

## Utilities

In [2]:
from pure_graph_of_thoughts.api.schema import JsonSchemaEncoder
import json
import os
from auto_graph_of_thoughts.experiment.agent_evaluation_summary import AgentEvaluationSummary

results_directory = './artifacts/results/agent_evaluations'

def store_evaluation_summary(evaluation_summary: AgentEvaluationSummary) -> None:
    """
    Writes an evaluation summary as JSON into the results directory.
    The file is named after the summary; missing directories are created.
    :param evaluation_summary: evaluation summary to store
    """
    target_path = f'{results_directory}/{evaluation_summary.name}.json'
    target_directory = os.path.dirname(target_path)
    os.makedirs(target_directory, exist_ok=True)
    with open(target_path, 'w', encoding='utf-8') as summary_file:
        json.dump(
            evaluation_summary,
            summary_file,
            cls=JsonSchemaEncoder,
            ensure_ascii=False,
            indent=2,
        )

def load_evaluation_summary(name: str) -> AgentEvaluationSummary:
    """
    Reads an evaluation summary from its JSON file in the results directory.
    :param name: name of the evaluation summary (file name without extension)
    :return: evaluation summary built from the file content
    """
    source_path = f'{results_directory}/{name}.json'
    with open(source_path, 'r', encoding='utf-8') as summary_file:
        raw_summary = json.load(summary_file)
    return AgentEvaluationSummary.from_dict(raw_summary)

In [3]:
from typing import Mapping
import plotly.express as px
import pandas as pd

COLOR_VALID = '#6acc64'
def visualize_solved_rate(name: str, solved_rate: Mapping[int, float]) -> None:
    """
    Shows a bar chart of the solved rate per list cardinality.
    :param name: name shown in the chart title
    :param solved_rate: mapping whose keys are plotted as cardinality and whose values are plotted as solved rate
    """
    axis_labels = {
        'cardinality': 'list cardinality',
        'solved_rate': 'solved tasks rate'
    }
    rows = list(solved_rate.items())
    solved_rate_frame = pd.DataFrame(rows, columns=['cardinality', 'solved_rate'])
    chart = px.bar(
        solved_rate_frame,
        x='cardinality',
        y='solved_rate',
        title=f'Agent Evaluation Results for {name}',
        template='simple_white',
        labels=axis_labels,
        height=400,
    )
    # One tick per cardinality, all bars in the same color.
    chart.update_traces(marker_color=COLOR_VALID)
    chart.update_xaxes(dtick=1)
    chart.show()

## Default Network Architecture

In [4]:
# Name of the default A2C model; used as TensorBoard run name, model file name and evaluation name.
# NOTE(review): the parts presumably encode reward version 4, a 64x64 network, complexities 1 to 32,
# 2**18 timesteps and a fixed learning rate - confirm the naming convention.
model_a2c_name_default = 'a2c_r4_64x64_c1to32_t2xx18_lrfix'

### Train

In [5]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env

# Build the vectorized training environment and an A2C model with a [64, 64] network architecture.
vec_env = make_vec_env(experiment.create_filtered_train_env, n_envs=N_VEC_ENVS)
policy_kwargs = {'net_arch': [64, 64]}
model_a2c_default = A2C(
    'MultiInputPolicy',
    vec_env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log='./artifacts/tensorboard',
)
# Train, report the reward on the training environment and save the model - only when enabled.
if ENABLE_TRAINING:
    model_a2c_default.learn(
        total_timesteps=2 ** 18,
        tb_log_name=model_a2c_name_default,
    )
    mean_reward, std_reward = evaluate_policy(
        model_a2c_default,
        model_a2c_default.get_env(),
        n_eval_episodes=EVAL_N_EPISODES,
    )
    print(f"Mean reward: {mean_reward} +/- {std_reward}")
    model_a2c_default.save(f'./artifacts/models/{model_a2c_name_default}')

### Evaluate

In [6]:
from stable_baselines3 import A2C
# Load the model from the path the training step saves to; this rebinds model_a2c_default.
model_a2c_default = A2C.load(f'./artifacts/models/{model_a2c_name_default}')

In [7]:
# Run the default model for 100 steps on an evaluation environment and print every step.
env, filtered_env = experiment.created_eval_env_tuple()
obs, info = filtered_env.reset()
print(f'Complexity: {env.local_complexity}')
for i in range(100):
    action, _states = model_a2c_default.predict(obs)
    decoded_action = env.decode_action(action)
    obs, rewards, terminated, truncated, info = filtered_env.step(action)
    local_complexity = env.local_complexity
    # Actions without an operation are printed with None as operation name.
    operation_name = None if decoded_action.operation is None else decoded_action.operation.name
    print(
        f'local complexity: {local_complexity} '
        f'action: {decoded_action.type.name}-{operation_name} '
        f'= {float(rewards)}'
    )
    if not (terminated or truncated):
        continue
    # Episode is over: reset and print the complexity after the reset.
    obs, info = filtered_env.reset()
    print(f'Complexity: {env.local_complexity}')

In [8]:
from auto_graph_of_thoughts.experiment.evaluate_agent import evaluate_agent

# Evaluate the default model and store the summary - only when evaluation is enabled.
if ENABLE_EVALUATION:
    evaluation_a2c = evaluate_agent(
        experiment,
        model_a2c_name_default,
        EVAL_N_EPISODES,
        lambda obs: model_a2c_default.predict(obs)[0],
    )
    store_evaluation_summary(evaluation_a2c.summary)

# The summary is always read back from the stored file, also when the evaluation was skipped.
evaluation_a2c_summary = load_evaluation_summary(model_a2c_name_default)
(
    evaluation_a2c_summary.solved_rate_train_complexities,
    evaluation_a2c_summary.solved_rate_eval_complexities,
)

In [9]:
# Bar chart of the solved rate per complexity for the default model, taken from the loaded summary.
visualize_solved_rate(model_a2c_name_default, evaluation_a2c_summary.solved_rate_per_complexity)

## Network Architecture 32x32

In [10]:
# Name of the 32x32 A2C model; used as TensorBoard run name, model file name and evaluation name.
# NOTE(review): same naming scheme as the other model names, with 32x32 presumably denoting
# the network architecture - confirm the naming convention.
model_a2c_name_32x32 = 'a2c_r4_32x32_c1to32_t2xx18_lrfix'

### Train

In [11]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env

# Build the vectorized training environment and an A2C model with a [32, 32] network architecture.
vec_env = make_vec_env(experiment.create_filtered_train_env, n_envs=N_VEC_ENVS)
policy_kwargs = {'net_arch': [32, 32]}
model_a2c_32x32 = A2C(
    'MultiInputPolicy',
    vec_env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log='./artifacts/tensorboard',
)

# Train, report the reward on the training environment and save the model - only when enabled.
if ENABLE_TRAINING:
    model_a2c_32x32.learn(
        total_timesteps=2 ** 18,
        tb_log_name=model_a2c_name_32x32,
    )
    mean_reward, std_reward = evaluate_policy(
        model_a2c_32x32,
        model_a2c_32x32.get_env(),
        n_eval_episodes=EVAL_N_EPISODES,
    )
    print(f'Mean reward: {mean_reward} +/- {std_reward}')
    model_a2c_32x32.save(f'./artifacts/models/{model_a2c_name_32x32}')

### Evaluate

In [12]:
from stable_baselines3 import A2C
# Load the model from the path the training step saves to; this rebinds model_a2c_32x32.
model_a2c_32x32 = A2C.load(f'./artifacts/models/{model_a2c_name_32x32}')

In [13]:
# Run the 32x32 model for 100 steps on an evaluation environment and print every step.
env, filtered_env = experiment.created_eval_env_tuple()
obs, info = filtered_env.reset()
print(f'Complexity: {env.local_complexity}')
for i in range(100):
    action, _states = model_a2c_32x32.predict(obs)
    decoded_action = env.decode_action(action)
    obs, rewards, terminated, truncated, info = filtered_env.step(action)
    local_complexity = env.local_complexity
    # Actions without an operation are printed with None as operation name.
    operation_name = None if decoded_action.operation is None else decoded_action.operation.name
    print(
        f'local complexity: {local_complexity} '
        f'action: {decoded_action.type.name}-{operation_name} '
        f'= {float(rewards)}'
    )
    if not (terminated or truncated):
        continue
    # Episode is over: reset and print the complexity after the reset.
    obs, info = filtered_env.reset()
    print(f'Complexity: {env.local_complexity}')

In [14]:
from auto_graph_of_thoughts.experiment.evaluate_agent import evaluate_agent

# Evaluate the 32x32 model and store the summary - only when evaluation is enabled.
if ENABLE_EVALUATION:
    evaluation_a2c_32x32 = evaluate_agent(
        experiment,
        model_a2c_name_32x32,
        EVAL_N_EPISODES,
        lambda obs: model_a2c_32x32.predict(obs)[0],
    )
    store_evaluation_summary(evaluation_a2c_32x32.summary)

# The summary is always read back from the stored file, also when the evaluation was skipped.
evaluation_a2c_32x32_summary = load_evaluation_summary(model_a2c_name_32x32)
(
    evaluation_a2c_32x32_summary.solved_rate_train_complexities,
    evaluation_a2c_32x32_summary.solved_rate_eval_complexities,
)

In [15]:
# Bar chart of the solved rate per complexity for the 32x32 model, taken from the loaded summary.
visualize_solved_rate(model_a2c_name_32x32, evaluation_a2c_32x32_summary.solved_rate_per_complexity)

## Linear Learning Rate Decrease

In [16]:
# Name of the A2C model trained with the linear learning rate schedule; used as TensorBoard run name,
# model file name and evaluation name.
# NOTE(review): same naming scheme as the other model names, with lrlin presumably denoting
# the linear learning rate decrease - confirm the naming convention.
model_a2c_name_lrlin = 'a2c_r4_64x64_c1to32_t2xx18_lrlin'

### Train

In [17]:
from typing import Callable

def learning_rate_linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Creates a learning rate schedule that is proportional to the remaining progress.
    :param initial_value: learning rate returned for a remaining progress of 1
    :return: function mapping the remaining progress to the learning rate
    """
    def schedule(progress_remaining: float) -> float:
        return initial_value * progress_remaining

    return schedule

In [18]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env

# Build the vectorized training environment and an A2C model with a [64, 64] network architecture
# whose learning rate follows the linear schedule starting at 0.01.
vec_env = make_vec_env(experiment.create_filtered_train_env, n_envs=N_VEC_ENVS)
policy_kwargs = {'net_arch': [64, 64]}
model_a2c_lrlin = A2C(
    'MultiInputPolicy',
    vec_env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    learning_rate=learning_rate_linear_schedule(0.01),
    tensorboard_log='./artifacts/tensorboard',
)
# Train, report the reward on the training environment and save the model - only when enabled.
if ENABLE_TRAINING:
    model_a2c_lrlin.learn(
        total_timesteps=2 ** 18,
        tb_log_name=model_a2c_name_lrlin,
    )
    mean_reward, std_reward = evaluate_policy(
        model_a2c_lrlin,
        model_a2c_lrlin.get_env(),
        n_eval_episodes=EVAL_N_EPISODES,
    )
    print(f'Mean reward: {mean_reward} +/- {std_reward}')
    model_a2c_lrlin.save(f'./artifacts/models/{model_a2c_name_lrlin}')

In [19]:
# Run the linear learning rate model for 100 steps on an evaluation environment and print every step.
env, filtered_env = experiment.created_eval_env_tuple()
obs, info = filtered_env.reset()
print(f'Complexity: {env.local_complexity}')
for i in range(100):
    action, _states = model_a2c_lrlin.predict(obs)
    decoded_action = env.decode_action(action)
    obs, rewards, terminated, truncated, info = filtered_env.step(action)
    local_complexity = env.local_complexity
    # Actions without an operation are printed with None as operation name.
    operation_name = None if decoded_action.operation is None else decoded_action.operation.name
    print(
        f'local complexity: {local_complexity} '
        f'action: {decoded_action.type.name}-{operation_name} '
        f'= {float(rewards)}'
    )
    if not (terminated or truncated):
        continue
    # Episode is over: reset and print the complexity after the reset.
    obs, info = filtered_env.reset()
    print(f'Complexity: {env.local_complexity}')

### Evaluate

In [20]:
from stable_baselines3 import A2C
# Load the model from the path the training step saves to; this rebinds model_a2c_lrlin.
model_a2c_lrlin = A2C.load(f'./artifacts/models/{model_a2c_name_lrlin}')

In [21]:
from auto_graph_of_thoughts.experiment.evaluate_agent import evaluate_agent

# Evaluate the linear learning rate model and store the summary - only when evaluation is enabled.
if ENABLE_EVALUATION:
    evaluation_a2c_lrlin = evaluate_agent(
        experiment,
        model_a2c_name_lrlin,
        EVAL_N_EPISODES,
        lambda obs: model_a2c_lrlin.predict(obs)[0],
    )
    store_evaluation_summary(evaluation_a2c_lrlin.summary)

# The summary is always read back from the stored file, also when the evaluation was skipped.
evaluation_a2c_lrlin_summary = load_evaluation_summary(model_a2c_name_lrlin)
(
    evaluation_a2c_lrlin_summary.solved_rate_train_complexities,
    evaluation_a2c_lrlin_summary.solved_rate_eval_complexities,
)

In [22]:
# Bar chart of the solved rate per complexity for the linear learning rate model, taken from the loaded summary.
visualize_solved_rate(model_a2c_name_lrlin, evaluation_a2c_lrlin_summary.solved_rate_per_complexity)