In [1]:
%load_ext autoreload
%autoreload 2

import gymnasium as gym


from vi_ppo.actor_critic import ActorCritic
from vi_ppo.nets.mlp import Mlp
from vi_ppo.modules import GymnasiumModule
import lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger


In [2]:
# Train a small PPO actor-critic on LunarLander-v3 and log to TensorBoard.

# --- Tunable settings ---------------------------------------------------------
SEED = 42             # global RNG seed for repeatable runs
LEARNING_RATE = 3e-4  # learning rate handed to GymnasiumModule's config
MAX_EPOCHS = 25       # number of Lightning training epochs
hidden_dims = 16      # width of the hidden layer(s) in the actor/critic MLPs

# Seed the Python, NumPy and torch global RNGs so that torch-based randomness
# (e.g. weight initialisation) is repeatable across Restart & Run All.
# NOTE(review): the environment's own RNG is only seeded through
# `env.reset(seed=...)`; whether GymnasiumModule does that is not visible
# from this notebook -- confirm if full reproducibility is required.
pl.seed_everything(SEED)

# Initialise the environment
env = gym.make("LunarLander-v3")

# make the actor critic model
d = env.observation_space.shape[0]  # size of the observation vector
n_a = env.action_space.n            # number of discrete actions

# Optional feature-extractor variant, currently disabled:
# feature_config = Mlp.config_cls(
#     input_dims=d,
#     output_dims=hidden_dims,
#     hidden_dims=hidden_dims,
#     n_layers=3,
#     activation="silu",
# )

# Actor: observation -> one output per discrete action.
actor_config = Mlp.config_cls(
    input_dims=d,
    output_dims=n_a,
    hidden_dims=hidden_dims,
    n_layers=1,
    activation="silu",
)
# Critic: observation -> a single scalar output.
critic_config = Mlp.config_cls(
    input_dims=d,
    output_dims=1,
    hidden_dims=hidden_dims,
    n_layers=1,
    activation="silu",
)
ac_config = ActorCritic.config_cls(
    clip_epsilon=0.2,
    value_coeff=0.5,
    entropy_coeff=0.01,
)

model = ActorCritic(
    ac_config,
    actor_net=Mlp(actor_config),
    critic=Mlp(critic_config),
    # feature_extractor=Mlp(feature_config)
)

config = GymnasiumModule.config_class(lr=LEARNING_RATE)
module = GymnasiumModule(actor_critic=model, env=env, config=config)

# NOTE(review): `TensorBoardLogger` is imported from `pytorch_lightning` while
# `Trainer` comes from `lightning`; consider importing both from the same
# package to avoid subtle incompatibilities between the two distributions.
logger = TensorBoardLogger("../lightning_logs", name="lunar_lander")
trainer = pl.Trainer(max_epochs=MAX_EPOCHS, logger=logger)

trainer.fit(module)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type        | Params | Mode 
-----------------------------------------------------
0 | actor_critic | ActorCritic | 917    | train
-----------------------------------------------------
917       Trainable params
0         Non-trainable params
917       Total params
0.004     Total estimated model params size (MB)
14        Modules in train mode
0         Modules in eval mode
/Users/nicholasfranklin/miniconda3/envs/vi_ppo/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/nicholasfranklin/miniconda3/envs/vi_ppo/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (1

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=25` reached.


In [3]:
# Roll out the trained policy in a rendered LunarLander window for a visual check.
# Note: this rebinds the notebook-level name `env` to the rendering environment.
N_EVAL_STEPS = 5000  # total environment steps, spread over however many episodes fit

env = gym.make("LunarLander-v3", render_mode="human")
try:
    # Reset the environment to generate the first observation
    observation, info = env.reset(seed=42)
    for _ in range(N_EVAL_STEPS):
        # Query the trained module for the action to take
        action = module.predict(observation)

        # step (transition) through the environment with the action,
        # receiving the next observation, reward and whether the episode
        # has terminated or been truncated
        observation, reward, terminated, truncated, info = env.step(action)

        # If the episode has ended then reset to start a new episode
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Always release the render window, even if the rollout raises or the
    # kernel is interrupted part-way through.
    env.close()

2025-02-09 19:48:21.536 python[6867:3974760] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-09 19:48:21.536 python[6867:3974760] +[IMKInputSession subclass]: chose IMKInputSession_Modern


In [4]:
# Needs the local `thread_the_needle` package; from a checkout it can be
# installed in editable mode with:
# !pip install -e ~/Projects/thread_the_needle/
import thread_the_needle as ttn

# Build the environment and leave it as the last expression so the notebook
# displays its repr.
ttn_env = ttn.make("thread_the_needle")
ttn_env

<thread_the_needle.gridworld.GridWorldEnv at 0x118a8e830>