In [1]:
%load_ext autoreload
%autoreload 2

import thread_the_needle as ttn


from vi_ppo.actor_critic import ActorCritic
from vi_ppo.nets.mlp import Mlp
from vi_ppo.nets.cnn import Cnn
from vi_ppo.rl_module import RlModule
import lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger


In [2]:
# Build the thread_the_needle environment used throughout this notebook.
env = ttn.make("thread_the_needle")

# Sizes the actor-critic networks are built from:
#   d           - shape of a single observation
#   n_a         - number of discrete actions
#   hidden_dims - hidden width shared by the actor and critic MLPs
d, n_a = env.observation_space.shape, env.action_space.n
hidden_dims = 16

print("Observation space: ", d)
print("Action space: ", n_a)

Observation space:  (1, 64, 64)
Action space:  4


In [3]:
# Reset the environment; the bare expression displays the returned
# (observation, info) tuple so the raw observation can be inspected.
env.reset()

(array([[[ 0,  0,  0, ...,  6,  5,  4],
         [ 0,  1,  1, ...,  6,  5,  4],
         [ 1,  1,  2, ...,  7,  5,  4],
         ...,
         [ 9, 10, 12, ...,  6,  5,  4],
         [ 8, 10, 11, ...,  5,  4,  3],
         [ 8, 10, 11, ...,  5,  4,  3]]], shape=(1, 64, 64)),
 {})

In [None]:
# Exploratory CNN configuration for single-channel input with a flattened
# output. Ending the cell on the bare name displays the resulting config.
feature_extractor_config = Cnn.config_cls(
    input_channels=1,
    channels=[2, 4, 1],
    kernel_sizes=[4, 4, 1],
    strides=[2, 2, 1],
    flatten_output=True,
)
feature_extractor_config

TypeError: CnnConfig.__init__() got an unexpected keyword argument 'float_input'

In [None]:
# Probe the output shape the configured CNN produces for one observation.
# The input shape is taken from the environment's observation space (`d`)
# instead of a hard-coded (1, 64, 64), so the probe always matches the env.
Cnn(feature_extractor_config).calculate_output_shape(input_shape=d)

In [None]:
# --- Feature extractor ------------------------------------------------------
# CNN over the raw observation. The channel count and the input shape are read
# from the environment's observation shape `d` (channel-first, as the original
# hard-coded values input_channels=1 / (1, 64, 64) implied) so this cell stays
# in sync with the environment instead of duplicating its dimensions.
feature_extractor_config = Cnn.config_cls(
    input_channels=d[0],
    channels=[16, 32, 64, 1],
    kernel_sizes=[8, 4, 3, 1],
    strides=[4, 2, 1, 1],
    flatten_output=True,
)

feature_extractor = Cnn(feature_extractor_config)

# Size of the extractor output, used as the input width of both heads.
# NOTE(review): index 1 presumably selects the feature axis of a
# (batch, features) shape when flatten_output=True - confirm against Cnn.
embedding_dims = feature_extractor.calculate_output_shape(input_shape=d)[1]

# --- Actor / critic heads ---------------------------------------------------
# Both heads share the same architecture and differ only in output width:
# one output per action for the actor, a single output for the critic.
actor_config = Mlp.config_cls(
    input_dims=embedding_dims,
    output_dims=n_a,
    hidden_dims=hidden_dims,
    n_layers=1,
    activation="silu",
)
critic_config = Mlp.config_cls(
    input_dims=embedding_dims,
    output_dims=1,
    hidden_dims=hidden_dims,
    n_layers=1,
    activation="silu",
)

# --- Actor-critic model -----------------------------------------------------
ac_config = ActorCritic.config_cls(
    clip_epsilon=0.2,
    value_coeff=0.5,
    entropy_coeff=0.01,
)

# NOTE(review): the keyword names are asymmetric (`actor_net` vs `critic`);
# confirm they match ActorCritic.__init__.
model = ActorCritic(
    ac_config,
    actor_net=Mlp(actor_config),
    critic=Mlp(critic_config),
    feature_extractor=feature_extractor,
)


# module

In [None]:

# Wrap the actor-critic model and the environment in the training module.
# NOTE(review): every other class in this notebook exposes `config_cls`,
# while this line uses `RlModule.config_class` - confirm the attribute name.
config = RlModule.config_class(lr=3e-4)
module = RlModule(actor_critic=model, env=env, config=config)

# TensorBoard logger rooted at ../lightning_logs with run name
# "thread_the_needle", passed to a trainer capped at 100 epochs.
# NOTE(review): `pl` comes from the `lightning` package but the logger is
# imported from `pytorch_lightning` - confirm mixing the two is intended.
logger = TensorBoardLogger("../lightning_logs", name="thread_the_needle")
trainer = pl.Trainer(max_epochs=100, logger=logger)

# Run training on the module built above.
trainer.fit(module)

In [None]:
# Display the observation-space shape stored in `d` by the environment setup cell.
d

In [None]:
# Roll out the trained policy for a fixed number of environment steps.
#
# The rollout uses the thread_the_needle environment (via the already-imported
# `ttn`) rather than an unrelated gym environment: `gym` is not imported in
# this notebook, and the policy's CNN feature extractor was configured for
# this environment's (1, 64, 64) observations.
N_ROLLOUT_STEPS = 5000

# A separate name keeps the training `env` held by `module` from being
# rebound and closed by this cell.
eval_env = ttn.make("thread_the_needle")

try:
    # Reset the environment to generate the first observation.
    # NOTE(review): assumes the env follows the Gymnasium API
    # (reset(seed=...) -> (obs, info), 5-tuple step) - confirm.
    observation, info = eval_env.reset(seed=42)
    for _ in range(N_ROLLOUT_STEPS):
        # Query the trained policy for the next action.
        action = module.predict(observation)

        # Advance the environment by one transition.
        observation, reward, terminated, truncated, info = eval_env.step(action)

        # Start a new episode once the current one has ended.
        if terminated or truncated:
            observation, info = eval_env.reset()
finally:
    # Release the environment even if the rollout raises part-way through.
    eval_env.close()

In [None]:
# Appears to be an install/smoke check for the thread_the_needle package.
# If the import below fails, install the package in editable mode first:
# !pip install -e ~/Projects/thread_the_needle/
import thread_the_needle as ttn
# Build the environment; the bare expression displays the resulting object.
ttn.make('thread_the_needle')