In [2]:
%load_ext autoreload
%autoreload 2

import thread_the_needle as ttn


from vi_ppo.actor_critic import ActorCritic
from vi_ppo.nets.mlp import Mlp
from vi_ppo.nets.cnn import Cnn
from vi_ppo.rl_module import RlModule
import lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger


In [3]:
# Build the thread-the-needle environment the agent is trained on.
env = ttn.make("thread_the_needle")

# Sizes the actor-critic networks are built from:
#   d            - shape of a single observation
#   n_a          - number of discrete actions
#   hidden_dims  - width of the hidden layer used by the actor and critic MLPs
d, n_a = env.observation_space.shape, env.action_space.n
hidden_dims = 16

print("Observation space: ", d)
print("Action space: ", n_a)

Observation space:  (1, 64, 64)
Action space:  4


In [4]:
# Reset the environment and display the returned (observation, info) pair.
# The observation shown has shape (1, 64, 64) and prints with integer entries.
env.reset()

(array([[[ 0,  1,  1, ...,  0,  0,  0],
         [ 1,  1,  1, ...,  1,  0,  0],
         [ 1,  1,  1, ...,  1,  1,  1],
         ...,
         [14, 15, 16, ...,  0,  0,  0],
         [13, 14, 15, ...,  0,  0,  0],
         [11, 12, 13, ...,  0,  0,  0]]], shape=(1, 64, 64)),
 {})

In [None]:
# CNN feature-extractor settings. The i-th entries of `channels`,
# `kernel_sizes` and `strides` together describe the i-th convolution layer.
feature_extractor_config = Cnn.config_cls(
    input_channels=1,
    channels=[16, 32, 64, 1],
    kernel_sizes=[8, 4, 3, 1],
    strides=[4, 2, 1, 1],
    flatten_output=True,
    float_input=False,
)

# Leave the config as the last expression so the notebook displays it.
feature_extractor_config

CnnConfig(input_channels=1, channels=[16, 32, 64, 1], kernel_sizes=[8, 4, 3, 1], strides=[4, 2, 1, 1], activation='elu', flatten_output=True)

In [6]:
# Instantiate a CNN from the config so the notebook displays its layer structure.
Cnn(feature_extractor_config)

Cnn(
  (cnn): Sequential(
    (0): ConvBlock(
      (conv): Conv2d(1, 16, kernel_size=(8, 8), stride=(4, 4), padding=(1, 1))
      (act): ELU(alpha=1.0)
    )
    (1): ConvBlock(
      (conv): Conv2d(16, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
      (act): ELU(alpha=1.0)
    )
    (2): ConvBlock(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (act): ELU(alpha=1.0)
    )
    (3): ConvBlock(
      (conv): Conv2d(64, 1, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1))
      (act): ELU(alpha=1.0)
    )
  )
)

In [7]:
# Ask a freshly built CNN for its output shape given a (1, 64, 64) input, the
# observation shape printed for this environment earlier in the notebook.
Cnn(feature_extractor_config).calculate_output_shape(input_shape=(1,64,64))

torch.Size([1, 81])

In [8]:
# CNN feature extractor. These settings are kept identical to the config
# inspected earlier in the notebook, including float_input=False, so the model
# that is trained is the one that was inspected. Training previously failed with
# "Input type (long long) and bias type (float) should be the same", i.e. the
# convolution received integer observations.
# NOTE(review): presumably float_input=False makes Cnn cast its input to float
# before the first convolution -- confirm against vi_ppo.nets.cnn.
feature_extractor_config = Cnn.config_cls(
    input_channels=1,
    channels=[16, 32, 64, 1],
    kernel_sizes=[8, 4, 3, 1],
    strides=[4, 2, 1, 1],
    flatten_output=True,
    float_input=False,
)
feature_extractor = Cnn(feature_extractor_config)

# Width of the flattened feature vector fed to the actor and critic heads.
# The shape is derived from the environment's observation shape `d` rather
# than a repeated literal; element 1 of the reported shape is the feature
# width (the shape was reported as torch.Size([1, 81]) for this configuration).
embedding_dims = feature_extractor.calculate_output_shape(input_shape=d)[1]

# Policy head: one output per discrete action.
actor_config = Mlp.config_cls(
    input_dims=embedding_dims,
    output_dims=n_a,
    hidden_dims=hidden_dims,
    n_layers=1,
    activation="silu",
)

# Value head: a single scalar output.
critic_config = Mlp.config_cls(
    input_dims=embedding_dims,
    output_dims=1,
    hidden_dims=hidden_dims,
    n_layers=1,
    activation="silu",
)

# Loss hyper-parameters passed to the actor-critic.
ac_config = ActorCritic.config_cls(
    clip_epsilon=0.2,
    value_coeff=0.5,
    entropy_coeff=0.01,
)

# Both heads share the same CNN feature extractor.
model = ActorCritic(
    ac_config,
    actor_net=Mlp(actor_config),
    critic=Mlp(critic_config),
    feature_extractor=feature_extractor,
)


# Next: wrap the model in the Lightning RlModule and train it.

In [9]:

# Wrap the actor-critic and its environment in the Lightning training module.
config = RlModule.config_class(lr=3e-4)
module = RlModule(
    actor_critic=model,
    env=env,
    config=config,
)

# Log to TensorBoard (save directory "lightning_logs", run name
# "thread_the_needle") and train for two epochs.
logger = TensorBoardLogger(
    "lightning_logs",
    name="thread_the_needle",
)
trainer = pl.Trainer(
    max_epochs=2,
    logger=logger,
)

trainer.fit(module)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type        | Params | Mode 
-----------------------------------------------------
0 | actor_critic | ActorCritic | 31.1 K | train
-----------------------------------------------------
31.1 K    Trainable params
0         Non-trainable params
31.1 K    Total params
0.124     Total estimated model params size (MB)
27        Modules in train mode
0         Modules in eval mode
/Users/nicholasfranklin/miniconda3/envs/vi_ppo/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/nicholasfranklin/miniconda3/envs/vi_ppo/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (1

Training: |          | 0/? [00:00<?, ?it/s]

RuntimeError: Input type (long long) and bias type (float) should be the same

In [None]:
# Display the observation-space shape stored in `d` by the environment setup cell.
d

In [None]:
# Roll out the trained policy in the environment it was built for.
# The actor-critic consumes (1, 64, 64) observations and emits one of 4 actions,
# so it cannot act in LunarLander; in addition `gym` is never imported in this
# notebook, so the previous `gym.make(...)` call raised NameError.
# NOTE(review): rendering (render_mode="human") was dropped because it is not
# evident that ttn.make accepts it -- re-add if the environment supports it.
# NOTE(review): assumes the environment follows the Gymnasium API
# (reset(seed=...), 5-tuple step, close()) -- reset() returning (obs, info)
# suggests so; confirm.
env = ttn.make("thread_the_needle")

try:
    # Reset the environment to generate the first observation
    observation, info = env.reset(seed=42)
    for _ in range(5000):
        # Query the trained policy for the next action
        action = module.predict(observation)

        # Step (transition) through the environment with the action, receiving
        # the next observation, reward and whether the episode has terminated
        # or been truncated
        observation, reward, terminated, truncated, info = env.step(action)

        # If the episode has ended, reset to start a new episode
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Release the environment even if the rollout raises part-way through
    env.close()

In [None]:
# !pip install -e ~/Projects/thread_the_needle/
# NOTE(review): appears to be a leftover smoke check that the package imports
# and the environment can be constructed. `ttn` is already imported in the
# notebook's first cell, and the environment created here is only displayed,
# not bound to a name.
import thread_the_needle as ttn
ttn.make('thread_the_needle')