In [None]:
### REINFORCE pytorch implementation

import argparse
import gym
import numpy as np
from itertools import count
from collections import deque
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Run options for this cell: discount factor, RNG seed, whether main() calls
# env.render() after every step, and how often (in episodes) progress is printed.
args = {"gamma": 0.99, "seed": 543, "render": True, "log_interval": 10}

env = gym.make('CartPole-v1')
# Seed the environment (through a reset) and PyTorch with the same value.
env.reset(seed=args['seed'])
torch.manual_seed(args['seed'])


class Policy(nn.Module):
    """Softmax policy network: 4 inputs -> 128 hidden units (ReLU) -> 2 action probabilities.

    The module also carries two plain-list buffers, ``saved_log_probs`` and
    ``rewards``, which the training code fills during an episode and empties
    after each update.
    """

    def __init__(self):
        super().__init__()
        self.affine1 = nn.Linear(4, 128)
        self.affine2 = nn.Linear(128, 2)

        # Episode buffers; ordinary Python lists, not registered parameters.
        self.saved_log_probs, self.rewards = [], []

    def forward(self, x):
        """Return action probabilities; the softmax runs over dimension 1,
        so ``x`` must carry a leading batch dimension."""
        hidden = F.relu(self.affine1(x))
        return F.softmax(self.affine2(hidden), dim=1)


# Global network and optimizer used by select_action() and finish_episode().
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
# float32 machine epsilon; finish_episode() adds it to the std of the returns
# so the normalisation never divides by exactly zero.
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    """Sample an action for ``state`` from the global ``policy`` network.

    ``state`` is a NumPy observation. It is converted to a float tensor with a
    leading batch dimension, an action is drawn from the resulting categorical
    distribution, and that action's log-probability is appended to
    ``policy.saved_log_probs`` for the later update.

    Returns:
        The sampled action as a plain Python number.
    """
    batch = torch.from_numpy(state).float().unsqueeze(0)
    dist = Categorical(policy(batch))
    chosen = dist.sample()
    policy.saved_log_probs.append(dist.log_prob(chosen))
    return chosen.item()


def finish_episode():
    """Apply one REINFORCE update from the episode stored on ``policy``.

    Computes the discounted return of every step from ``policy.rewards``,
    standardises the returns (zero mean, unit std, with ``eps`` guarding the
    division), weights each saved log-probability by its negative return,
    takes a single optimizer step, and finally empties both episode buffers.
    """
    # Walk the rewards from the last step backwards so each return can be
    # built from the one that follows it.
    running = 0
    discounted = deque()
    for reward in reversed(policy.rewards):
        running = reward + args['gamma'] * running
        discounted.appendleft(running)

    returns = torch.tensor(discounted)
    returns = (returns - returns.mean()) / (returns.std() + eps)

    step_losses = [
        -log_prob * ret
        for log_prob, ret in zip(policy.saved_log_probs, returns)
    ]

    optimizer.zero_grad()
    total_loss = torch.cat(step_losses).sum()
    total_loss.backward()
    optimizer.step()

    # Empty the buffers in place so the next episode starts clean.
    policy.rewards.clear()
    policy.saved_log_probs.clear()


def main():
    """Train the REINFORCE policy on the global ``env`` until it is solved.

    Runs episodes indefinitely. After each episode the policy is updated via
    ``finish_episode()`` and an exponential moving average of the episode
    reward is refreshed. Training stops once that average exceeds
    ``env.spec.reward_threshold``; the per-episode step counts are then plotted.
    """
    # matplotlib is not imported by this cell; import it here so the plot at
    # the end does not fail with a NameError on a fresh kernel.
    import matplotlib.pyplot as plt

    step_counts = []
    running_reward = 10
    for i_episode in count(1):
        state, _ = env.reset()
        ep_reward = 0
        for t in range(1, 10000):  # Don't infinite loop while learning
            action = select_action(state)
            # env.step returns separate `terminated` and `truncated` flags;
            # either one ends the episode (truncated = time limit reached).
            state, reward, terminated, truncated, _ = env.step(action)
            if args['render']:
                env.render()
            policy.rewards.append(reward)
            ep_reward += reward
            if terminated or truncated:
                break
        # Record every episode, including one that ran out the step cap above.
        step_counts.append(t)

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if i_episode % args['log_interval'] == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

    plt.plot(step_counts)
    plt.xlabel('Episode')
    plt.show()


if __name__ == '__main__':
    main()

In [None]:
### pytorch Actor Critic with episodic updates and shared first linear layer

import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Cart Pole

# Run options for this cell: discount factor, RNG seed, whether main() calls
# env.render() after every step, and how often (in episodes) progress is printed.
args = {"gamma": 0.99, "seed": 543, "render": True, "log_interval": 10}

env = gym.make('CartPole-v1')
# Seed the environment (through a reset) and PyTorch with the same value.
env.reset(seed=args['seed'])
torch.manual_seed(args['seed'])


# One record per step: the log-probability of the sampled action and the
# critic's value estimate for the state it was sampled in.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])


class Policy(nn.Module):
    """Actor and critic in a single module sharing one hidden layer.

    A 4 -> 128 linear layer with ReLU feeds two heads: ``action_head``
    (128 -> 2, softmax over the last dimension) and ``value_head`` (128 -> 1).
    ``saved_actions`` and ``rewards`` are per-episode buffers filled by the
    training code.
    """

    def __init__(self):
        super().__init__()
        # Shared trunk.
        self.affine1 = nn.Linear(4, 128)
        # Actor head: one score per action.
        self.action_head = nn.Linear(128, 2)
        # Critic head: scalar state value.
        self.value_head = nn.Linear(128, 1)

        # Episode buffers (actions taken and rewards received).
        self.saved_actions, self.rewards = [], []

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``."""
        features = F.relu(self.affine1(x))
        # Actor: probability of each action in the given state.
        action_prob = F.softmax(self.action_head(features), dim=-1)
        # Critic: estimated value of the given state.
        state_values = self.value_head(features)
        return action_prob, state_values


# Global network and optimizer used by select_action() and finish_episode().
model = Policy()
optimizer = optim.Adam(model.parameters(), lr=3e-2)
# float32 machine epsilon; added to the std of the returns in finish_episode()
# so the normalisation never divides by exactly zero.
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    """Sample an action for ``state`` and record what the update will need.

    The NumPy observation is converted to a float tensor (no batch dimension
    is added here) and passed through the global ``model``. The sampled
    action's log-probability and the critic's value estimate are stored
    together as a ``SavedAction`` in ``model.saved_actions``.

    Returns:
        The sampled action as a plain Python number.
    """
    observation = torch.from_numpy(state).float()
    action_probs, value_estimate = model(observation)

    # Categorical distribution over the actions, then one draw from it.
    dist = Categorical(action_probs)
    chosen = dist.sample()

    record = SavedAction(dist.log_prob(chosen), value_estimate)
    model.saved_actions.append(record)

    return chosen.item()


def finish_episode():
    """
    Training step for one finished episode.

    Builds the discounted, standardised returns from ``model.rewards``, forms
    the actor loss (negative log-probability times advantage) and the critic
    loss (smooth L1 between value estimate and return) for every step, sums
    both into one loss, backpropagates, and clears the episode buffers.
    """
    discount = args['gamma']

    # Discounted return per step: accumulate from the final reward backwards,
    # then flip the list into chronological order.
    running = 0
    discounted = []
    for reward in reversed(model.rewards):
        running = reward + discount * running
        discounted.append(running)
    discounted.reverse()

    returns = torch.tensor(discounted)
    returns = (returns - returns.mean()) / (returns.std() + eps)

    actor_terms = []   # per-step policy losses
    critic_terms = []  # per-step value losses
    for (log_prob, value), ret in zip(model.saved_actions, returns):
        # .item() turns the estimate into a plain number, so the advantage
        # carries no gradient back into the critic.
        advantage = ret - value.item()
        actor_terms.append(-log_prob * advantage)
        critic_terms.append(F.smooth_l1_loss(value, torch.tensor([ret])))

    optimizer.zero_grad()
    total = torch.stack(actor_terms).sum() + torch.stack(critic_terms).sum()
    total.backward()
    optimizer.step()

    # Reset the episode buffers in place.
    del model.rewards[:]
    del model.saved_actions[:]


def main():
    """Train the shared-trunk actor-critic on the global ``env`` until solved.

    Runs episodes indefinitely. After each episode the model is updated via
    ``finish_episode()`` and an exponential moving average of the episode
    reward is refreshed. Training stops once that average exceeds
    ``env.spec.reward_threshold``.
    """
    running_reward = 10

    # run infinitely many episodes
    for i_episode in count(1):

        # reset environment and episode reward
        state, _ = env.reset()
        ep_reward = 0

        # for each episode, only run 9999 steps so that we don't
        # infinite loop while learning
        for t in range(1, 10000):

            # select action from policy
            action = select_action(state)

            # take the action; env.step reports `terminated` and `truncated`
            # separately and either one ends the episode
            state, reward, terminated, truncated, _ = env.step(action)

            if args['render']:
                env.render()

            model.rewards.append(reward)
            ep_reward += reward
            if terminated or truncated:
                break

        # update cumulative reward
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # perform backprop
        finish_episode()

        # log results
        if i_episode % args['log_interval'] == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))

        # check if we have "solved" the cart pole problem
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break


if __name__ == '__main__':
    main()

In [None]:
### MY edit to pytorch actor critic implementation. split the model to two distinct networks.

import argparse
import gym
import numpy as np
from itertools import count
from collections import deque
from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Run options for this cell: discount factor, RNG seed, whether main() calls
# env.render() after every step, and how often (in episodes) progress is printed.
args = {"gamma": 0.99, "seed": 543, "render": True, "log_interval": 10}

env = gym.make('CartPole-v1')
# Seed the environment (through a reset) and PyTorch with the same value.
env.reset(seed=args['seed'])
torch.manual_seed(args['seed'])

# One record per step: log-probability of the sampled action and the critic's
# value estimate for the state it was sampled in.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

class A2C(nn.Module):
    """Actor and critic as two fully separate 4 -> 128 -> out networks.

    Unlike the shared-trunk example in the previous cell, the actor
    (``actor1``/``actor2``, 2 outputs) and the critic (``critic1``/``critic2``,
    1 output) have no layer in common. The author's note asks whether this
    separation is why it performs better; that is an open question, not a
    result established here.
    """

    def __init__(self):
        super().__init__()

        # Actor branch.
        self.actor1 = nn.Linear(4, 128)
        self.actor2 = nn.Linear(128, 2)

        # Critic branch.
        self.critic1 = nn.Linear(4, 128)
        self.critic2 = nn.Linear(128, 1)

        # Per-episode buffers filled by the training code.
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """Return ``(action_probabilities, value)``; the softmax runs over
        dimension 1, so ``x`` must carry a leading batch dimension."""
        actor_hidden = F.relu(self.actor1(x))
        policy_prob_dist = F.softmax(self.actor2(actor_hidden), dim=1)

        critic_hidden = F.relu(self.critic1(x))
        value = self.critic2(critic_hidden)

        return policy_prob_dist, value


# Global network and a single optimizer over both the actor and critic branches.
model = A2C()
optimizer = optim.Adam(model.parameters(), lr=1e-2)
# float32 machine epsilon; added to the std of the returns in finish_episode()
# so the normalisation never divides by exactly zero.
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    """Sample an action for ``state`` from the global ``model``.

    The NumPy observation becomes a float tensor with a leading batch
    dimension. The sampled action's log-probability and the critic's value
    estimate are stored as a ``SavedAction`` in ``model.saved_actions``.

    Returns:
        The sampled action as a plain Python number.
    """
    batch = torch.from_numpy(state).float().unsqueeze(0)
    probabilities, value_estimate = model(batch)
    dist = Categorical(probabilities)
    chosen = dist.sample()
    record = SavedAction(dist.log_prob(chosen), value_estimate)
    model.saved_actions.append(record)
    return chosen.item()


def finish_episode():
    """Update the actor and critic from the episode stored on ``model``.

    Builds the discounted, standardised returns from ``model.rewards``. For
    every step the actor loss is the negative log-probability times the
    advantage (return minus the critic's estimate, taken as a plain number so
    no gradient reaches the critic through it) and the critic loss is the
    smooth L1 distance between the estimate and the return. Both sums are
    backpropagated together and the episode buffers are cleared.
    """
    R = 0
    saved_actions = model.saved_actions
    policy_losses = []  # list to save actor (policy) loss
    value_losses = []  # list to save critic (value) loss
    returns = deque()

    for r in model.rewards[::-1]:
        R = r + args['gamma'] * R
        returns.appendleft(R)

    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)

    for (log_prob, value), R in zip(saved_actions, returns):
        advantage = R - value.item()
        policy_losses.append(-log_prob * advantage)
        # select_action() feeds the model a batched state, so `value` is not
        # one-dimensional. Give the target the prediction's own shape instead
        # of a (1,) tensor, so the loss does not rely on implicit broadcasting.
        value_losses.append(F.smooth_l1_loss(value, R.expand_as(value)))

    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]


def main():
    """Train the split actor-critic on the global ``env`` until it is solved.

    Runs episodes indefinitely. After each episode the model is updated via
    ``finish_episode()`` and an exponential moving average of the episode
    reward is refreshed. Training stops once that average exceeds
    ``env.spec.reward_threshold``; the per-episode step counts are then plotted.
    """
    # matplotlib is not imported by this cell; import it here so the plot at
    # the end does not fail with a NameError on a fresh kernel.
    import matplotlib.pyplot as plt

    step_counts = []
    running_reward = 10
    for i_episode in count(1):
        state, _ = env.reset()
        ep_reward = 0
        for t in range(1, 10000):  # Don't infinite loop while learning
            action = select_action(state)
            # env.step returns separate `terminated` and `truncated` flags;
            # either one ends the episode (truncated = time limit reached).
            state, reward, terminated, truncated, _ = env.step(action)
            if args['render']:
                env.render()
            model.rewards.append(reward)
            ep_reward += reward
            if terminated or truncated:
                break
        # Record every episode, including one that ran out the step cap above.
        step_counts.append(t)

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if i_episode % args['log_interval'] == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

    plt.plot(step_counts)
    plt.xlabel('Episode')
    plt.show()


if __name__ == '__main__':
    main()

In [None]:
### MY implementation of actor critic that follows the algorithm under
### One-step Actor–Critic (episodic)

import argparse
import gym
import numpy as np
from itertools import count
from collections import deque
from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt

class Actor(nn.Module):
    """Policy network sized from an environment.

    The input width is ``env.observation_space.shape[0]`` and the output width
    is ``env.action_space.n``, with one hidden layer of 128 ReLU units.
    """

    def __init__(self, env):
        super().__init__()
        n_inputs = env.observation_space.shape[0]
        n_actions = env.action_space.n
        self.actor1 = nn.Linear(n_inputs, 128)
        self.actor2 = nn.Linear(128, n_actions)

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        hidden = torch.relu(self.actor1(x))
        scores = self.actor2(hidden)
        return torch.softmax(scores, dim=-1)

class Critic(nn.Module):
    """State-value network sized from an environment.

    The input width is ``env.observation_space.shape[0]``; one hidden layer of
    128 ReLU units feeds a single linear output.
    """

    def __init__(self, env):
        super().__init__()
        n_inputs = env.observation_space.shape[0]
        self.critic1 = nn.Linear(n_inputs, 128)
        self.critic2 = nn.Linear(128, 1)

    def forward(self, x):
        """Return the value estimate for ``x`` (last dimension of size 1)."""
        hidden = torch.relu(self.critic1(x))
        return self.critic2(hidden)

# Run options for this cell; each entry is unpacked into its own global below.
args = {"gamma": 0.99, "seed": 543, "render": True, "log_interval": 10, "actor_lr": 1e-3, "critic_lr": 1e-3}
gamma = args['gamma']
seed = args['seed']
render = args['render']
log_interval = args['log_interval']
actor_lr = args['actor_lr']
critic_lr = args['critic_lr']

env = gym.make('CartPole-v1')
# Seed the environment (through a reset) and PyTorch with the same value.
env.reset(seed=seed)
torch.manual_seed(seed)

# Separate networks, each with its own Adam optimizer and learning rate.
actor = Actor(env)
critic = Critic(env)
actor_optim = optim.Adam(actor.parameters(), lr=actor_lr)
critic_optim = optim.Adam(critic.parameters(), lr=critic_lr)

def policy(state):
    """Sample an action for ``state`` from the global ``actor``.

    Returns:
        ``(action, log_prob)``: the sampled action tensor and the
        log-probability the actor assigned to it.
    """
    dist = Categorical(actor(state))
    chosen = dist.sample()
    return chosen, dist.log_prob(chosen)

def main():
    """Train with one-step actor-critic updates on the global ``env``.

    After every environment step the TD error

        delta = R + gamma * V(S') - V(S)      (V(S') taken as 0 on termination)

    drives two updates: the actor minimises ``-log_prob * delta`` and the
    critic minimises ``delta ** 2``. The bootstrap target is treated as a
    constant (semi-gradient), and the actor sees ``delta`` as a constant too.
    Training stops once the moving average of the episode reward exceeds
    ``env.spec.reward_threshold``; per-episode step counts are then plotted.

    The textbook algorithm also scales the actor update by ``gamma ** t``.
    The original code left that factor disabled and it stays disabled here.
    """
    step_counts = []
    running_reward = 10
    for i_episode in count(1):
        S, _ = env.reset()
        # States are inputs, not parameters; no gradient is needed for them.
        S = torch.as_tensor(S, dtype=torch.float32).unsqueeze(0)

        ep_reward = 0
        for t in range(1, 10000):  # Don't infinite loop while learning
            A, log_prob = policy(S)
            S2, R, terminated, truncated, _ = env.step(A.item())
            S2 = torch.as_tensor(S2, dtype=torch.float32).unsqueeze(0)

            if render:
                env.render()
            ep_reward += R

            value = critic(S)
            with torch.no_grad():
                # Bootstrap only when the episode did not truly terminate; a
                # time-limit truncation still has a meaningful next-state value.
                target = R + (1.0 - float(terminated)) * gamma * critic(S2)
            error = target - value

            # Actor step: the TD error is a fixed weight, so nothing flows
            # back into the critic from this loss.
            actor_optim.zero_grad()
            actor_loss = (-log_prob * error.detach()).sum()
            actor_loss.backward()
            actor_optim.step()

            # Critic step: gradient flows only through V(S).
            critic_optim.zero_grad()
            critic_loss = torch.square(error).sum()
            critic_loss.backward()
            critic_optim.step()

            S = S2

            if terminated or truncated:
                break
        # Record every episode, including one that ran out the step cap above.
        step_counts.append(t)

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

    plt.plot(step_counts)
    plt.xlabel('Episode')
    plt.show()


if __name__ == '__main__':
    main()

In [None]:
### MY implementation of actor critic that follows the algorithm under
### One-step Actor–Critic (episodic)

import argparse
import gymnasium as gym
import numpy as np
from itertools import count
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.distributions import Categorical

import matplotlib.pyplot as plt

In [None]:
# Run options for the MountainCar experiment; each entry is unpacked into its
# own global below so later cells can read it directly.
args = {"gamma": 0.995, "seed": 1, "render": True, "log_interval": 50, "actor_lr": 1e-2, "critic_lr": 1e-2}
gamma = args['gamma']
seed = args['seed']
render = args['render']
log_interval = args['log_interval']
actor_lr = args['actor_lr']
critic_lr = args['critic_lr']

In [None]:
env = gym.make('MountainCar-v0')
# Seed PyTorch and NumPy.
# NOTE(review): the environment itself is not given a seed in this cell.
torch.manual_seed(seed)
np.random.seed(seed)

# Network input/output widths, read from the environment's spaces.
num_state = env.observation_space.shape[0]
num_action = env.action_space.n
# float32 machine epsilon; added to the std of the returns in finish_episode()
# so the normalisation never divides by exactly zero.
float_epsilon = np.finfo(np.float32).eps.item()

class Actor(nn.Module):
    """Policy network: ``num_state`` inputs -> 128 ReLU units -> ``num_action`` probabilities.

    The layer widths come from the module-level ``num_state`` and
    ``num_action``; the ``env`` argument is accepted but not read.
    """

    def __init__(self, env):
        super().__init__()
        self.actor1 = nn.Linear(num_state, 128)
        self.actor2 = nn.Linear(128, num_action)

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        hidden = F.relu(self.actor1(x))
        scores = self.actor2(hidden)
        return torch.softmax(scores, dim=-1)

class Critic(nn.Module):
    """State-value network: ``num_state`` inputs -> 128 ReLU units -> 1 output.

    The input width comes from the module-level ``num_state``; the ``env``
    argument is accepted but not read.
    """

    def __init__(self, env):
        super().__init__()
        self.critic1 = nn.Linear(num_state, 128)
        self.critic2 = nn.Linear(128, 1)

    def forward(self, x):
        """Return the value estimate for ``x`` (last dimension of size 1)."""
        hidden = F.relu(self.critic1(x))
        return self.critic2(hidden)

In [None]:
def policy(actor, state):
    """Sample an action for ``state`` from the given ``actor``.

    ``actor(state)`` must return action probabilities; one action is drawn
    from the categorical distribution they define.

    Returns:
        ``(action, log_prob)``: the sampled action tensor and the
        log-probability the actor assigned to it.
    """
    dist = Categorical(actor(state))
    chosen = dist.sample()
    return chosen, dist.log_prob(chosen)

In [None]:
def plot(steps, avg_steps):
    """Redraw the training curves on the current figure.

    Args:
        steps: number of steps taken in each finished episode.
        avg_steps: moving average of ``steps``, one entry per episode.

    The single subplot is cleared and redrawn on every call, so repeated calls
    update one figure instead of stacking new ones. ``plt.pause`` gives the
    GUI backend a moment to render.
    """
    ax = plt.subplot(111)
    ax.cla()
    ax.grid()
    ax.set_title('Training')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Run Time')
    # Label the two curves so they can be told apart.
    ax.plot(steps, label='steps per episode')
    ax.plot(avg_steps, label='moving average')
    ax.legend()
    plt.pause(0.0000001)

def finish_episode(values, log_probs, rewards, actor, actor_optim, critic, critic_optim):
    """Update actor and critic from one episode, then clear its buffers.

    Args:
        values: critic outputs, one single-element tensor per step.
        log_probs: log-probability of the action taken at each step.
        rewards: reward received at each step.
        actor, critic: the two networks (not read directly here; their
            parameters are reached through the optimizers and the graphs
            attached to ``log_probs`` and ``values``).
        actor_optim, critic_optim: optimizers for the two networks.

    The discounted Monte-Carlo return of each step (using the module-level
    ``gamma``) is standardised and used both as the critic's regression target
    and, minus the critic's estimate, as the advantage weighting the actor's
    log-probabilities. The returns are full-episode returns, not a one-step
    bootstrap from V(S'). ``values``, ``log_probs`` and ``rewards`` are emptied
    in place before returning.
    """
    # Discounted returns: accumulate back-to-front, then reverse once. This is
    # linear in episode length, which matters for episodes of tens of
    # thousands of steps (inserting at the front of a list is quadratic).
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()

    # Normalize the returns
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + float_epsilon)

    # Per-step losses
    policy_losses = []
    value_losses = []
    for value, log_prob, g in zip(values, log_probs, returns):
        # .item() makes the estimate a plain number, so the actor loss sends
        # no gradient into the critic.
        advantage = g - value.item()
        policy_losses.append(-log_prob * advantage)
        value_losses.append(F.smooth_l1_loss(value, g.expand_as(value)))

    actor_loss = torch.stack(policy_losses).sum()
    critic_loss = torch.stack(value_losses).sum()

    # The two losses touch disjoint parameters, so one backward pass over their
    # sum gives each network exactly its own gradient. It also means the graph
    # is traversed once and no retain_graph is needed.
    actor_optim.zero_grad()
    critic_optim.zero_grad()
    (actor_loss + critic_loss).backward()
    actor_optim.step()
    critic_optim.step()

    del values[:]
    del log_probs[:]
    del rewards[:]

def train():
    """Train a fresh actor and critic on the global MountainCar ``env``.

    Each episode is played to the end while collecting the critic's value
    estimates, the action log-probabilities and the rewards; the whole episode
    is then handed to ``finish_episode`` for one update. Both networks are
    saved under ``./AC_MountainCar-v0_Model/`` every 100 episodes and again
    when training stops, which happens once the moving average of episode
    length (over the last 100 episodes at most) drops below 140 steps.

    Only the ``done`` flag returned first by ``env.step`` ends an episode (the
    second flag is discarded), with a hard cap after 50000 steps.
    """
    import os

    model_dir = './AC_MountainCar-v0_Model'
    # torch.save does not create missing directories; make sure it exists
    # before the first checkpoint is written.
    os.makedirs(model_dir, exist_ok=True)

    actor = Actor(env)
    critic = Critic(env)
    actor_optim = optim.Adam(actor.parameters(), lr=actor_lr)
    critic_optim = optim.Adam(critic.parameters(), lr=critic_lr)

    def save_checkpoint(episode):
        # Whole modules are pickled (not just state_dicts), one file per network.
        torch.save(actor, f'{model_dir}/actor{episode}.pth')
        torch.save(critic, f'{model_dir}/critic{episode}.pth')

    steps = []
    avg_steps = []

    # Per-episode buffers (not a replay buffer): filled during the episode,
    # consumed and cleared by finish_episode(). The author notes that updating
    # once per episode from these buffers is what made training work.
    values = []
    log_probs = []
    rewards = []

    for episode in count(1):
        S, _ = env.reset()
        S = torch.as_tensor(S, dtype=torch.float32).unsqueeze(0)

        for t in count(1):
            values.append(critic(S))

            A, log_prob = policy(actor, S)
            log_probs.append(log_prob)

            S2, R, done, _, _ = env.step(A.item())
            rewards.append(R)

            S = torch.as_tensor(S2, dtype=torch.float32).unsqueeze(0)

            if done or t > 50000:
                steps.append(t)
                avg_steps.append(np.mean(steps[-min(100, len(steps)):]))

                print(f"Episode: {episode}\tSteps {t}")
                break

        finish_episode(values, log_probs, rewards, actor, actor_optim, critic, critic_optim)
        plot(steps, avg_steps)

        solved = avg_steps[-1] < 140
        if solved:
            print(f"Took {episode} episodes to reach 100 episode moving avg of {avg_steps[-1]} time steps.")
        if solved or episode % 100 == 0:
            save_checkpoint(episode)
        if solved:
            break


In [None]:
train()

In [None]:
# Episode number of the saved checkpoint that train2() loads and resumes from.
# train2() overwrites this global with the last episode it ran when it stops.
episode_count = 3065

In [None]:
def train2(avg_episode_len_target=200, check_episode=200, new_lr=1e-4, quit=float("inf")):
    """Resume training from the checkpoint numbered by the global ``episode_count``.

    Loads ``actor{episode_count}.pth`` and ``critic{episode_count}.pth`` from
    ``./AC_MountainCar-v0_Model/``, builds fresh Adam optimizers with
    ``new_lr``, and keeps training with episode numbers continuing from
    ``episode_count + 1``.

    Args:
        avg_episode_len_target: stop once the moving average of episode length
            is below this value (checked only after more than
            ``check_episode`` episodes have been run in this call).
        check_episode: window size (in episodes) of the moving average.
        new_lr: learning rate for both optimizers.
        quit: stop after more than this many episodes in this call
            (default: never). The name shadows the ``quit`` builtin inside
            this function.

    Side effects:
        Saves both networks every 100 episodes and when stopping; on either
        stop condition the global ``episode_count`` is set to the last episode
        run. Reads the global ``log_interval`` to decide how often to print
        and plot.
    """
    global episode_count
    global log_interval

    # A new environment local to this call; the module-level `env` is not used.
    env = gym.make('MountainCar-v0')

    print(f"loading {episode_count} episode model")
    # NOTE(review): these files hold whole pickled modules (written with
    # torch.save(actor, path)). Loading them unpickles arbitrary objects, so
    # only load files you created; recent PyTorch releases may also need
    # weights_only=False here. Confirm against the installed version.
    actor = torch.load(f'./AC_MountainCar-v0_Model/actor{episode_count}.pth')
    actor.train()
    critic = torch.load(f'./AC_MountainCar-v0_Model/critic{episode_count}.pth')
    critic.train()

    # Optimizer state is not restored from the checkpoint; Adam starts fresh.
    actor_optim = optim.Adam(actor.parameters(), lr=new_lr)
    critic_optim = optim.Adam(critic.parameters(), lr=new_lr)

    steps = []
    avg_steps = []

    # Per-episode buffers consumed (and cleared) by finish_episode().
    values = []
    log_probs = []
    rewards = []

    for episode in count(episode_count+1):
        S, _ = env.reset()
        S = torch.tensor(S, requires_grad=True).float().unsqueeze(0)

        for t in count(1):
            value = critic(S)
            values.append(value)

            A, log_prob = policy(actor, S)
            log_probs.append(log_prob)

            # Only the first flag returned by env.step ends the episode; the
            # second flag is discarded.
            S2, R, done, _, _ = env.step(A.item())
            S2 = torch.tensor(S2, requires_grad=True).float().unsqueeze(0)
            rewards.append(R)

            S = S2

            # Hard cap so an episode that never finishes cannot run forever.
            if done or t > 50000:
                steps.append(t)
                avg_steps.append(np.mean(steps[-min(check_episode,len(steps)):]))

                if episode % log_interval == 0:
                    print(f"Episode: {episode}\tSteps {t}")
                    plot(steps, avg_steps)
                break

        # An episode that hit the step cap is discarded without an update.
        if t > 50000:
            del values[:]
            del log_probs[:]
            del rewards[:]
        else:
            finish_episode(values, log_probs, rewards, actor, actor_optim, critic, critic_optim)

        # Periodic checkpoint.
        if episode % 100 == 0:
            path = f'./AC_MountainCar-v0_Model/actor{episode}.pth'
            torch.save(actor, path)
            path = f'./AC_MountainCar-v0_Model/critic{episode}.pth'
            torch.save(critic, path)

        # Stop condition 1: episode budget for this call exhausted.
        if episode - episode_count > quit:
            plot(steps, avg_steps)
            print(f"stopping after {quit} episodes")
            path = f'./AC_MountainCar-v0_Model/actor{episode}.pth'
            torch.save(actor, path)
            path = f'./AC_MountainCar-v0_Model/critic{episode}.pth'
            torch.save(critic, path)
            episode_count = episode
            print(f"Current episode count is {episode_count} and the lr was {new_lr}")
            break

        # Stop condition 2: moving average reached the target.
        if len(steps) > check_episode and avg_steps[-1] < avg_episode_len_target:
            plot(steps, avg_steps)
            print(f"Took another {len(steps)} episodes to have a avg episode length of {avg_steps[-1]} in the past {check_episode} episodes")
            path = f'./AC_MountainCar-v0_Model/actor{episode}.pth'
            torch.save(actor, path)
            path = f'./AC_MountainCar-v0_Model/critic{episode}.pth'
            torch.save(critic, path)
            episode_count = episode
            print(f"Current episode count is {episode_count} and the lr was {new_lr}")
            break

    # Drop the local references to the training buffers.
    del steps
    del avg_steps
    del values
    del log_probs
    del rewards


In [None]:
def test_model(n_episode, num_eps=1000):
    """Evaluate a saved actor on MountainCar-v0 and print summary statistics.

    Args:
        n_episode: checkpoint number; ``actor{n_episode}.pth`` is loaded from
            ``./AC_MountainCar-v0_Model/``.
        num_eps: number of evaluation episodes to run.

    Prints the length of every episode, their mean, and how many took 200
    steps or more (counted as "fails"). Actions are sampled from the policy,
    exactly as during training.
    """
    # Same per-episode cap that train() and train2() use, so an actor that
    # never reaches the goal cannot hang the evaluation.
    max_steps = 50000

    print(f"loading {n_episode} episode model")
    actor = torch.load(f'./AC_MountainCar-v0_Model/actor{n_episode}.pth')
    actor.eval()

    steps = []
    avg_steps = []

    env = gym.make('MountainCar-v0')
    try:
        # Evaluation needs no gradients; skip building autograd graphs.
        with torch.no_grad():
            for episode in range(num_eps):
                S, _ = env.reset()

                for t in count(1):
                    S = torch.as_tensor(S, dtype=torch.float32).unsqueeze(0)
                    A, _ = policy(actor, S)
                    # Only the first flag returned by env.step ends the
                    # episode; the second flag is discarded, as in training.
                    S, _, done, _, _ = env.step(A.item())

                    if done or t > max_steps:
                        steps.append(t)
                        avg_steps.append(np.mean(steps))

                        print(f"Episode: {episode}\tSteps {t}")
                        break
    finally:
        env.close()

    print('steps', steps)
    # With num_eps == 0 there is no mean to report.
    print('mean', avg_steps[-1] if avg_steps else float('nan'))
    print('fails', sum(1 for x in steps if x >= 200))

In [None]:
test_model(3065)

In [None]:
train2(avg_episode_len_target=130,check_episode=200,new_lr=1e-3)

In [None]:
test_model(6452)

In [None]:
train2(avg_episode_len_target=130,check_episode=200,new_lr=1e-4)

In [None]:
test_model(6653)

In [None]:
train2(avg_episode_len_target=130,check_episode=200,new_lr=1e-5)

In [None]:
test_model(6955)

In [None]:
# Manually point train2() at the checkpoint saved for episode 6955, so the
# next call resumes from that file rather than from wherever the last run stopped.
episode_count = 6955

In [None]:
train2(avg_episode_len_target=130,check_episode=200,new_lr=3e-4)

In [None]:
test_model(7883)

In [None]:
train2(avg_episode_len_target=130,check_episode=200,new_lr=1e-4)

In [None]:
test_model(8241)

In [None]:
train2(avg_episode_len_target=130,check_episode=200,new_lr=1e-5)

In [None]:
test_model(9994)

In [None]:
train2(avg_episode_len_target=130,check_episode=200,new_lr=1e-6)

In [None]:
test_model(10619)

In [None]:
train2(avg_episode_len_target=130,check_episode=200,new_lr=1e-6)

In [None]:
test_model(12463)

In [None]:
train2(avg_episode_len_target=130,check_episode=200,new_lr=7e-4,quit=200)

In [None]:
test_model(episode_count,3000)

In [None]:
# 41518: 10 fails
# 47749: 9 fails
# 54181: 9 fails
# 55990: 10 fails
# 58603: 7 fails

In [None]:
# human intuition solution: use velocity direction to determine policy
import gymnasium as gym
from itertools import count

# Hand-written baseline: always push in the direction the car is already
# moving (action 0 when the second observation component is negative,
# action 2 otherwise), and record how many steps each episode takes.
env = gym.make("MountainCar-v0")

lengths = []

for eps in range(1000):
    S, _ = env.reset()

    for t in count(1):
        # Decide from the CURRENT observation S. Reading S2 here would use a
        # variable that does not exist yet on the first step, or one left over
        # from the previous episode or an earlier cell.
        A = 0 if S[1] < 0 else 2
        S2, R, terminated, _, _ = env.step(A)

        # Only reaching the goal ends an episode; the time-limit flag is ignored.
        if terminated:
            lengths.append(t)
            break

        S = S2

env.close()

print(lengths)
print(sum(lengths)/len(lengths))