In [1]:
!pip install numpy==1.24.4



In [3]:
!pip install gymnasium
!pip install pygame
!pip install wheel setuptools pip --upgrade
!pip install swig
!pip install gymnasium[box2d]

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
[33m  DEPRECATION: Building 'box2d-py' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'box2d-py'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379445 sha256=1f9c60936936e9c3e6ad69ae08ad27afe267497eb2577faa70122be489a08d2d
  Stored in directory: /root/.cache/pip/wheels/ab/f1/0c/d56f4a2bdd12bae0a0693ec33f2f0daadb5eb

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.affine = nn.Linear(8, 128)

        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        state = torch.from_numpy(state).float()
        state = F.relu(self.affine(state))

        state_value = self.value_layer(state)

        action_probs = F.softmax(self.action_layer(state))
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)

        return action.item()

    def calculateLoss(self, gamma=0.99):

        # calculating discounted rewards:
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)

        # normalizing the rewards:
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std())

        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            advantage = reward  - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)
        return loss

    def clearMemory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]

In [None]:
import torch
import torch.optim as optim
import gym

def train():
    # Defaults parameters:
    #    gamma = 0.99
    #    lr = 0.02
    #    betas = (0.9, 0.999)
    #    random_seed = 543

    render = False
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543

    torch.manual_seed(random_seed)

    env = gym.make('LunarLander-v2')
    env.seed(random_seed)

    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr,betas)

    running_reward = 0
    for i_episode in range(0, 2000):
        state = env.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break

        # Updating the policy :
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()
        policy.clearMemory()

        # saving the model if episodes > 999 OR avg reward > 200
        #if i_episode > 999:
        #    torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))

        # if running_reward > 4000:
        #     torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        #     print("########## Solved! ##########")
        #     test(name='LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        #     break

        if i_episode % 20 == 0:
            running_reward = running_reward/20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0

if __name__ == '__main__':
    train()

0.02 (0.9, 0.999)
Episode 0	length: 66	reward: -3.1183604976256825


  action_probs = F.softmax(self.action_layer(state))
  value_loss = F.smooth_l1_loss(value, reward)


Episode 20	length: 59	reward: -326.64259672798323
Episode 40	length: 96	reward: -358.12689133770755
Episode 60	length: 127	reward: -223.85821546824187
Episode 80	length: 98	reward: -175.84287969337993
Episode 100	length: 76	reward: -126.47985032201078
Episode 120	length: 119	reward: -117.58751803705684
Episode 140	length: 100	reward: -158.13668947017345
Episode 160	length: 99	reward: -183.29623358281157
Episode 180	length: 116	reward: -107.12636175346165
Episode 200	length: 158	reward: -171.42489777473867
Episode 220	length: 115	reward: -120.69973696310183
Episode 240	length: 94	reward: -74.21016911106044
Episode 260	length: 121	reward: -47.000315659780995
Episode 280	length: 91	reward: -63.82739069753311
Episode 300	length: 236	reward: -32.1469801595273
Episode 320	length: 106	reward: -74.75074885411712
Episode 340	length: 117	reward: 8.268474940183026
Episode 360	length: 164	reward: -108.68407995660554
Episode 380	length: 196	reward: -167.12174770513968
Episode 400	length: 101	reward

In [None]:
import torch
import gym
from PIL import Image

def test(n_episodes=5, name='LunarLander_TWO.pth'):
    env = gym.make('LunarLander-v2')
    policy = ActorCritic()

    policy.load_state_dict(torch.load('./preTrained/{}'.format(name)))

    render = True
    save_gif = False

    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        running_reward = 0
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                 env.render()
                 if save_gif:
                     img = env.render(mode = 'rgb_array')
                     img = Image.fromarray(img)
                     img.save('./gif/{}.jpg'.format(t))
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()

if __name__ == '__main__':
    test()

FileNotFoundError: [Errno 2] No such file or directory: './preTrained/LunarLander_TWO.pth'