# Lunar Lander with Cross-Entropy Method

In this notebook we look at the lunar lander environment and solve it with the cross-entropy method.

In [1]:
#!pip3 install box2d-py

In [2]:
import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
%matplotlib inline
from collections import deque

torch.manual_seed(1)
np.random.seed(1)

# Neural Network

We define a simple neural network that generates the action scores based on a given state.

In [3]:
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(obs_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_actions)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Generate Episodes

We generate a batch of episodes and remember the traversed states, actions and rewards. To select the next action we use the output of the network. For this we first pass the scores through a softmax to get probabilites. In the second step we sampel from this distribution to get the next action to execute.

In [4]:
def generate_batch(env, batch_size, t_max=5000):
    
    activation = nn.Softmax(dim=1)
    batch_actions,batch_states, batch_rewards = [],[],[]
    
    for b in range(batch_size):
        states, actions = [], []
        total_reward = 0
        s = env.reset()
        for t in range(t_max):
            
            s_v = torch.FloatTensor([s])
            act_probs_v = activation(net(s_v))
            act_probs = act_probs_v.data.numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)

            new_s, r, done, info = env.step(a)

            # record sessions like you did before
            states.append(s)
            actions.append(a)
            total_reward += r

            s = new_s
            if done:
                batch_actions.append(actions)
                batch_states.append(states)
                batch_rewards.append(total_reward)
                break
                
    return batch_states, batch_actions, batch_rewards

# Training

In the training step, we first use the neural network to generate a batch of episodes and then use the state-action pairs to improve the neural network.

In [5]:
batch_size = 100
session_size = 100
percentile = 80
hidden_size = 200
completion_score = 100
learning_rate = 0.01

env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(n_states, hidden_size, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

for i in range(session_size):
    
    # generate new episodes
    states, actions, rewards = generate_batch(env, batch_size, t_max=500)

    # TODO-1: here we need to filter out episodes that are not good
    

    # train on the states using actions as targets
    for s_i in range(len(states)):
        optimizer.zero_grad()
        tensor_states = torch.FloatTensor(states[s_i])
        tensor_actions = torch.LongTensor(actions[s_i])
        action_scores_v = net(tensor_states)
        loss_v = objective(action_scores_v, tensor_actions)
        loss_v.backward()
        optimizer.step()

    #show results
    mean_reward = np.mean(rewards)
    print("%d: loss=%.3f, reward_mean=%.1f" % (
            i, loss_v.item(), mean_reward))
    
    #check if 
    if np.mean(rewards)> completion_score:
        print("Environment has been successfullly completed!")

  s_v = torch.FloatTensor([s])


0: loss=1.400, reward_mean=-184.0
1: loss=1.393, reward_mean=-180.3
2: loss=1.381, reward_mean=-172.0
3: loss=1.382, reward_mean=-173.5
4: loss=1.385, reward_mean=-176.4
5: loss=1.403, reward_mean=-172.3
6: loss=1.387, reward_mean=-171.5
7: loss=1.388, reward_mean=-182.9
8: loss=1.381, reward_mean=-171.2
9: loss=1.368, reward_mean=-171.4
10: loss=1.377, reward_mean=-186.2
11: loss=1.382, reward_mean=-177.7
12: loss=1.389, reward_mean=-214.7
13: loss=1.413, reward_mean=-187.9
14: loss=1.376, reward_mean=-210.3
15: loss=1.364, reward_mean=-198.9
16: loss=1.388, reward_mean=-183.4
17: loss=1.373, reward_mean=-212.1
18: loss=1.387, reward_mean=-189.1
19: loss=1.394, reward_mean=-188.5
20: loss=1.405, reward_mean=-193.1


KeyboardInterrupt: 

Use the trained model to play and record one episode. The recorded video will be stored into the video-subfolder on disk.



In [None]:
# TODO-2: Play episode with agent and record it!