## Cartpole
In Chapter 4, we apply the cross-entropy method to the CartPole and FrozenLake environments.

The cross-entropy method is a model-free, policy-based, on-policy method. This means that:
1. It does not build a model of the environment; it directly tells the agent what to do at every step.
2. It approximates the policy of the agent. The policy is usually described as a probability distribution over actions.
3. It only requires fresh data obtained from the environment with the current policy.

In [2]:
# Gym is used to initialize the environment, take actions, and get observations and rewards.
import gym
# Namedtuple is used to create custom tuple with pre-defined names
from collections import namedtuple
# NumPy is used to create arrays and apply functions to them
import numpy as np
# For keeping logs
from tensorboardX import SummaryWriter

# For using Pytorch
import torch
# For creating a neural network
import torch.nn as nn
# For using optimizers
import torch.optim as optim

In [3]:
# Number of neurons in the network's single hidden layer
HIDDEN_SIZE = 128
# Number of episodes collected per batch (one batch per training iteration)
BATCH_SIZE = 16
# Reward percentile used as the cut-off: episodes whose total reward is below
# the 70th percentile of the batch are discarded, so roughly the top 30% of
# episodes are used for training
PERCENTILE = 70

In [4]:
class Net(nn.Module):
    '''
    Two-layer fully connected network: Linear -> ReLU -> Linear.

    It maps a batch of observations with obs_size features to n_actions
    raw scores per observation. No softmax is applied here, so callers
    must normalise the scores themselves when they need probabilities.
    '''
    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are created in input-to-output order and stored under
        # the attribute name `net`.
        hidden_layer = nn.Linear(obs_size, hidden_size)
        activation = nn.ReLU()
        output_layer = nn.Linear(hidden_size, n_actions)
        self.net = nn.Sequential(hidden_layer, activation, output_layer)

    def forward(self, x):
        '''Return the raw action scores for the observations in x.'''
        scores = self.net(x)
        return scores

In [5]:
# One finished episode: its total (undiscounted) reward and the list of steps taken
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One step of an episode: the observation the agent saw and the action it chose
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

def iterate_batches(env, net, batch_size):
    '''
    Infinite generator that plays episodes in env with the policy given by
    net and yields them in lists of batch_size finished episodes.

    env is used through reset(), which returns an observation, and
    step(action), which returns (next_obs, reward, is_done, info).
    net maps a (1, obs_size) float tensor to one raw score per action; the
    scores are turned into probabilities with a softmax and the action is
    sampled from that distribution with np.random.choice.

    Each yielded item is a list of Episode tuples; every Episode holds the
    summed reward and the EpisodeStep list of that episode.
    '''
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)
    while True:
        # Go through a single float32 array first: building a tensor straight
        # from a Python list of ndarrays is slow.
        obs_v = torch.as_tensor(np.asarray([obs], dtype=np.float32))
        # Acting needs no gradients, so skip building the autograd graph
        # instead of building it and discarding it through `.data`.
        with torch.no_grad():
            act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        # Store the observation the action was chosen for, not its result
        step = EpisodeStep(observation=obs, action=action)
        episode_steps.append(step)
        if is_done:
            e = Episode(reward=episode_reward, steps=episode_steps)
            batch.append(e)
            # Start a fresh episode
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

In [6]:
def filter_batch(batch, percentile):
    '''
    Keep only the best episodes of a batch and turn them into training tensors.

    The reward bound is the given percentile of the episode rewards in the
    batch; episodes whose reward is below that bound are dropped. The
    observations and actions of all remaining steps are concatenated in
    batch order.

    Returns (observations as FloatTensor, actions as LongTensor,
    reward bound, mean reward over the whole batch).
    '''
    episode_rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(episode_rewards, percentile)
    reward_mean = float(np.mean(episode_rewards))

    # Flatten the steps of every episode that is not below the bound
    elite_steps = [
        step
        for episode_reward, episode_steps in batch
        if not (episode_reward < reward_bound)
        for step in episode_steps
    ]
    elite_observations = [step.observation for step in elite_steps]
    elite_actions = [step.action for step in elite_steps]

    train_obs_v = torch.FloatTensor(elite_observations)
    train_act_v = torch.LongTensor(elite_actions)
    return train_obs_v, train_act_v, reward_bound, reward_mean

In [10]:
# Create the CartPole-v0 environment
env = gym.make("CartPole-v0")
# Wrap it in a Monitor that writes into the "CartPole-v0" directory.
# NOTE(review): force=True presumably allows overwriting earlier monitor
# files in that directory; confirm against the installed gym version.
env = gym.wrappers.Monitor(env, directory="CartPole-v0", force=True)
# Size of the observation vector and number of discrete actions
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
print("observation_size: ", obs_size)
print("Actions: ", n_actions)

observation_size:  4
Actions:  2


In [9]:
# Policy network: takes an observation, produces one raw score per action
net = Net(obs_size, HIDDEN_SIZE, n_actions)
# Cross-entropy loss applied to the network's raw action scores
objective = nn.CrossEntropyLoss()
# Adam optimizer over all network parameters, learning rate 0.01
optimizer = optim.Adam(params=net.parameters(), lr = 0.01)
# Writer for the training logs.
# NOTE(review): the comment string is presumably appended to the log
# directory name; confirm against the tensorboardX documentation.
writer = SummaryWriter(comment="-cartpole")

In [17]:
# Training loop: one optimisation step per batch of episodes, repeated until
# the mean reward of a batch exceeds 199.
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    # Keep only the episodes at or above the reward percentile
    obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
    optimizer.zero_grad()
    # Train the network to reproduce the actions taken in the kept episodes
    action_scores_v = net(obs_v)
    loss_v = objective(action_scores_v, acts_v)
    loss_v.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
        iter_no, loss_v.item(), reward_m, reward_b))
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_bound", reward_b, iter_no)
    writer.add_scalar("reward_mean", reward_m, iter_no)
    if reward_m > 199:
        print("Solved!")
        break
# Close the writer once, after the loop. Inside the loop body it was closed
# after every iteration and skipped entirely on the `break` path.
writer.close()

0: loss=0.668, reward_mean=25.9, rw_bound=28.5
1: loss=0.667, reward_mean=29.0, rw_bound=26.5
2: loss=0.654, reward_mean=32.5, rw_bound=42.5
3: loss=0.646, reward_mean=36.8, rw_bound=46.5
4: loss=0.648, reward_mean=46.6, rw_bound=57.0
5: loss=0.637, reward_mean=62.2, rw_bound=64.0
6: loss=0.644, reward_mean=35.1, rw_bound=40.5
7: loss=0.621, reward_mean=48.4, rw_bound=56.0
8: loss=0.614, reward_mean=48.8, rw_bound=51.5
9: loss=0.616, reward_mean=68.7, rw_bound=96.0
10: loss=0.599, reward_mean=66.5, rw_bound=86.0
11: loss=0.618, reward_mean=70.2, rw_bound=79.5
12: loss=0.571, reward_mean=83.6, rw_bound=90.0
13: loss=0.594, reward_mean=88.6, rw_bound=105.5
14: loss=0.586, reward_mean=84.9, rw_bound=95.5
15: loss=0.571, reward_mean=100.0, rw_bound=114.5
16: loss=0.562, reward_mean=87.1, rw_bound=102.0
17: loss=0.555, reward_mean=80.3, rw_bound=94.0
18: loss=0.547, reward_mean=117.1, rw_bound=152.0
19: loss=0.558, reward_mean=122.6, rw_bound=143.0
20: loss=0.535, reward_mean=127.8, rw_boun