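## Cross-entropy method on CartPole

This notebook trains a small policy network with the cross-entropy method: it plays a batch of episodes with the current stochastic policy, keeps the episodes whose total reward is at or above a chosen percentile of the batch, and fits the network to the actions taken in those episodes using a cross-entropy loss. Training stops once the mean reward of a batch exceeds 199.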

In [1]:
# Import section

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

import gym
from collections import namedtuple

In [2]:
# Hyper-Parameters

ENV = 'CartPole-v0'  # Gym environment id handed to gym.make
BATCH_SIZE = 16  # number of completed episodes collected per training batch
PERCENTILE = 70  # reward percentile used as the cut-off when selecting episodes to train on
HIDDEN_SIZE = 128  # width of the hidden layer of Net
LR = 0.001  # learning rate given to the Adam optimizer

## Definition of the Net/Agent

In [3]:
class Net(nn.Module):
    """Two-layer fully connected network mapping an observation to action logits.

    Parameters
    ----------
    obs_space : int
        Number of input features.
    hidden_space : int
        Number of units in the hidden layer.
    output_space : int
        Number of output values produced per input row.
    """

    def __init__(self, obs_space, hidden_space, output_space):
        super().__init__()
        # Keep the layer sizes around as plain attributes.
        self.obs_space, self.hidden_space, self.output_space = (
            obs_space,
            hidden_space,
            output_space,
        )

        # Layers are created in this order so parameter initialisation is unchanged.
        self.Linear_1 = nn.Linear(obs_space, hidden_space)
        self.Linear_2 = nn.Linear(hidden_space, output_space)

    def forward(self, state):
        """Return the raw (un-normalised) output of the second linear layer."""
        hidden = F.relu(self.Linear_1(state))
        return self.Linear_2(hidden)

## Instantiation of the environment, the Agent and some auxiliary data-structures

In [4]:
# Build the environment and wrap it in a Monitor that writes a video of every
# episode to ./vid (force=True lets it reuse an existing output directory).
env = gym.wrappers.Monitor(
    gym.make(ENV),
    "./vid",
    video_callable=lambda episode_id: True,
    force=True,
)

# The policy network: one input per observation component, one output per action.
net = Net(
    obs_space=env.observation_space.shape[0],
    hidden_space=HIDDEN_SIZE,
    output_space=env.action_space.n,
)

# Episode: total reward of one episode plus its list of steps.
Episode = namedtuple('Episode', ['reward', 'steps'])
# Episode_Steps: one (observation, chosen action) pair.
Episode_Steps = namedtuple('Episode_steps', ['observation', 'action'])

## Definition of the optimizer and the loss criterion

In [5]:
# Adam updates the parameters of `net`; the loss is cross-entropy between the
# network outputs and the actions taken in the selected episodes.
optimizer = optim.Adam(net.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

## Functions needed for the collection of data

In [6]:
def Batch_Selection(environment, agent, batch_size):
    """Play episodes forever with the current policy and yield them in batches.

    Parameters
    ----------
    environment :
        Object with ``reset()`` returning an observation and ``step(action)``
        returning ``(observation, reward, done, info)``.
    agent :
        Callable mapping a ``(1, n_features)`` float tensor to a ``(1, n_actions)``
        tensor of action logits.
    batch_size : int
        Number of completed episodes per yielded batch.

    Yields
    ------
    list of Episode
        Exactly ``batch_size`` completed episodes. Each ``Episode`` holds the
        total reward and a list of ``Episode_Steps(observation, action)``.
    """
    sm = nn.Softmax(dim = 1)
    list_of_episodes_and_rewards = []
    episode = []
    episode_reward = 0
    state = environment.reset()

    while True:
        # Copy the observation as float32: this is both the network input and
        # the value stored in the trajectory.
        observation = np.array(state, dtype = np.float32)

        # Sampling actions needs no gradients; the training step recomputes
        # the forward pass on the selected observations.
        with torch.no_grad():
            actions_logits = agent(torch.from_numpy(observation).unsqueeze(0))
            actions_probs = sm(actions_logits).cpu().numpy()[0]  # [0] drops the batch dimension

        final_action = np.random.choice(len(actions_probs), p = actions_probs)
        new_state, reward, is_done, _ = environment.step(final_action)

        episode_reward += reward
        episode.append(Episode_Steps(observation = observation, action = final_action))

        if is_done:
            list_of_episodes_and_rewards.append(
                Episode(reward = episode_reward, steps = episode)
            )

            if len(list_of_episodes_and_rewards) == batch_size:
                yield list_of_episodes_and_rewards
                list_of_episodes_and_rewards = []

            # Start the next episode from a fresh initial observation. The
            # terminal observation returned by step() must not be carried over.
            state = environment.reset()
            episode_reward = 0
            episode = []
        else:
            state = new_state
    
def Top_Percentile(episodes, percentile):
    """Keep the best episodes of a batch and flatten them into training tensors.

    Parameters
    ----------
    episodes : iterable
        Items exposing ``.reward`` (a number) and ``.steps`` (an iterable of
        items exposing ``.observation`` and ``.action``).
    percentile : float
        Percentile, as understood by ``np.percentile``, of the batch rewards
        used as the cut-off. Episodes with reward ``>=`` the cut-off are kept.

    Returns
    -------
    obs : torch.Tensor (float32)
        Observations of every step of the kept episodes, in order.
    action : torch.Tensor (int64)
        Action taken at each of those steps.
    reward_threshold : float
        The reward cut-off that was applied.
    reward_mean : float
        Mean reward over all episodes of the batch (kept or not).
    """
    # Use the argument rather than a global of the caller; materialise it once
    # because it is traversed twice.
    episodes = list(episodes)

    rewards = [episode.reward for episode in episodes]
    reward_threshold = np.percentile(rewards, percentile)
    reward_mean = np.mean(rewards)

    obs = []
    action = []
    for trajectory in episodes:
        if trajectory.reward >= reward_threshold:
            for step in trajectory.steps:
                obs.append(step.observation)
                action.append(step.action)

    if len(obs) == 0:
        # Diagnostic output for the case where no episode passed the cut-off.
        print(reward_threshold)
        print(rewards)

    # Go through a single ndarray: building a tensor directly from a list of
    # ndarrays is slow.
    obs = torch.from_numpy(np.asarray(obs, dtype = np.float32))
    action = torch.from_numpy(np.asarray(action, dtype = np.int64))

    return obs, action, reward_threshold, reward_mean

## Main Part of the Program

In [None]:
# Training loop: each iteration consumes one batch of BATCH_SIZE episodes from
# the generator and performs a single optimisation step on the selected episodes.
# NOTE(review): `batch` is a module-level name here; helper functions defined in
# earlier cells may depend on it, so rename it with care.
for idx, batch in enumerate(Batch_Selection(env, net, BATCH_SIZE)):
    
    # Select the steps of the episodes at or above the PERCENTILE reward cut-off.
    # `rewardbt` (the cut-off itself) is returned but not used in this loop.
    obs, Best_Act, rewardbt, reward_m = Top_Percentile(batch, PERCENTILE)
    # Despite the name, Old_Act holds the network's raw outputs (logits) for the
    # selected observations; the criterion compares them with the taken actions.
    Old_Act = net(obs)
    loss = criterion(Old_Act, Best_Act)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Report progress; reward_m is the mean reward over the whole batch.
    print('Step %.d, Loss %.2f, Mean Reward of the last %.d episodes: %.2f'%(idx, loss.item(), BATCH_SIZE, reward_m))
    
    # Stop once the batch mean reward exceeds 199. There is no other exit
    # condition, so the loop runs until this is reached.
    if reward_m > 199:
        print('The environment has been solved')
        break

Step 0, Loss 0.70, Mean Reward of the last 16 episodes: 19.62
Step 1, Loss 0.70, Mean Reward of the last 16 episodes: 22.12
Step 2, Loss 0.69, Mean Reward of the last 16 episodes: 23.31
Step 3, Loss 0.69, Mean Reward of the last 16 episodes: 20.69
Step 4, Loss 0.69, Mean Reward of the last 16 episodes: 22.31
Step 5, Loss 0.69, Mean Reward of the last 16 episodes: 29.38
Step 6, Loss 0.69, Mean Reward of the last 16 episodes: 18.06
Step 7, Loss 0.69, Mean Reward of the last 16 episodes: 21.12
Step 8, Loss 0.68, Mean Reward of the last 16 episodes: 31.00
Step 9, Loss 0.68, Mean Reward of the last 16 episodes: 23.06
Step 10, Loss 0.68, Mean Reward of the last 16 episodes: 26.88
Step 11, Loss 0.68, Mean Reward of the last 16 episodes: 30.25
Step 12, Loss 0.68, Mean Reward of the last 16 episodes: 25.62
Step 13, Loss 0.68, Mean Reward of the last 16 episodes: 31.94
Step 14, Loss 0.67, Mean Reward of the last 16 episodes: 27.88
Step 15, Loss 0.67, Mean Reward of the last 16 episodes: 32.62
St