In [1]:
from collections import deque
import gym
import numpy as np
import torch
from torch.distributions import Categorical
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Choose the compute device first: CUDA when PyTorch reports it as available,
# otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Instantiate the Gym environment registered under the id "CartPole-v0".
env = gym.make("CartPole-v0")

In [3]:
class Policy(nn.Module):
    """Small stochastic policy network over a discrete action set.

    The network is a single hidden layer of 16 ReLU units followed by a
    softmax over ``action_space`` outputs. Calling the module samples one
    action from that distribution.

    Args:
        action_space: Number of discrete actions (size of the output layer).
        state_space: Number of features in one observation (size of the
            input layer).
        device: Device the parameters are moved to and on which inputs are
            placed in ``forward``. ``None`` falls back to the CPU.
    """

    def __init__(self, action_space, state_space, device):
        super(Policy, self).__init__()

        self.layer_1 = nn.Linear(state_space, 16)
        self.layer_2 = nn.Linear(16, action_space)

        # Keep the device on the instance and move the weights there, so that
        # inputs and parameters always live on the same device and forward()
        # does not depend on a module-level global named `device`.
        self.device = torch.device("cpu") if device is None else torch.device(device)
        self.to(self.device)

    def forward(self, input):
        """Sample an action for a single observation.

        Args:
            input: One observation with ``state_space`` features (NumPy
                array, sequence of numbers, or tensor). It is converted to
                float32 and given a leading batch dimension of 1.

        Returns:
            A tuple ``(action, log_prob)`` where ``action`` is the sampled
            action index as a Python int and ``log_prob`` is a tensor of
            shape ``(1,)`` holding the log-probability of that action; it
            stays attached to the autograd graph.
        """
        obs = torch.as_tensor(input, dtype=torch.float32, device=self.device).unsqueeze(0)

        hidden = F.relu(self.layer_1(obs))
        scores = self.layer_2(hidden)

        # dim=1 is the action axis; dim 0 is the batch axis added above.
        action_probs = F.softmax(scores, dim=1)

        prob = Categorical(action_probs)
        action = prob.sample()

        return action.item(), prob.log_prob(action)

In [6]:
# Policy-gradient training loop.
#
# Each iteration plays one full episode with the current policy, weights the
# log-probability of every action taken by the episode's discounted return,
# and takes one Adam step on the resulting loss. Training stops once the mean
# score over the last 100 episodes reaches 190.
agent = Policy(2, 4, device).to(device)  # keep the weights on the same device as the inputs
last_results = deque(maxlen=100)  # sliding window of the most recent episode scores
optimizer = optim.Adam(agent.parameters(), lr=0.01)
gamma = 1.0  # discount factor; 1.0 means rewards are not discounted
episode = 0

while True:
    state = env.reset()
    rewards = []
    log_probs = []
    episode += 1

    # Roll out a single episode, recording rewards and action log-probabilities.
    while True:
        action, log_prob = agent(state)
        log_probs.append(log_prob)
        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
            break
    last_results.append(np.sum(rewards))

    # Discounted return of the whole episode: sum over t of gamma**t * r_t.
    discounted_reward = float(sum(r * gamma**step for step, r in enumerate(rewards)))

    # Every step's log-probability is scaled by the same episode return; the
    # negation turns reward maximisation into a loss to minimise.
    log_loss = (-torch.cat(log_probs) * discounted_reward).sum()

    optimizer.zero_grad()
    log_loss.backward()
    optimizer.step()

    # Only declare success once the window really holds 100 episodes;
    # otherwise a single lucky early episode could end training.
    window_full = len(last_results) == last_results.maxlen
    if window_full and np.mean(last_results) >= 190.0:
        print("AI won in episode ", episode, " with score ", np.mean(last_results))
        break
    if episode%100 == 0:
        print(episode, " reward = ", np.sum(rewards), " | loss = ", log_loss.data, " | last 100 mean = ", np.mean(last_results))
    

100  reward =  78.0  | loss =  tensor(3930.6082)  | last 100 mean =  24.26
200  reward =  72.0  | loss =  tensor(2360.4834)  | last 100 mean =  43.28
300  reward =  31.0  | loss =  tensor(414.5602)  | last 100 mean =  79.09
400  reward =  111.0  | loss =  tensor(5097.8750)  | last 100 mean =  63.39
500  reward =  138.0  | loss =  tensor(5961.4658)  | last 100 mean =  159.15
600  reward =  145.0  | loss =  tensor(7026.8066)  | last 100 mean =  150.11
700  reward =  137.0  | loss =  tensor(6488.4985)  | last 100 mean =  153.47
800  reward =  41.0  | loss =  tensor(750.6959)  | last 100 mean =  95.91
900  reward =  93.0  | loss =  tensor(3113.0166)  | last 100 mean =  75.91
1000  reward =  139.0  | loss =  tensor(6933.0977)  | last 100 mean =  106.52
1100  reward =  200.0  | loss =  tensor(9824.3594)  | last 100 mean =  183.03
AI won in episode  1119  with score  190.14
