In [None]:
import torch
from torch import nn
from torch.nn import functional as F
import torch.optim as optim

import gym
import numpy as np

## Steps to Implement VPG for cartpole:

1. Create a policy network for cartpole. Need the input size and ranges, output size and ranges, and network architecture.
2. Create a value network. This can be a separate head of the policy network, so that the two share a representation: one head outputs the value and the other the policy.
3. Initialize the policy network to some sensible parameters.
4. Create the training loop:
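    1. Run the current policy for $k$ episodes and compute the reward-to-go $\hat{R}_t$ at each time step. (Why reward-to-go rather than the total return? Rewards collected before time $t$ cannot be influenced by $a_t$, so including them only adds variance to the gradient estimate.)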
    2. Compute the advantage estimates $\tilde{A}_t$. Advantage $\tilde{A}_t = \hat{R}_t - V_{\phi_k}(s_t)$.
    3. Estimate the policy gradient as 
    $$
    \hat{g} = \sum_{t=0}^{T-1} \nabla_\theta \log \pi(a_t | s_t, \theta) \left(\sum_{t' = t}^{T-1} r_{t'} - V_{\phi_k}(s_t)\right)
    $$

### Create and inspect cartpole env

The specification of the cartpole environment is: 

    Box(4):
        Num	Observation                 Min         Max
        0	Cart Position             -4.8            4.8
        1	Cart Velocity             -Inf            Inf
        2	Pole Angle                 -24 deg        24 deg
        3	Pole Velocity At Tip      -Inf            Inf
        
    Actions:
        Type: Discrete(2)
        Num	Action
        0	Push cart to the left
        1	Push cart to the right
        
    Reward:
        Reward is 1 for every step taken, including the termination step.
        
    Starting State:
        All observations are assigned a uniform random value in [-0.05..0.05].
        
    Episode Termination:
        Pole Angle is more than 12 degrees
        Cart Position is more than 2.4 (center of the cart reaches the edge of the display)
        Episode length is greater than 200
        
    Solved Requirements:
        Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials.

In [None]:
# Create the CartPole-v0 environment whose spaces are inspected in the next cells.
env = gym.make('CartPole-v0')

In [None]:
# Display the environment's observation space.
env.observation_space

In [None]:
# Display the shape of the observation space; its first entry is used as the network input size.
env.observation_space.shape

In [None]:
# Display the environment's action space.
env.action_space

### Create Actor Critic Network from Environment Definition

In [None]:
# Read the network dimensions directly from the environment's spaces:
# the first observation dimension and the number of discrete actions.
in_size, out_size = env.observation_space.shape[0], env.action_space.n
print(f'in={in_size}, out={out_size}')

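Create a neural network that takes in the observations and outputs (1) the value of the state and (2) the policy values for each action at that state. (Note: the `ActorCritic` class below currently implements only the policy head, which outputs a single probability; the value head is still to be added.)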

The neural network has two output heads, one for the policy and one for the value function. It has one hidden layer to encode the representation of the state:

in -> hidden(16) \
-> value(16) -> value-out\
-> $\pi$(16) -> $\pi$-out

In [None]:
class ActorCritic(nn.Module):
    """Small policy network for a two-action environment such as CartPole.

    Despite the name, only the policy (actor) head is wired up: ``forward``
    returns a single probability and no value (critic) estimate. The layers
    ``fc1`` and ``fc2`` are constructed but not used by ``forward``.
    """

    def __init__(self, in_size, out_size, hidden_size=16):
        """Build the layers.

        Args:
            in_size: Number of observation features fed to the network.
            out_size: Number of discrete actions. Currently unused, because
                the policy head always emits one probability.
            hidden_size: Width of the hidden layers. Defaults to 16, the
                value that was previously hard-coded.
        """
        super().__init__()
        self.fc = nn.Linear(in_size, hidden_size)
        # Kept so the parameter set is unchanged; forward() does not use
        # these two layers.
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.pi_out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the policy output for observation ``x``.

        The result is a sigmoid-squashed value in (0, 1) with a trailing
        dimension of size 1. The sampling code in this notebook interprets
        it as the probability of taking action 1 (so action 0 has
        probability ``1 - output``).
        """
        hidden = F.relu(self.fc(x))
        return torch.sigmoid(self.pi_out(hidden))

In [None]:
# Instantiate the network with the dimensions read from the environment
# and display it as the cell output.
net = ActorCritic(in_size, out_size)
net

1. Collect rewards from the environment for each episode.
2. Collect a batch of episodes.
3. Act in the environment with neural network.
4. Write loss function for policy gradient.
5. Update the network parameters with the optimizer and repeat.

In [None]:
import gym
import random
import numpy as np

# Module-level environment; calc_loss below reads this global `env` when sampling episodes.
env = gym.make('CartPole-v0')


def _rewards_to_go(rewards):
    """Return suffix sums of ``rewards``: ``out[t] = rewards[t] + ... + rewards[-1]``.

    Computed in a single reverse pass instead of re-summing the tail for
    every time step.
    """
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running += rewards[t]
        returns[t] = running
    return returns


def calc_loss(n, policy=None, environment=None):
    """Sample ``n`` episodes and return the policy-gradient objective.

    For every episode the policy is rolled out until ``done`` and the
    quantity ``sum_t log pi(a_t | s_t) * (r_t + r_{t+1} + ... + r_{T-1})``
    is accumulated. The value returned is the sum over episodes of this
    objective (to be *maximised*); the caller negates it to get a loss.

    Args:
        n: Number of episodes to sample.
        policy: Callable mapping an observation tensor to the probability
            of taking action 1. Defaults to the module-level ``net``.
        environment: Object with ``reset()`` returning an observation and
            ``step(action)`` returning ``(observation, reward, done, info)``.
            Defaults to the module-level ``env``.

    Returns:
        A tensor carrying the autograd graph of the sampled log
        probabilities, or the integer ``0`` when ``n`` is 0.
    """
    # Globals are looked up at call time so that rebinding `net` / `env`
    # in a later cell is picked up, exactly as before.
    if policy is None:
        policy = net
    if environment is None:
        environment = env

    episode_objectives = []
    for _ in range(n):
        rewards = []
        log_pis = []
        observation = environment.reset()
        done = False
        while not done:
            prob_1 = policy(torch.as_tensor(observation, dtype=torch.float32))

            # Bernoulli sample: action 1 with probability prob_1.
            action = 1 if random.random() <= prob_1 else 0

            log_pis.append(torch.log(prob_1 if action == 1 else 1 - prob_1))
            observation, reward, done, _info = environment.step(action)
            rewards.append(reward)

        returns = _rewards_to_go(rewards)
        # Built-in sum starts from 0, so the result keeps the shape of the
        # individual log-probability tensors.
        episode_objectives.append(
            sum(log_pi * ret for log_pi, ret in zip(log_pis, returns))
        )

    return sum(episode_objectives)

In [None]:
# Scratch check: exponentiating a log-probability should give back the probability (about 0.7 here).
np.exp(torch.log(torch.Tensor([0.7])))

In [None]:
# Train a fresh policy with Adam by minimising the negated policy-gradient
# objective returned by calc_loss.
N_UPDATES = 300          # total optimizer steps
EPISODES_PER_UPDATE = 5  # episodes sampled for each gradient estimate
REPORT_EVERY = 100       # number of updates averaged per progress line
LEARNING_RATE = 0.01

net = ActorCritic(in_size, out_size)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
current_loss = 0.0
for x in range(N_UPDATES):
    optimizer.zero_grad()   # zero the gradient buffers
    loss = -calc_loss(EPISODES_PER_UPDATE)
    loss.backward()
    optimizer.step()    # Does the update

    # Accumulate a plain float: adding the tensor itself would keep every
    # iteration's autograd graph alive until the accumulator is reset.
    current_loss += loss.item()

    # Report after each full window so the printed value is a true mean
    # over REPORT_EVERY updates (and the final window is not dropped).
    if (x + 1) % REPORT_EVERY == 0:
        print(x + 1, current_loss / REPORT_EVERY)
        current_loss = 0.0

print(loss)

In [None]:
# Watch the trained policy act for a few rendered episodes.
env = gym.make('CartPole-v0')
try:
    for i_episode in range(3):
        observation = env.reset()
        for t in range(1000):
            env.render()
            # Evaluation only: no autograd graph is needed.
            with torch.no_grad():
                policy = net(torch.Tensor(observation))
            action = 1 if random.random() <= policy else 0
            observation, reward, done, info = env.step(action)
            if done:
                # Stepping an environment after it reported done is
                # undefined in gym; move on to the next episode instead.
                break
finally:
    # Close the viewer once, even if rendering or stepping raises.
    env.close()


In [None]:
# Print the trained network object.
print(net)

In [None]:
import gym

# Baseline demo: render 100 steps of uniformly random actions.
env = gym.make('CartPole-v0')
env.reset()
try:
    for _ in range(100):
        env.render()
        action = env.action_space.sample()  # take a random action
        observation, reward, done, info = env.step(action)
        # Print the action that was actually executed, not a second sample.
        print(action)
        if done:
            # Start a new episode rather than stepping a finished one.
            env.reset()
finally:
    env.close()
