In [1]:
# Code adopted from:
# https://towardsdatascience.com/double-deep-q-networks-905dd8325412
# https://github.com/cyoon1729/deep-Q-networks/blob/master/doubleDQN/ddqn.py
# https://towardsdatascience.com/dqn-part-1-vanilla-deep-q-networks-6eb4a00febfb

# https://github.com/XinJingHao/DQN-DDQN-Pytorch/blob/main/DQN.py

# used gpt-3 as well for skeleton experimentation code

# https://goodboychan.github.io/python/reinforcement_learning/pytorch/udacity/2021/05/07/DQN-LunarLander.html

In [4]:
# Code adopted from
# https://github.com/chengxi600/RLStuff/blob/master/Policy%20Optimization%20Algorithms/PPO_Discrete.ipynb

In [2]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim
from torch.distributions import Categorical
import seaborn as sns
import gym
import random

from collections import deque 

In [None]:
#from tqdm import tqdm_notebook
#from collections import deque
#from copy import deepcopy

In [6]:
class ActorCriticNetwork(nn.Module):
    
    def __init__(self, in_dim, hidden_dim, out_dim):
        '''
        Args:
        - obs_space (int): observation space
        - action_space (int): action space
        
        '''
        super(ActorCriticNetwork, self).__init__()

        self.actor = nn.Sequential(
                            nn.Linear(in_dim, hidden_dim),
                            nn.Tanh(),
                            nn.Linear(hidden_dim, hidden_dim),
                            nn.Tanh(),
                            nn.Linear(hidden_dim, out_dim),
                            nn.Softmax(dim=1))


        self.critic = nn.Sequential(
                        nn.Linear(in_dim, hidden_dim),
                        nn.Tanh(),
                        nn.Linear(hidden_dim, hidden_dim),
                        nn.Tanh(),
                        nn.Linear(hidden_dim, 1))
        
    def forward(self):
        ''' Not implemented since we call the individual actor and critc networks for forward pass
        '''
        raise NotImplementedError
        
    def select_action(self, state):
        ''' Selects an action given current state
        Args:
        - network (Torch NN): network to process state
        - state (Array): Array of action space in an environment

        Return:
        - (int): action that is selected
        - (float): log probability of selecting that action given state and network
        '''

        # Setup state
        state = torch.from_numpy(state).float().unsqueeze(0)

        # Action probabilities
        action_probs = self.actor(state)

        # Sample an action using the probability distribution
        m = Categorical(action_probs)
        action = m.sample()

        # Return action
        return action.item(), m.log_prob(action)
    
    def evaluate_action(self, states, actions):
        ''' Get log probability and entropy of an action taken in given state
        Args:
        - states (Array): array of states to be evaluated
        - actions (Array): array of actions to be evaluated
        
        '''
        
        # Convert state to float tensor, add 1 dimension, allocate tensor on device
        states = torch.stack([torch.from_numpy(state).float().unsqueeze(0) for state in states]).squeeze(1)

        # Use network to predict action probabilities
        action_probs = self.actor(states)

        # Get probability distribution
        m = Categorical(action_probs)

        #return log_prob and entropy
        return m.log_prob(torch.Tensor(actions)), m.entropy()
        

In [25]:
# Proximal Policy Optimization
class PPO_policy():
    
    def __init__(self, γ, ϵ, β, δ, c1, c2, k_epoch, obs_space, action_space, α_θ, αv, hidden_dim=64):
        '''
        Args:
        - γ (float): discount factor
        - ϵ (float): soft surrogate objective constraint
        - β (float): KL (Kullback–Leibler) penalty 
        - δ (float): KL divergence adaptive target
        - c1 (float): value loss weight
        - c2 (float): entropy weight
        - k_epoch (int): number of epochs to optimize
        - obs_space (int): observation space
        - action_space (int): action space
        - α_θ (float): actor learning rate
        - αv (float): critic learning rate
        
        '''
        self.γ = γ
        self.ϵ = ϵ
        self.β = β
        self.δ = δ
        self.c1 = c1
        self.c2 = c2
        self.k_epoch = k_epoch
        self.actor_critic = ActorCriticNetwork(obs_space, hidden_dim, action_space)
        self.optimizer = torch.optim.Adam([
            {'params': self.actor_critic.actor.parameters(), 'lr': α_θ},
            {'params': self.actor_critic.critic.parameters(), 'lr': αv}
        ])
        
        #buffer to store current batch
        self.batch = []

        self.loss_func = nn.MSELoss()
    
    def process_rewards(self, rewards, terminals):
        ''' Converts our rewards history into cumulative discounted rewards
        Args:
        - rewards (Array): array of rewards 

        Returns:
        - G (Array): array of cumulative discounted rewards
        '''
        #Calculate Gt (cumulative discounted rewards)
        G = []

        #track cumulative reward
        total_r = 0

        #iterate rewards from Gt to G0
        for r, done in zip(reversed(rewards), reversed(terminals)):

            #Base case: G(T) = r(T)
            #Recursive: G(t) = r(t) + G(t+1)^DISCOUNT
            total_r = r + total_r * self.γ

            #no future rewards if current step is terminal
            if done:
                total_r = r

            #add to front of G
            G.insert(0, total_r)

        #whitening rewards
        G = torch.tensor(G)
        G = (G - G.mean())/G.std()

        return G
    
    def kl_divergence(self, old_lps, new_lps):
        ''' Calculate distance between two distributions with KL divergence
        Args:
        - old_lps (Array): array of old policy log probabilities
        - new_lps (Array): array of new policy log probabilities
        '''
        
        #track kl divergence
        total = 0
        
        #sum up divergence for all actions
        for old_lp, new_lp in zip(old_lps, new_lps):
            
            #same as old_lp * log(old_prob/new_prob) cuz of log rules
            total += old_lp * (old_lp - new_lp)

        return total
    
    
    def penalty_update(self):
        ''' Update policy using surrogate objective with adaptive KL penalty
        '''
        
        #get items from current batch
        states = [sample[0] for sample in self.batch]
        actions = [sample[1] for sample in self.batch]
        rewards = [sample[2] for sample in self.batch]
        old_lps = [sample[3] for sample in self.batch]
        terminals = [sample[4] for sample in self.batch]

        #calculate cumulative discounted rewards
        Gt = self.process_rewards(rewards, terminals)

        #track divergence
        divergence = 0

        #perform k-epoch update
        for epoch in range(self.k_epoch):

            #get ratio
            new_lps, entropies = self.actor_critic.evaluate_action(states, actions)
            #same as new_prob / old_prob
            ratios = torch.exp(new_lps - torch.Tensor(old_lps))

            #compute advantages
            states_tensor = torch.stack([torch.from_numpy(state).float().unsqueeze(0) for state in states]).squeeze(1)
            vals = self.actor_critic.critic(states_tensor).squeeze(1).detach()
            advantages = Gt - vals

            #get loss with adaptive kl penalty
            divergence = self.kl_divergence(old_lps, new_lps).detach()
            loss = -ratios * advantages + self.β * divergence

            #SGD via Adam
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        #update adaptive penalty
        if divergence >= 1.5 * self.δ:
            self.β *= 2
        elif divergence <= self.δ / 1.5:
            self.β /= 2
        
        #clear batch buffer
        self.batch = []
            
    def clipped_update(self):
        ''' Update policy using clipped surrogate objective
        '''
        #get items from trajectory
        states = [sample[0] for sample in self.batch]
        actions = [sample[1] for sample in self.batch]
        rewards = [sample[2] for sample in self.batch]
        old_lps = [sample[3] for sample in self.batch]
        terminals = [sample[4] for sample in self.batch]

        #calculate cumulative discounted rewards
        Gt = self.process_rewards(rewards, terminals)

        #perform k-epoch update
        for epoch in range(self.k_epoch):

            #get ratio
            new_lps, entropies = self.actor_critic.evaluate_action(states, actions)

            ratios = torch.exp(new_lps - torch.Tensor(old_lps))

            #compute advantages
            states_tensor = torch.stack([torch.from_numpy(state).float().unsqueeze(0) for state in states]).squeeze(1)
            vals = self.actor_critic.critic(states_tensor).squeeze(1).detach()
            advantages = Gt - vals

            #clip surrogate objective
            surrogate1 = torch.clamp(ratios, min=1 - self.ϵ, max=1 + self.ϵ) * advantages
            surrogate2 = ratios * advantages

            #loss, flip signs since this is gradient descent
            loss =  -torch.min(surrogate1, surrogate2) + self.c1 * self.loss_func(Gt, vals) - self.c2 * entropies

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        
        #clear batch buffer
        self.batch = []

In [37]:
#Make environment
#env = gym.make('CartPole-v1')
env = gym.make('LunarLander-v2')

#seeds
np.random.seed(2)
torch.manual_seed(2)

#environment parameters
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

In [45]:
#Experiment Hyperparameters

#CartPole hyperparameters
ppo_policy = PPO_policy(γ=0.99, ϵ=0.2, β=1, δ=0.01, c1=0.5, c2=0.01, k_epoch=40, 
                        obs_space=obs_space, action_space=action_space, α_θ = 1e-3, αv = 1e-3, hidden_dim=64)
                        #obs_space=obs_space, action_space=action_space, α_θ = 0.0003, αv = 0.001, hidden_dim=64)

#number of steps to train
TRAIN_STEPS = 100000

#max steps per episode
MAX_STEPS = 400

#batch training size
BATCH_SIZE = 1600

#solved environment score
SOLVED_SCORE = 200

In [46]:
# Setup
scores = []
scores_window = deque(maxlen=100)
state, _ = env.reset()
curr_step = 0
score = 0

# Train
for step in range(1, TRAIN_STEPS):
    curr_step += 1
    action, lp = ppo_policy.actor_critic.select_action(state)
    next_state, reward, done, _, _ = env.step(action)
    score += reward
    ppo_policy.batch.append([state, action, reward, lp, done])

    # Optimize surrogate objective
    if step % BATCH_SIZE == 0:
        ppo_policy.clipped_update()
    
    if step % 2500 == 0:
        print('\rStep {}\tAverage Score: {:.2f}'.format(step, np.mean(scores_window)))
    if np.mean(scores_window)>=200.0:
        print('\nEnvironment solved in {:d} steps!\tAverage Score: {:.2f}'.format(step-2500, np.mean(scores_window)))
        #torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
        break

    # End episode
    if done or curr_step >= MAX_STEPS:
        state, _ = env.reset()
        curr_step = 0
        scores.append(score)
        scores_window.append(score)
        score = 0
        continue
        
    # Check if solved environment, early stopping
    if len(recent_scores) >= 100 and np.array(recent_scores).mean() >= SOLVED_SCORE:
        break

    # Set state
    state = next_state   

Step 2500	Average Score: -177.43
Step 5000	Average Score: -151.54
Step 7500	Average Score: -135.88
Step 10000	Average Score: -128.90
Step 12500	Average Score: -104.55
Step 15000	Average Score: -91.47
Step 17500	Average Score: -88.67
Step 20000	Average Score: -74.06
Step 22500	Average Score: -60.94
Step 25000	Average Score: -45.83
Step 27500	Average Score: -41.77
Step 30000	Average Score: -36.34
Step 32500	Average Score: -32.85
Step 35000	Average Score: -28.06
Step 37500	Average Score: -22.83
Step 40000	Average Score: -12.00
Step 42500	Average Score: -2.22
Step 45000	Average Score: -3.49
Step 47500	Average Score: 2.37
Step 50000	Average Score: 5.44
Step 52500	Average Score: 8.85
Step 55000	Average Score: 11.39
Step 57500	Average Score: 13.38
Step 60000	Average Score: 20.33
Step 62500	Average Score: 23.97
Step 65000	Average Score: 29.15
Step 67500	Average Score: 34.44
Step 70000	Average Score: 34.34
Step 72500	Average Score: 34.31
Step 75000	Average Score: 32.49
Step 77500	Average Score:

In [56]:
#CartPole hyperparameters
ppo_policy = PPO_policy(γ=0.99, ϵ=0.2, β=1, δ=0.01, c1=0.5, c2=0.01, k_epoch=40, 
                        obs_space=obs_space, action_space=action_space, α_θ = 1e-3, αv = 1e-3, hidden_dim=64)
                        #obs_space=obs_space, action_space=action_space, α_θ = 0.0003, αv = 0.001, hidden_dim=64)

#number of steps to train
max_episodes = 1000

#max steps per episode
max_steps = 500

#batch training size
#BATCH_SIZE = 1600
BATCH_SIZE = 256

#solved environment score
SOLVED_SCORE = 200

In [57]:
# Setup
scores = []
scores_window = deque(maxlen=100)
state, _ = env.reset()
curr_step = 0
score = 0

# Train
for episode in range(max_episodes):
    for step in range(max_steps):
        curr_step += 1
        action, lp = ppo_policy.actor_critic.select_action(state)
        next_state, reward, done, _, _ = env.step(action)
        score += reward
        ppo_policy.batch.append([state, action, reward, lp, done])
    
        # Optimize surrogate objective
        if step >= BATCH_SIZE:
            ppo_policy.clipped_update()

        # End episode
        if done:
            break
            
    state, _ = env.reset()
    curr_step = 0
    scores.append(score)
    scores_window.append(score)
    score = 0
            
    if episode % 100 == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_window)))
    if np.mean(scores_window)>=200.0:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode-100, np.mean(scores_window)))
        #torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
        break

    # Check if solved environment, early stopping
    if len(recent_scores) >= 100 and np.array(recent_scores).mean() >= SOLVED_SCORE:
        break

    # Set state
    state = next_state   

Episode 0	Average Score: -143.16
Episode 100	Average Score: -192.48
Episode 200	Average Score: -203.06
Episode 300	Average Score: -171.09
Episode 400	Average Score: -150.24
Episode 500	Average Score: -192.54
Episode 600	Average Score: -203.55


KeyboardInterrupt: 