## Initializing Stuff 

In [1]:
import numpy as np
import pandas as pd
import gymnasium as gym

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm import tqdm

## Define the Actor and the  Critic Network

In [2]:
class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(ActorCritic, self).__init__()

        self.num_actions = num_actions
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)

        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)
    
    def forward(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        value = F.relu(self.critic_linear1(state))
        value = self.critic_linear2(value)
        
        policy_dist = F.relu(self.actor_linear1(state))
        policy_dist = F.softmax(self.actor_linear2(policy_dist), dim=1)

        return value, policy_dist

## Define the Q-Actor-Critic Trainer

In [3]:
def train_q_actor_critic(
    env: gym.Env,
    actor_critic_net: ActorCritic,
    optimizer: optim.Optimizer,
    num_episodes: int = int(1e4),
    max_steps_per_episode: int = 500,
    disc_factor: float = .99,
    quiet: bool = False,
    cuda: bool = False
):
    num_outputs = env.action_space.n

    all_lengths = []
    average_lengths = []
    all_rewards = []
    average_rewards = []
    entropy_term = 0

    prog_bar = tqdm(range(num_episodes), desc='Training Episode', disable=quiet)
    for episode in prog_bar:
        log_probs = []
        values = []
        rewards = []

        state, _ = env.reset()
        for steps in range(max_steps_per_episode):
            value, policy_dist = actor_critic_net(state)
            value = value.detach().numpy()[0,0]
            dist = policy_dist.detach().numpy() 

            action = np.random.choice(num_outputs, p=np.squeeze(dist))
            log_prob = torch.log(policy_dist.squeeze(0)[action])
            entropy = -np.sum(np.mean(dist) * np.log(dist))
            new_state, reward, terminated, truncated, _ = env.step(action)

            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)
            entropy_term += entropy
            state = new_state
            
            if terminated or truncated or (steps == max_steps_per_episode-1):
                Qval, _ = actor_critic_net(new_state)
                Qval = Qval.detach().numpy()[0,0]
                all_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                average_rewards.append(np.mean(all_rewards[-10:]))
                if episode % 10 == 0:
                    prog_bar.set_postfix_str(
                        f"reward: {np.sum(rewards)}, len: {steps}, "\
                        f"average reward: {average_rewards[-1]} , average len: {average_lengths[-1]}"
                    )
                break

        # compute Q values
        Qvals = np.zeros_like(values)
        for t in reversed(range(len(rewards))):
            Qval = rewards[t] + disc_factor * Qval
            Qvals[t] = Qval
  
        #update actor critic
        values = torch.FloatTensor(values)
        Qvals = torch.FloatTensor(Qvals)
        log_probs = torch.stack(log_probs)

        actor_loss = (-log_probs * Qvals).mean()
        critic_loss = 0.5 * (Qvals - values).pow(2).mean()
        ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

        optimizer.zero_grad()
        ac_loss.backward()
        optimizer.step()


## Part 1: CartPole-v1

### Definitions

In [4]:
# Hyperparams
hidden_size = 16
learning_rate = 1e-3

cartenv = gym.make('CartPole-v1')

num_inputs = cartenv.observation_space.shape[0]
num_outputs = cartenv.action_space.n

actor_critic = ActorCritic(num_inputs, num_outputs, hidden_size)
optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate)

### Train the agent

In [5]:
# train_q_actor_critic(cartenv, actor_critic, optimizer, int(2.048e3))

In [6]:
# torch.save(actor_critic,'m44_saileshr_assignment3_part1_qac_cartpolev1.pth')

## Part 2.1: LunarLander-v2

In [7]:
# Hyperparams
hidden_size = 32
learning_rate = 3e-3

lunarenv = gym.make("LunarLander-v2")

num_inputs = lunarenv.observation_space.shape[0]
num_outputs = lunarenv.action_space.n

actor_critic = ActorCritic(num_inputs, num_outputs, hidden_size)
optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate)

In [8]:
train_q_actor_critic(lunarenv, actor_critic, optimizer, int(1e4), max_steps_per_episode=1000)

Training Episode: 100%|██████████| 10000/10000 [50:04<00:00,  3.33it/s, reward: 203.5905115511382, len: 409, average reward: 190.57264875405946 , average len: 649.3]   


In [9]:
# torch.save(actor_critic,'m44_saileshr_assignment3_part1_qac_lunarlander.pth')