In [1]:
!pip install gym[classic_control]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.0 (from gym[classic_control])
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m102.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame
  Attempting uninstall: pygame
    Found existing installation: pygame 2.3.0
    Uninstalling pygame-2.3.0:
      Successfully uninstalled pygame-2.3.0
Successfully installed pygame-2.1.0


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from collections import deque
import gym

In [3]:
## Defining the soft Q-learning (SQL) network architecture

class Soft_Q_Net(nn.Module):
  """Soft Q-network for a discrete action space.

  A three-layer fully connected network maps an observation to one soft
  Q-value per action. The soft state value and the maximum-entropy policy
  are both derived from those Q-values using the temperature `alpha`.

  Args:
    observation_dim: number of features in one observation.
    action_dim: number of discrete actions (size of the output layer).
    alpha: entropy/temperature coefficient used by `get_V_val` and `act`.
  """

  def __init__(self, observation_dim, action_dim, alpha):
    super(Soft_Q_Net, self).__init__()
    self.alpha = alpha
    self.observation_dim = observation_dim
    self.action_dim = action_dim
    # define network architecture
    self.FC1 = nn.Linear(self.observation_dim, 64)    # input layer
    self.FC2 = nn.Linear(64, 256)                     # hidden layer
    self.FC3 = nn.Linear(256, self.action_dim)        # output layer

  def forward_pass(self, observation):
    """Return the soft Q-values for `observation`, one per action along the last dim."""
    x = F.relu(self.FC1(observation))
    x = F.relu(self.FC2(x))
    return self.FC3(x)

  def forward(self, observation):
    """Standard nn.Module entry point; delegates to `forward_pass` so `net(obs)` works."""
    return self.forward_pass(observation)

  def act(self, observation):
    """Sample an action from the maximum-entropy policy for a single observation.

    Args:
      observation: tensor holding exactly one observation (the sampled action
        is converted with `.item()`, which requires a single element).

    Returns:
      Tuple `(q_val, v_val, pi_maxent, action)`: the soft Q-values, the soft
      state value, the action probabilities, and the sampled action as a
      Python int.
    """
    with torch.no_grad():
      q_val = self.forward_pass(observation)      # soft Q-values
      v_val = self.get_V_val(q_val)               # soft state value
      # softmax(Q / alpha) is mathematically exp((Q - V) / alpha), but it is
      # evaluated with the max subtracted first, so large Q-values cannot overflow.
      pi_maxent = F.softmax(q_val / self.alpha, dim=-1)
      # If the network itself produced non-finite Q-values, fall back to a
      # uniform policy for that row instead of handing NaNs to Categorical.
      row_is_finite = torch.isfinite(pi_maxent).all(dim=-1, keepdim=True)
      uniform = torch.full_like(pi_maxent, 1.0 / pi_maxent.shape[-1])
      pi_maxent = torch.where(row_is_finite, pi_maxent, uniform)
      policy_dist = torch.distributions.Categorical(probs=pi_maxent)
      action = policy_dist.sample().item()
    return q_val, v_val, pi_maxent, action

  def get_V_val(self, q_value):
    """Return the soft state value V = alpha * log(sum_a exp(Q_a / alpha)).

    Uses `torch.logsumexp` so the result stays finite even when Q / alpha is
    too large for a direct `exp` in float32. The last dimension is reduced
    and kept with size 1.
    """
    return self.alpha * torch.logsumexp(q_value / self.alpha, dim=-1, keepdim=True)

In [4]:
class buffer_memory(object):
  """Fixed-capacity replay buffer of environment transitions, backed by a deque."""

  def __init__(self, memory_size):
    """Create an empty buffer that holds at most `memory_size` transitions."""
    self.memory_size = memory_size
    # a bounded deque drops its oldest entry when a new one is appended at capacity
    self.memory = deque(maxlen=memory_size)

  def store(self, observation, action, reward, next_observation, done):
    """Append one transition; both observations get a leading axis of size 1."""
    transition = [
      np.expand_dims(observation, 0),
      action,
      reward,
      np.expand_dims(next_observation, 0),
      done,
    ]
    self.memory.append(transition)

  def sample(self, batch_size):
    """Draw `batch_size` stored transitions at random, without replacement.

    Returns:
      `(observations, actions, rewards, next_observations, dones)`. The two
      observation entries are arrays joined along axis 0; the other three
      are tuples with one item per sampled transition.
    """
    picked = random.sample(self.memory, batch_size)
    obs_rows, actions, rewards, next_obs_rows, dones = zip(*picked)
    return (
      np.concatenate(obs_rows, 0),       # per-transition rows joined into one batch array
      actions,
      rewards,
      np.concatenate(next_obs_rows, 0),
      dones,
    )



In [5]:
def train(buffer, target_model, eval_model, gamma, optimizer, batch_size, loss_fn, count, update_freq):
  """Run one soft Q-learning gradient step on a mini-batch sampled from `buffer`.

  Args:
    buffer: object whose `sample(batch_size)` returns
      `(observations, actions, rewards, next_observations, dones)`.
    target_model: network providing `forward_pass` and `get_V_val`; used only
      to build the bootstrap target, and overwritten with `eval_model`'s
      parameters whenever `count % update_freq == 0`.
    eval_model: network being optimised; must provide `forward_pass`.
    gamma: discount factor.
    optimizer: optimizer that steps `eval_model`'s parameters.
    batch_size: number of transitions to sample.
    loss_fn: callable `loss_fn(prediction, target)` returning a scalar loss.
      If None, the mean squared error is used.
    count: running step counter supplied by the caller.
    update_freq: the target network is synced every `update_freq` counts.

  Returns:
    The scalar loss for this batch as a detached 0-dim tensor.
  """
  # collect a batch of random samples
  observations, actions, rewards, next_observations, dones = buffer.sample(batch_size)

  # convert each array / tuple to a tensor with an explicit dtype
  observations = torch.as_tensor(observations, dtype=torch.float32)
  actions = torch.as_tensor(actions, dtype=torch.int64)
  rewards = torch.as_tensor(rewards, dtype=torch.float32)
  next_observations = torch.as_tensor(next_observations, dtype=torch.float32)
  dones = torch.as_tensor(dones, dtype=torch.float32)

  # Q(s_t, a_t): the eval network's Q-value of the action that was actually taken
  q_vals = eval_model.forward_pass(observations)
  q_vals = q_vals.gather(1, actions.unsqueeze(1)).squeeze(1)

  # The bootstrap target is a constant for this update, so compute it without
  # recording an autograd graph through the target network.
  with torch.no_grad():
    next_q_vals = target_model.forward_pass(next_observations)
    next_v_vals = target_model.get_V_val(next_q_vals).squeeze(-1)
    # (1 - dones) removes the bootstrap term for transitions that ended an episode
    expected_q_vals = rewards + gamma * (1 - dones) * next_v_vals

  # calculate loss with the caller's loss function (mean squared error if none was given)
  if loss_fn is None:
    loss = (expected_q_vals - q_vals).pow(2).mean()
  else:
    loss = loss_fn(q_vals, expected_q_vals)

  optimizer.zero_grad()         # clear gradients left over from the previous step
  loss.backward()               # compute gradients of the loss w.r.t. eval_model parameters
  optimizer.step()              # apply a single parameter update

  if count % update_freq == 0:  # periodically copy the eval network's parameters into the target network
    target_model.load_state_dict(eval_model.state_dict())

  # detached so callers that only log the value do not keep the graph alive
  return loss.detach()

In [7]:
## Training setup and main loop

gamma = 0.99             # discount rate
learning_rate = 0.0001   # learning rate
batch_size = 32          # training batch size
update_freq = 200        # sync the target network with the eval network every 200 environment steps
capacity = 5000          # size of buffer memory
render = False           # rendering of cartpole window
episode = 500            # total number of episodes
alpha = 4                # entropy/temperature coefficient

env = gym.make('CartPole-v1')
env = env.unwrapped                                 # use the bare environment, without the wrappers gym.make adds
observation_dim = env.observation_space.shape[0]    # State space size: 4
action_dim = env.action_space.n                     # Action space size: 2
print(observation_dim, '||', action_dim)

target_net = Soft_Q_Net(observation_dim, action_dim, alpha)   # initializing target nn
eval_net = Soft_Q_Net(observation_dim, action_dim, alpha)     # initializing evaluation nn
eval_net.load_state_dict(target_net.state_dict())             # start both networks from the same initial parameters

optimizer = torch.optim.Adam(eval_net.parameters(), lr=learning_rate)   # optimizer (updates eval_net only)
buffer = buffer_memory(capacity)                                        # initialize buffer memory
loss_fn = nn.MSELoss()                                                  # mean squared error, passed to train()

count = 0             # number of environment steps taken so far, across all episodes
weight_reward = None  # exponentially weighted running average of episode rewards (None until the first episode ends)
loss = None           # most recent training loss (None until the buffer holds more than batch_size transitions)

for i in range(episode):
  # within each episode
  reset_out = env.reset()
  # Newer gym releases return (observation, info) from reset(); older ones return just the observation.
  obs = reset_out[0] if isinstance(reset_out, tuple) else reset_out
  reward_total = 0      # total reward collected in this episode
  if render:
    env.render()
  while True:
    obs_2d_tensor = torch.FloatTensor(np.expand_dims(obs, 0))               # add a batch axis, then convert to a float tensor
    q_val, v_val, pi_maxent, action = eval_net.act(obs_2d_tensor)           # sample an action from the agent's policy
    count += 1
    # gym's 5-tuple step API: (observation, reward, terminated, truncated, info)
    next_obs, reward, done, truncated, info = env.step(action)
    buffer.store(obs, action, reward, next_obs, done)                       # store (s_t, a_t, r_t+1, s_t+1, terminated); truncation must not cut the bootstrap
    reward_total += reward
    obs = next_obs
    if render:
      env.render()
    if len(buffer.memory) > batch_size:                                     # train once the buffer holds more samples than one batch
      loss = train(buffer, target_net, eval_net, gamma, optimizer, batch_size, loss_fn, count, update_freq)
    if done or truncated:
      # `is None` rather than truthiness, so a running average of exactly 0 is not mistaken for "unset"
      if weight_reward is None:
        weight_reward = reward_total
      else:
        weight_reward = 0.99 * weight_reward + 0.01 * reward_total          # running average of episode rewards
      if (i+1) % 10 == 0:
        shown_loss = float(loss) if loss is not None else float('nan')      # no training step may have happened yet
        print('episode: {}\treward: {}\tweight_reward: {:.3f}\tepisode loss: {:.3f}'.format(i+1, reward_total, weight_reward, shown_loss))
      break


4 || 2
episode: 10	reward: 18.0	weight_reward: 16.647	episode loss: 2.866
episode: 20	reward: 14.0	weight_reward: 17.944	episode loss: 7.887
episode: 30	reward: 12.0	weight_reward: 18.242	episode loss: 16.446
episode: 40	reward: 27.0	weight_reward: 18.093	episode loss: 22.876
episode: 50	reward: 45.0	weight_reward: 18.670	episode loss: 30.261
episode: 60	reward: 20.0	weight_reward: 19.122	episode loss: 27.133
episode: 70	reward: 10.0	weight_reward: 19.552	episode loss: 43.906
episode: 80	reward: 22.0	weight_reward: 19.807	episode loss: 17.019
episode: 90	reward: 23.0	weight_reward: 19.682	episode loss: 2.126
episode: 100	reward: 56.0	weight_reward: 21.004	episode loss: 10.332
episode: 110	reward: 24.0	weight_reward: 22.387	episode loss: 98.754
episode: 120	reward: 22.0	weight_reward: 23.663	episode loss: 4.418
episode: 130	reward: 36.0	weight_reward: 24.187	episode loss: 46.142
episode: 140	reward: 26.0	weight_reward: 25.934	episode loss: 52.887
episode: 150	reward: 13.0	weight_reward: