In [1]:
!pip install gym[classic_control]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.0 (from gym[classic_control])
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame
  Attempting uninstall: pygame
    Found existing installation: pygame 2.3.0
    Uninstalling pygame-2.3.0:
      Successfully uninstalled pygame-2.3.0
Successfully installed pygame-2.1.0


In [None]:
import numpy as np
import tensorflow as tf
import gym

In [None]:
import torch

In [5]:
class SoftQAgent:
    def __init__(self, env, alpha=0.1, gamma=0.99, tau=0.01):
        self.env = env
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.tau = tau  # soft target update rate
        self.obs_dim = env.observation_space.shape[0]
        if isinstance(env.action_space, gym.spaces.Discrete):
            self.act_dim = env.action_space.n
        else:
            self.act_dim = env.action_space.shape[0]

        #self.act_dim = env.action_space.shape[0]
        self.hid1_dim = 64
        self.hid2_dim = 64
        self.batch_size = 64
        self.replay_buffer = []
        self.q1_network = self.build_network()
        self.q2_network = self.build_network()
        self.q1_target_network = self.build_network()
        self.q2_target_network = self.build_network()
        self.copy_network_weights(self.q1_network, self.q1_target_network)
        self.copy_network_weights(self.q2_network, self.q2_target_network)

    # Build the Q-value network
    def build_network(self):
        inputs = tf.keras.layers.Input(shape=(self.obs_dim,))
        hid1 = tf.keras.layers.Dense(self.hid1_dim, activation='relu')(inputs)
        hid2 = tf.keras.layers.Dense(self.hid2_dim, activation='relu')(hid1)
        q_values = tf.keras.layers.Dense(self.act_dim, activation=None)(hid2)
        model = tf.keras.models.Model(inputs=inputs, outputs=q_values)
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.alpha))
        return model

    def get_action(self, obs):
        """
        Returns an action for a given observation using the current policy.
        """
        with torch.no_grad():
            obs = torch.FloatTensor(obs).unsqueeze(0)
            q1, q2 = self.q1(obs), self.q2(obs)
            q = torch.min(q1, q2)
            _, argmax = torch.max(q, dim=1)
            action = argmax.item()
        return action

    # Copy the weights from one network to another
    def copy_network_weights(self, source_network, target_network):
        target_network.set_weights(source_network.get_weights())

    # Update the Q-value networks using the soft Q-learning algorithm
    def update_networks(self):
        # Sample a batch of transitions from the replay buffer
        batch = np.array(self.replay_buffer)[np.random.choice(len(self.replay_buffer), self.batch_size, replace=False)]
        obs = batch[:, 0]
        act = batch[:, 1]
        rew = batch[:, 2]
        next_obs = batch[:, 3]
        done = batch[:, 4]

        # Compute the target Q-values using the soft Bellman equation
        next_q1_values = self.q1_target_network.predict(next_obs)
        next_q2_values = self.q2_target_network.predict(next_obs)
        next_q_values = np.minimum(next_q1_values, next_q2_values)
        target_q_values = rew + self.gamma * (1 - done) * (next_q_values - self.tau * np.log(next_q_values))

        # Update the Q-value networks
        self.q1_network.fit(obs, target_q_values, verbose=0)
        self.q2_network.fit(obs, target_q_values, verbose=0)

        # Update the target Q-value networks using a soft update
        q1_weights = np.array(self.q1_network.get_weights())
        q1_target_weights = np.array(self.q1_target_network.get_weights())
        q1_target_weights = self.tau * q1_weights + (1 - self.tau) * q1_target_weights
        self.q1_target_network.set_weights(q1_target_weights)

        q2_weights = np.array(self.q2_network.get_weights())
        q2_target_weights = np.array(self.q2_target_network.get_weights())
        q2_target_weights = self.tau * q2_weights + (1 - self.tau) * q2_target_weights
        self.q2_target_network.set_weights(q2_target_weights)


In [None]:
import gym

# Create the environment
env = gym.make('CartPole-v1')

# Create the Soft Q-learning agent
agent = SoftQAgent(env)

# Train the agent
num_episodes = 1000
for episode in range(num_episodes):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        # Choose an action using the Q-value network and add some exploration noise
        action = agent.q1_network.predict(obs[None])[0]
        action += np.random.randn(agent.act_dim) * 0.1
        action = np.clip(action, env.action_space.low, env.action_space.high)

        # Take the chosen action and observe the next state and reward
        next_obs, rew, done, _ = env.step(action)

        # Add the transition to the replay buffer
        agent.replay_buffer.append((obs, action, rew, next_obs, done))

        # Update the Q-value networks and target networks
        agent.update_networks()

        # Update the current observation and total reward
        obs = next_obs
        total_reward += rew

    # Print the total reward for the episode
    print(f'Episode {episode + 1}: Total reward = {total_reward}')

# Test the agent
obs = env.reset()
done = False
total_reward = 0
while not done:
    # Choose the action with the highest Q-value
    action = agent.q1_network.predict(obs[None])[0]
    obs, rew, done, _ = env.step(action)
    total_reward += rew
env.close()
print(f'Test reward = {total_reward}')



AttributeError: ignored

This code will train the SoftQAgent on the CartPole-v1 environment for 1000 episodes and then test the trained agent. The output should show the total reward for each episode during training and the test reward at the end. The trained agent should be able to achieve a high test reward, indicating that it has learned a good policy for the CartPole-v1 environment.

In [None]:
import gym
import numpy as np
from collections import deque

# Import the SoftQAgent class

# Create the environment
env = gym.make('CartPole-v1')

# Create the Soft Q-learning agent
agent = SoftQAgent(env)

# Set up the training loop
num_episodes = 1000
max_steps = 200
score_history = deque(maxlen=100)

# Train the agent
for i in range(num_episodes):
    obs = env.reset()
    score = 0
    for t in range(max_steps):
        # Choose an action using the Soft Q-learning policy
        action = agent.get_action(obs)

        # Take a step in the environment
        next_obs, reward, done, info = env.step(action)

        # Add the transition to the replay buffer
        agent.replay_buffer.append((obs, action, reward, next_obs, done))

        # Update the Q-value networks
        if len(agent.replay_buffer) >= agent.batch_size:
            agent.update_networks()

        # Update the score and observation
        score += reward
        obs = next_obs

        # Check if the episode is done
        if done:
            break

    # Add the score to the score history
    score_history.append(score)

    # Print the episode score and average score over the last 100 episodes
    print(f"Episode {i}: score = {score}, average score = {np.mean(score_history)}")


  deprecation(
  deprecation(


AttributeError: ignored

In [None]:
env.observation_space.shape

(4,)

In [None]:
import pygame

  and should_run_async(code)


In [None]:
pygame.init()
print(pygame.display.list_modes())

error: ignored

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from collections import deque
import gym


class replay_buffer(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)

    def store(self, observation, action, reward, next_observation, done):
        observation = np.expand_dims(observation, 0)
        next_observation = np.expand_dims(next_observation, 0)
        self.memory.append([observation, action, reward, next_observation, done])

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        observations, actions, rewards, next_observations, dones = zip(* batch)
        return np.concatenate(observations, 0), actions, rewards, np.concatenate(next_observations, 0), dones

    def __len__(self):
        return len(self.memory)


class soft_q_net(nn.Module):
    def __init__(self, observation_dim, action_dim, alpha):
        super(soft_q_net, self).__init__()
        self.observation_dim = observation_dim
        self.action_dim = action_dim
        self.alpha = alpha
        self.fc1 = nn.Linear(self.observation_dim, 64)
        self.fc2 = nn.Linear(64, 256)
        self.fc3 = nn.Linear(256, self.action_dim)

    def forward(self, observation):
        x = self.fc1(observation)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x

    def act(self, observation, i):
        with torch.no_grad():
            q_value = self.forward(observation)
            v = self.getV(q_value)
            pi_maxent = torch.exp((q_value - v) / self.alpha)
            pi_maxent = pi_maxent / pi_maxent.sum(dim=-1, keepdim=True)

            if pi_maxent[0][0]==float('nan') or pi_maxent[0][1]==float('nan'):
              print(q_value)
              #print('q_value: ', q_value, '\nv: ', v, '\npi_maxent: ', pi_maxent)
              act1 = np.random.uniform(0, 1)
              act2 = 1 - act1
              pi_maxent[0][0] = act1
              pi_maxent[0][1] = act2
              dist = torch.distributions.Categorical(pi_maxent)
            else:
              dist = torch.distributions.Categorical(pi_maxent)

            action = dist.sample().item()
        return action

    def getV(self, q_value):
        v = self.alpha * torch.log((1 / self.alpha * q_value).exp().sum(dim=-1, keepdim=True))
        return v


def train(buffer, target_model, eval_model, gamma, optimizer, batch_size, loss_fn, count, update_freq):
    observation, action, reward, next_observation, done = buffer.sample(batch_size)

    observation = torch.FloatTensor(observation)
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    next_observation = torch.FloatTensor(next_observation)
    done = torch.FloatTensor(done)

    q_values = eval_model.forward(observation)
    next_q_values = target_model.forward(next_observation)
    next_v_values = target_model.getV(next_q_values)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    expected_q_value = reward + gamma * (1 - done) * next_v_values.squeeze(-1)

    #loss = loss_fn(q_value, expected_q_value.detach())
    loss = (expected_q_value.detach() - q_value).pow(2)
    loss = loss.mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if count % update_freq == 0:
        target_model.load_state_dict(eval_model.state_dict())


In [2]:
if __name__ == '__main__':
    gamma = 0.99           # discount rate
    learning_rate = 1e-4   # learning rate
    batch_size = 32
    update_freq = 200
    capacity = 5000 #50000
    render = False
    episode = 500 #100000
    alpha = 4

    env = gym.make('CartPole-v0')
    env = env.unwrapped
    observation_dim = env.observation_space.shape[0]    # size 4
    action_dim = env.action_space.n   # size 2

    target_net = soft_q_net(observation_dim, action_dim, alpha)   # initializing target nn
    eval_net = soft_q_net(observation_dim, action_dim, alpha)   # initializing evaluation nn
    eval_net.load_state_dict(target_net.state_dict())    # loading initialized params (weights and biases) of target nn to eval nn

    optimizer = torch.optim.Adam(eval_net.parameters(), lr=learning_rate)   # optimizer
    buffer = replay_buffer(capacity)
    loss_fn = nn.MSELoss()
    count = 0

    weight_reward = None
    for i in range(episode):
        obs = env.reset()
        reward_total = 0
        if render:
            env.render()
        while True:
            action = eval_net.act(torch.FloatTensor(np.expand_dims(obs, 0)), i)
            count += 1
            next_obs, reward, done, info, _ = env.step(action)
            buffer.store(obs, action, reward, next_obs, done)
            reward_total += reward
            obs = next_obs
            if render:
                env.render()
            if len(buffer.memory) > batch_size:
                train(buffer, target_net, eval_net, gamma, optimizer, batch_size, loss_fn, count, update_freq)

            if done:
                if not weight_reward:
                    weight_reward = reward_total
                else:
                    weight_reward = 0.99 * weight_reward + 0.01 * reward_total
                if (i+1) % 10 == 0:
                    print('episode: {}\treward: {}\tweight_reward: {:.3f}'.format(i+1, reward_total, weight_reward))
                break


  logger.warn(
  deprecation(
  deprecation(


episode: 10	reward: 25.0	weight_reward: 68.925
episode: 20	reward: 13.0	weight_reward: 64.342
episode: 30	reward: 20.0	weight_reward: 60.326
episode: 40	reward: 31.0	weight_reward: 56.468
episode: 50	reward: 15.0	weight_reward: 53.019
episode: 60	reward: 32.0	weight_reward: 49.515
episode: 70	reward: 23.0	weight_reward: 46.321
episode: 80	reward: 15.0	weight_reward: 43.614
episode: 90	reward: 30.0	weight_reward: 41.744
episode: 100	reward: 18.0	weight_reward: 40.667
episode: 110	reward: 15.0	weight_reward: 39.465
episode: 120	reward: 44.0	weight_reward: 38.662
episode: 130	reward: 23.0	weight_reward: 38.616
episode: 140	reward: 40.0	weight_reward: 38.103
episode: 150	reward: 52.0	weight_reward: 38.724
episode: 160	reward: 31.0	weight_reward: 42.145
episode: 170	reward: 101.0	weight_reward: 44.296
episode: 180	reward: 65.0	weight_reward: 49.003
episode: 190	reward: 144.0	weight_reward: 57.633
episode: 200	reward: 106.0	weight_reward: 62.742
episode: 210	reward: 134.0	weight_reward: 67.7

ValueError: ignored

In [3]:
obs_ = env.reset()

  and should_run_async(code)


In [6]:
q_values = target_net.forward(torch.FloatTensor(np.expand_dims(obs_, 0)))

In [7]:
q_values

tensor([[342.9336, 340.2656]], grad_fn=<AddmmBackward0>)

In [7]:

gamma = 0.99           # discount rate
learning_rate = 1e-4   # learning rate
batch_size = 32
update_freq = 200
capacity = 50000
render = False
episode = 100 #100000
alpha = 4

env = gym.make('CartPole-v0')
env = env.unwrapped
observation_dim = env.observation_space.shape[0]    # size 4
action_dim = env.action_space.n   # size 2

target_net = soft_q_net(observation_dim, action_dim, alpha)   # initializing target nn
eval_net = soft_q_net(observation_dim, action_dim, alpha)   # initializing evaluation nn
eval_net.load_state_dict(target_net.state_dict())    # loading initialized params (weights and biases) of target nn to eval nn

optimizer = torch.optim.Adam(eval_net.parameters(), lr=learning_rate)   # optimizer
buffer = replay_buffer(capacity)
loss_fn = nn.MSELoss()
count = 0

weight_reward = None
for i in range(episode):
    obs = env.reset()
    reward_total = 0
    if render:
        env.render()
    while True:
        action = eval_net.act(torch.FloatTensor(np.expand_dims(obs, 0)), i)
        count += 1
        next_obs, reward, done, info, _ = env.step(action)
        buffer.store(obs, action, reward, next_obs, done)
        reward_total += reward
        obs = next_obs
        if render:
            env.render()
        if len(buffer.memory) > batch_size:
            train(buffer, target_net, eval_net, gamma, optimizer, batch_size, loss_fn, count, update_freq)

        if done:
            if not weight_reward:
                weight_reward = reward_total
            else:
                weight_reward = 0.99 * weight_reward + 0.01 * reward_total
            if (i+1) % 10 == 0:
                print('episode: {}\treward: {}\tweight_reward: {:.3f}'.format(i+1, reward_total, weight_reward))
            break

episode: 10	reward: 14.0	weight_reward: 14.398
episode: 20	reward: 21.0	weight_reward: 14.642
episode: 30	reward: 24.0	weight_reward: 15.254
episode: 40	reward: 14.0	weight_reward: 15.740
episode: 50	reward: 12.0	weight_reward: 16.231
episode: 60	reward: 18.0	weight_reward: 17.174
episode: 70	reward: 33.0	weight_reward: 17.606
episode: 80	reward: 19.0	weight_reward: 17.399
episode: 90	reward: 60.0	weight_reward: 18.595
episode: 100	reward: 20.0	weight_reward: 19.141


In [None]:
target_net.

In [None]:
episode: 320	reward: 213.0	weight_reward: 209.849

episode: 710	reward: 5240.0	weight_reward: 2442.077

In [None]:
V.dim()

2

In [None]:
np.random.uniform(0, 1)

0.46622402653910955

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from collections import deque
import gym

In [2]:
class replay_buffer(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)

    def store(self, observation, action, reward, next_observation, done):
        observation = np.expand_dims(observation, 0)
        next_observation = np.expand_dims(next_observation, 0)
        self.memory.append([observation, action, reward, next_observation, done])

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        observations, actions, rewards, next_observations, dones = zip(* batch)
        return np.concatenate(observations, 0), actions, rewards, np.concatenate(next_observations, 0), dones

    def __len__(self):
        return len(self.memory)

In [None]:
deque?

In [3]:
class soft_q_net(nn.Module):
    def __init__(self, observation_dim, action_dim, alpha):
        super(soft_q_net, self).__init__()
        self.observation_dim = observation_dim
        self.action_dim = action_dim
        self.alpha = alpha
        self.fc1 = nn.Linear(self.observation_dim, 64)
        self.fc2 = nn.Linear(64, 256)
        self.fc3 = nn.Linear(256, self.action_dim)

    def forward(self, observation):
        x = self.fc1(observation)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x

    def act(self, observation):
        with torch.no_grad():
            q_value = self.forward(observation)
            v = self.getV(q_value)
            pi_maxent = torch.exp((q_value - v) / self.alpha)
            pi_maxent_ = pi_maxent / pi_maxent.sum(dim=-1, keepdim=True)
            dist = torch.distributions.Categorical(pi_maxent_)

            action = dist.sample().item()
        return q_value, v, pi_maxent, pi_maxent_, dist, action

    def getV(self, q_value):
        v = self.alpha * torch.log((1 / self.alpha * q_value).exp().sum(dim=-1, keepdim=True))
        return v

In [None]:
torch.no_grad?

In [3]:
gamma = 0.99           # discount rate
learning_rate = 1e-4   # learning rate
batch_size = 32
update_freq = 200
capacity = 50000
render = False
episode = 10 #100000
alpha = 4

In [4]:
env = gym.make('CartPole-v0')
env = env.unwrapped
observation_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

  logger.warn(
  deprecation(
  deprecation(


In [None]:
target_net = soft_q_net(observation_dim, action_dim, alpha)
eval_net = soft_q_net(observation_dim, action_dim, alpha)

In [None]:
target_net?

In [None]:
len(target_net.state_dict())

6

In [None]:
target_net.state_dict()['fc1.weight'].shape

  and should_run_async(code)


torch.Size([64, 4])

In [None]:
target_net.state_dict()['fc1.bias'].shape

torch.Size([64])

In [None]:
target_net.state_dict()['fc3.weight'].shape

torch.Size([2, 256])

In [None]:
target_net.state_dict()['fc3.weight']

In [None]:
eval_net.load_state_dict(target_net.state_dict())
#eval_net.state_dict()['fc3.weight']

<All keys matched successfully>

In [None]:
for params in eval_net.parameters():
  print(params.size())

torch.Size([64, 4])
torch.Size([64])
torch.Size([256, 64])
torch.Size([256])
torch.Size([2, 256])
torch.Size([2])


  and should_run_async(code)


In [None]:
optimizer = torch.optim.Adam(eval_net.parameters(), lr=learning_rate)

In [None]:
buffer = replay_buffer(capacity)
loss_fn = nn.MSELoss()
count = 0

In [None]:
env.observation_space

  and should_run_async(code)


Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [None]:
env.action_space

Discrete(2)

In [None]:
np.expand_dims(obs, 0)

NameError: ignored

In [None]:
env.step?

In [None]:
weight_reward = None
for i in range(episode)[:1]:
    obs = env.reset()
    reward_total = 0
    if render:
      env.render()
    while True:
      q_value, v, pi_maxent, pi_maxent_, dist, action = eval_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
      print(q_value, '||', v, '||', pi_maxent, '||', pi_maxent_, '||', dist, '||', action, '\nTotal reward: ', reward_total, 'count: ', count, '\n-------------------------------------\n') 
      count += 1
      next_obs, reward, terminated, truncated, info = env.step(action)
      buffer.store(obs, action, reward, next_obs, terminated)
      reward_total += reward
      obs = next_obs    
      if terminated:
        break
      print(count)

q value:  tensor([[0.0759, 0.0228]])
tensor([[0.0759, 0.0228]]) || tensor([[2.8220]]) || tensor([[0.5033, 0.4967]]) || tensor([[0.5033, 0.4967]]) || Categorical(probs: torch.Size([1, 2])) || 1 
Total reward:  0 count:  1 
-------------------------------------

2
q value:  tensor([[ 0.1231, -0.0118]])
tensor([[ 0.1231, -0.0118]]) || tensor([[2.8288]]) || tensor([[0.5084, 0.4916]]) || tensor([[0.5084, 0.4916]]) || Categorical(probs: torch.Size([1, 2])) || 1 
Total reward:  1.0 count:  2 
-------------------------------------

3
q value:  tensor([[ 0.1731, -0.0454]])
tensor([[ 0.1731, -0.0454]]) || tensor([[2.8379]]) || tensor([[0.5137, 0.4863]]) || tensor([[0.5137, 0.4863]]) || Categorical(probs: torch.Size([1, 2])) || 1 
Total reward:  2.0 count:  3 
-------------------------------------

4
q value:  tensor([[ 0.2177, -0.0717]])
tensor([[ 0.2177, -0.0717]]) || tensor([[2.8482]]) || tensor([[0.5181, 0.4819]]) || tensor([[0.5181, 0.4819]]) || Categorical(probs: torch.Size([1, 2])) || 0 
T

In [None]:
# both pi_maxent and pi_maxent_ are same

In [None]:
count += 1
            next_obs, reward, done, info, _ = env.step(action)
            buffer.store(obs, action, reward, next_obs, done)
            reward_total += reward
            obs = next_obs

In [None]:
env.render?