In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

import os, sys
# Make the project modules (Algorithm, Environment, Replay_Buffer) importable.
# abspath() only joins the bare file name onto the current working directory,
# so the three original appends all resolved to the same directory: the parent
# of the cwd. Compute it once and register it a single time.
_project_root = os.path.dirname(os.path.dirname(os.path.abspath('Algorithm.py')))
if _project_root not in sys.path:
    sys.path.append(_project_root)
import Algorithm as ALGS
import Environment as ENVS
import Replay_Buffer


In [2]:
# import collections

# def initialize_buffer(config):
#     return SimpleExperienceBuffer(config['max_size'], config['batch_size'])

# class SimpleExperienceBuffer:
#     def __init__(self, capacity, batch_size):
#         self.buffer = collections.deque(maxlen=capacity)
#         self.batch_size = batch_size

#     def __len__(self):
#         return len(self.buffer)

#     def append(self, experience):
#         self.buffer.append(experience)

#     def sample(self):
#         indices = np.random.choice(len(self.buffer), self.batch_size)
#         states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])
#         return np.array(states), np.array(actions), np.array(rewards, dtype=np.float32), \
#                np.array(dones, dtype=np.uint8), np.array(next_states)

# Implementation Example

## First, initialize the environment, algorithm, agent, and replay buffer

In [3]:
# Experiment configuration. Each top-level key is a section that is handed to
# the matching factory further down (e.g. config['Environment'] goes to
# ENVS.initialize_env, config['Replay_Buffer'] to Replay_Buffer.initialize_buffer).
config = {
    'Learner': {
        'type': 'DDPG',
        'episodes': 200,
    },
    'Algorithm': {
        'algorithm': 'DDPG',
        'replay_buffer': True,
        'learning_rate': 0.003,
        'optimizer': 'Adam',
        'loss_function': 'MSELoss',
        'regularizer': 0,
        'recurrence': 0,
        'gamma': 0.99,
        'beta': 0,
        'epsilon_start': 1,
        'epsilon_end': 0.02,
        'epsilon_decay': 5e-05,
        'c': 200,
    },
    'Environment': {
        'env_type': 'Gym',
        'environment': 'MountainCar-v0',
        'action_space': 'discrete',
        'observation_space': 'discrete',
        'env_render': True,
        'num_agents': 1,
    },
    'Replay_Buffer': {
        'max_size': 100000,
        'batch_size': 64,
        'num_agents': 1,
    },
    'Agent': {
        'num_agents': 1,
    },
    'Network': {
        # Layer widths are given as comma-separated strings.
        'network_actor': {
            'layers': '400,300',
            'activation_function': 'ReLU',
            'output_function': 'Tanh',
            'last_layer': True,
        },
        'network_critic_head': {
            'layers': '400',
            'activation_function': 'ReLU',
            'output_function': '',
            'last_layer': False,
        },
        'network_critic_tail': {
            'layers': '300',
            'activation_function': 'ReLU',
            'output_function': '',
            'last_layer': True,
        },
    },
}

In [4]:
# Display the configuration dictionary to double-check its contents.
config

{'Learner': {'type': 'DDPG', 'episodes': 200},
 'Algorithm': {'algorithm': 'DDPG',
  'replay_buffer': True,
  'learning_rate': 0.003,
  'optimizer': 'Adam',
  'loss_function': 'MSELoss',
  'regularizer': 0,
  'recurrence': 0,
  'gamma': 0.99,
  'beta': 0,
  'epsilon_start': 1,
  'epsilon_end': 0.02,
  'epsilon_decay': 5e-05,
  'c': 200},
 'Environment': {'env_type': 'Gym',
  'environment': 'MountainCar-v0',
  'action_space': 'discrete',
  'observation_space': 'discrete',
  'env_render': True,
  'num_agents': 1},
 'Replay_Buffer': {'max_size': 100000, 'batch_size': 64, 'num_agents': 1},
 'Agent': {'num_agents': 1},
 'Network': {'network_actor': {'layers': '400,300',
   'activation_function': 'ReLU',
   'output_function': 'Tanh',
   'last_layer': True},
  'network_critic_head': {'layers': '400',
   'activation_function': 'ReLU',
   'output_function': '',
   'last_layer': False},
  'network_critic_tail': {'layers': '300',
   'activation_function': 'ReLU',
   'output_function': '',
   'last_layer': True}}}

In [5]:
# Build the environment wrapper from the 'Environment' section of the config,
# then display the resulting object.
env = ENVS.initialize_env(config['Environment'])
env

<Environment.GymEnvironment at 0x7fc13db91ad0>

In [6]:
# Create the algorithm from the environment's observation/action spaces and
# the 'Algorithm', 'Agent' and 'Network' config sections (passed as a list,
# in that order), then display it.
alg = ALGS.initialize_algorithm(
    env.get_observation_space(),
    env.get_action_space(),
    [config[section] for section in ('Algorithm', 'Agent', 'Network')],
)
alg

<Algorithm.DDPGAlgorithm at 0x7fc13ce40890>

In [7]:
# Show the plain-text (str) form of the algorithm object.
print(alg)

<Algorithm.DDPGAlgorithm object at 0x7fc13ce40890>


In [8]:
# Ask the algorithm to create an agent, then display the resulting object.
agent = alg.create_agent()
agent

<Agent.DDPGAgent at 0x7fc13ce40c50>

In [9]:
# Fetch the observation(s) exposed by the environment wrapper and display them.
# NOTE(review): the training loop further down calls env.get_observation()
# (singular) — confirm which accessor the wrapper is meant to expose.
obs = env.get_observations()
obs

array([-0.43948549,  0.        ])

In [10]:
# Wrap the observation in a tensor. The recorded output shows the result is
# float64, which is why it is converted with .float() before being fed to a network.
t = torch.tensor(obs)
t

tensor([-0.4395,  0.0000], dtype=torch.float64)

In [19]:
# Inspect the layers of the critic's head sub-network.
agent.critic.net_head

Sequential(
  (0): Linear(in_features=2, out_features=400, bias=True)
  (1): ReLU()
)

In [20]:
# Inspect the layers of the target critic's tail sub-network.
agent.target_critic.net_tail

Sequential(
  (0): Linear(in_features=403, out_features=300, bias=True)
  (1): ReLU()
  (2): Linear(in_features=300, out_features=1, bias=True)
)

In [13]:
# Inspect the layers of the actor network.
agent.actor.net

Sequential(
  (0): Linear(in_features=2, out_features=400, bias=True)
  (1): ReLU()
  (2): Linear(in_features=400, out_features=300, bias=True)
  (3): ReLU()
  (4): Linear(in_features=300, out_features=3, bias=True)
  (5): Tanh()
)

In [14]:
# Inspect the layers of the target actor network.
agent.target_actor.net

Sequential(
  (0): Linear(in_features=2, out_features=400, bias=True)
  (1): ReLU()
  (2): Linear(in_features=400, out_features=300, bias=True)
  (3): ReLU()
  (4): Linear(in_features=300, out_features=3, bias=True)
  (5): Tanh()
)

In [15]:
# Query the algorithm for an action for the current observation.
# The raw NumPy observation is passed, exactly as the training loop does:
# the recorded run shows get_action wrapping its input in
# torch.tensor(observation).float(), so handing it an existing tensor only
# triggers a copy-construct warning. The third argument mirrors the episode
# index that the training loop supplies.
alg.get_action(agent, obs, 1)

this is the action -> [ 0.15892069 -0.04422953  0.03120937]


  action = agent.actor(torch.tensor(observation).float()).data.numpy() + self.ou_noise.noise()


array([ 0.15892069, -0.04422953,  0.03120937])

In [16]:
# Create the replay buffer from the 'Replay_Buffer' config section and display it.
# NOTE(review): the meaning of the three trailing None arguments is not visible
# in this notebook — confirm against Replay_Buffer.initialize_buffer.
buffer = Replay_Buffer.initialize_buffer(config['Replay_Buffer'], None, None, None)
buffer

<Replay_Buffer.SimpleExperienceBuffer at 0x7fc138129d50>

## Now simulate some training

In [17]:
num_of_episodes = 50

for i in range(num_of_episodes):
    # Start a new episode. The value returned by reset() is not kept because the
    # initial observation is read back through the environment wrapper instead.
    # NOTE(review): an earlier cell calls env.get_observations() (plural) —
    # confirm which accessor name is correct.
    env.reset()
    obs = env.get_observation()
    done = False
    while not done:
        # The episode index is forwarded to the algorithm as the third argument.
        action = alg.get_action(agent, obs, i)
        next_obs, reward, done = env.step(action)

        # Store the transition as [state, action, reward, done, next_state].
        experience = [obs, action, reward, done, next_obs]
        buffer.append(experience)

        # Draw a batch from the buffer and run one update step on it.
        # NOTE(review): the recorded run failed inside alg.update with
        # "size mismatch, m1: [1 x 64], m2: [2 x 400]", i.e. a 64-element
        # vector reached a layer expecting 2 input features — presumably the
        # batch fields are unpacked in a different order than the buffer
        # returns them; verify against alg.update.
        batch = buffer.sample()
        alg.update(agent, batch, i)

        obs = next_obs

this is the action -> [ 0.1448858  -0.08321092  0.04393426]
tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False])


RuntimeError: size mismatch, m1: [1 x 64], m2: [2 x 400] at /pytorch/aten/src/TH/generic/THTensorMath.cpp:752

In [None]:
# Draw one batch from the replay buffer and unpack its five components.
obs, actions, rewards, dones, next_obs = buffer.sample()
# Show the shape of every component, in the same order as they were unpacked.
tuple(field.shape for field in (obs, actions, rewards, dones, next_obs))

In [None]:
def action2one_hot_v(action: int, num_actions: int = 4) -> torch.Tensor:
    """Encode a discrete action index as a one-hot torch tensor.

    Args:
        action: Index of the action to mark. Standard tensor indexing applies,
            so an index outside the vector raises ``IndexError``.
        num_actions: Length of the returned vector. Defaults to 4, the size
            that was previously hard-coded.

    Returns:
        A 1-D float tensor of length ``num_actions`` that is 1 at ``action``
        and 0 everywhere else.
    """
    one_hot = torch.zeros(num_actions)
    one_hot[action] = 1
    return one_hot

In [None]:
def action2one_hot(action: int, num_actions: int = 4) -> np.ndarray:
    """Encode a discrete action index as a one-hot NumPy vector.

    Args:
        action: Index of the action to mark. Standard array indexing applies,
            so an index outside the vector raises ``IndexError``.
        num_actions: Length of the returned vector. Defaults to 4, the size
            that was previously hard-coded.

    Returns:
        A 1-D float array of length ``num_actions`` that is 1 at ``action``
        and 0 everywhere else.
    """
    one_hot = np.zeros(num_actions)
    one_hot[action] = 1
    return one_hot

In [None]:
# Try the NumPy one-hot helper on action index 1 and check the type it returns.
# `i` is kept as a notebook-level name because the next cell reuses it.
i = 1
act = action2one_hot(i)
type(act)

In [None]:
# Same action index (`i` from the previous cell) through the torch variant of the helper.
act2 = action2one_hot_v(i)
act2

In [None]:
# Toy data: four random 4-element state vectors and four one-hot encodings
# of randomly drawn action indices in [0, 4).
states = [np.random.rand(4) for _ in range(4)]
actions = [action2one_hot(np.random.randint(4)) for _ in range(4)]

In [None]:
# Display the randomly generated state vectors.
states

In [None]:
# Display the one-hot encoded actions.
actions

In [None]:
# Concatenate each state with its one-hot action into one row, then stack the
# rows into a single 2-D ndarray before creating the tensor. Building a tensor
# directly from a Python list of ndarrays is the slow path (recent PyTorch
# versions warn about it); the resulting values and dtype are unchanged.
input_v = torch.tensor(
    np.array([np.concatenate([s_i, a_i]) for s_i, a_i in zip(states, actions)])
)
input_v

In [None]:
# Small fully connected network: 8 input features -> 64 -> 32 -> 1 output,
# with a ReLU after each hidden layer and no activation on the output.
net = nn.Sequential(
    nn.Linear(8, 64), nn.ReLU(),
    nn.Linear(64, 32), nn.ReLU(),
    nn.Linear(32, 1),
)

In [None]:
# Forward pass: cast the input to float32 with .float() and run it through the network.
state_action_val = net(input_v.float())

In [None]:
# Example terminal flags for a batch of four transitions.
dones = [True, False, False, True]
# Build the mask as a boolean tensor: indexing with a uint8 ByteTensor mask is
# deprecated in PyTorch, whereas torch.bool is the supported mask dtype.
done_mask = torch.tensor(dones, dtype=torch.bool)  # add .to(device) when running on an accelerator
done_mask

In [None]:
# Zero out, in place, the network outputs selected by done_mask (the entries
# flagged True in `dones`), then display the modified tensor.
# NOTE(review): this overwrites the output of net(...) in place — confirm that
# is intended if gradients are later propagated through state_action_val.
state_action_val[done_mask] = 0.00
state_action_val