In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import math
from DRQN.network import ADRQN
from DRQN.experience_replay import ExpBuffer
from DRQN.agent import Agent
from DRQN.epsilon_greedy_strategy import EpsilonGreedyStrategy
from DRQN.plot import plot, get_moving_average

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Execute the sibling notebook env.ipynb in this kernel.
# NOTE(review): presumably this is what defines EnvManager, which is used
# below without an import -- confirm.
%run ./env.ipynb

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
def save(model, path):
    """Serialize ``model.state_dict()`` to ``path`` with ``torch.save``.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        path: Destination file path, or any file-like object accepted by
            ``torch.save``.

    When ``path`` is a filesystem path, its parent directory is created if it
    does not exist yet; ``torch.save`` itself does not create directories and
    would otherwise fail.
    """
    import os  # local import: only needed for the directory handling below

    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        # An empty parent means "current directory" -- nothing to create.
        if parent:
            os.makedirs(parent, exist_ok=True)
    torch.save(model.state_dict(), path)

def load(path, n_actions, state_size, embedding_size, map_location="cpu"):
    """Build a new ``ADRQN`` and fill it with the weights stored at ``path``.

    Args:
        path: Checkpoint written by ``torch.save(model.state_dict(), path)``.
        n_actions: Forwarded to the ``ADRQN`` constructor.
        state_size: Forwarded to the ``ADRQN`` constructor.
        embedding_size: Forwarded to the ``ADRQN`` constructor.
        map_location: Forwarded to ``torch.load``. The default ``"cpu"`` lets
            a checkpoint saved from a CUDA model be read on a machine without
            CUDA.

    Returns:
        The newly constructed ``ADRQN`` with the loaded parameters. The caller
        is responsible for moving it to another device if needed.
    """
    model = ADRQN(n_actions, state_size, embedding_size)
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    # load_state_dict copies the values into the model's existing parameters,
    # so remapping the stored tensors does not change where the model lives.
    state_dict = torch.load(path, map_location=map_location)
    model.load_state_dict(state_dict)
    return model


In [3]:
# Build the environment wrapper, passing it the torch device selected above.
env = EnvManager(device)
# Sizes reported by the environment; both are passed to the ADRQN constructor
# further down.
state_size = env.num_state_features()
n_actions = env.num_actions_available()

In [4]:
# --- Reproducibility -------------------------------------------------------
# Seed the RNGs this notebook draws from directly: np.random is used for the
# observation blinding in the training loop, torch for everything built with
# torch afterwards.
# NOTE(review): Agent / EnvManager may use other RNGs (e.g. `random`) that are
# not seeded here -- confirm.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Network ---------------------------------------------------------------
embedding_size = 8      # passed to the ADRQN constructor
learning_rate = 0.001   # Adam learning rate

# --- Replay buffer ---------------------------------------------------------
memory_size = 1000      # first ExpBuffer argument
sample_length = 3       # second ExpBuffer argument
replay_buffer = ExpBuffer(memory_size, sample_length)
batch_size = 8          # argument to replay_buffer.sample(...)

# --- Exploration schedule --------------------------------------------------
# After the EXPLORE warm-up, epsilon decays exponentially from eps_start
# towards eps_end with time constant eps_decay (in episodes).
eps_start = 0.9
eps = eps_start
eps_end = 0.05
eps_decay = 10
EXPLORE = 50            # no network updates while episode <= EXPLORE

# --- Training --------------------------------------------------------------
M_episodes = 100        # upper bound on the number of training episodes
gamma = 0.999           # discount factor in the TD target
blind_prob = 0          # probability of zeroing an observation
target_update = 5  # Interval for updating the target net

In [5]:
# Exploration strategy and the agent that selects actions with it.
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
agent = Agent(strategy, n_actions, device)

# Online network and its target copy. Both are placed on `device` (CUDA when
# available, CPU otherwise) instead of unconditionally calling .cuda(), which
# fails on machines without a GPU.
adrqn = ADRQN(n_actions, state_size, embedding_size).to(device)
adrqn_target = ADRQN(n_actions, state_size, embedding_size).to(device)
# Start the target network from the same weights as the online network.
adrqn_target.load_state_dict(adrqn.state_dict())

# Only the online network is optimised; the target is refreshed by copying.
optimizer = torch.optim.Adam(adrqn.parameters(), lr=learning_rate)

INIT THE NETWORK
INIT THE NETWORK


In [6]:
# NOTE: despite its name this list stores, per episode, the sum of the
# sign-clipped rewards (current_return), not a duration.
episode_durations = []

for episode in range(M_episodes):
    done = False
    hidden = None          # recurrent state handed to / returned by agent.act
    last_action = 0
    current_return = 0
    env.reset()
    last_observation = env.get_state()

    # Learning (and epsilon decay) only start after the EXPLORE warm-up.
    learning = episode > EXPLORE
    if learning:
        # The schedule depends only on the episode index, so compute it once
        # per episode rather than on every step.
        eps = eps_end + (eps_start - eps_end) * \
            math.exp((-1*(episode-EXPLORE))/eps_decay)

    while not done:
        # Reshape the previous observation and one-hot previous action to
        # (1, 1, -1) and move them to `device` (not a hard-coded .cuda()).
        obs_input = torch.tensor(last_observation).float().view(1, 1, -1).to(device)
        prev_action_input = F.one_hot(
            torch.tensor(last_action), n_actions).view(1, 1, -1).float().to(device)
        action, hidden = agent.act(
            obs_input, prev_action_input, eps, adrqn, hidden=hidden)

        observation, reward, done = env.take_action(action)
        if np.random.rand() < blind_prob:
            # Induce partial observability
            observation = np.zeros_like(observation)

        # Clip the reward to its sign before accumulating / storing it.
        reward = np.sign(reward)
        current_return += reward
        replay_buffer.write_tuple(
            (last_action, last_observation, action, reward, observation, done))

        last_action = action
        last_observation = observation

        # Updating Networks
        if learning:
            last_actions, last_observations, actions, rewards, observations, dones = replay_buffer.sample(
                batch_size)

            # Q(s, a) of the online network for the actions actually taken.
            q_values, _ = adrqn.forward(
                last_observations, F.one_hot(last_actions, n_actions).float())
            q_values = torch.gather(
                q_values, -1, actions.unsqueeze(-1)).squeeze(-1)

            # TD target from the target network. No gradient is needed here,
            # so skip building the autograd graph altogether.
            with torch.no_grad():
                predicted_q_values, _ = adrqn_target.forward(
                    observations, F.one_hot(actions, n_actions).float())
                target_values = rewards + \
                    (gamma * (1 - dones.float()) *
                     torch.max(predicted_q_values, dim=-1)[0])

            # Update network parameters
            optimizer.zero_grad()
            loss = F.mse_loss(q_values, target_values)
            loss.backward()
            optimizer.step()

    # The while loop only exits once the episode is done.
    episode_durations.append(current_return)

    # Periodically sync the target network with the online network.
    if episode % target_update == 0:
        adrqn_target.load_state_dict(adrqn.state_dict())

    # Early stop once the 10-episode moving average of the return reaches 8.
    if get_moving_average(10, episode_durations)[-1] >= 8:
        break

l = np.array(episode_durations)
print(l)
print(np.average(l))

[6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
6.0


In [12]:
# Save the trained online network's state_dict to ./model/model.
save(adrqn,"./model/model")

In [15]:
# Rebuild an ADRQN with the same constructor arguments and load the weights
# saved at ./model/model into it.
model = load("./model/model",n_actions, state_size, embedding_size)


INIT THE NETWORK
ADRQN(
  (embedder): Linear(in_features=1, out_features=8, bias=True)
  (obs_layer): Linear(in_features=12, out_features=16, bias=True)
  (obs_layer2): Linear(in_features=16, out_features=32, bias=True)
  (lstm): LSTM(40, 128, batch_first=True)
  (out_layer): Linear(in_features=128, out_features=1, bias=True)
)
