In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import math
from DRQN.network import ADRQN
from DRQN.experience_replay import ExpBuffer
from DRQN.agent import Agent
from DRQN.epsilon_greedy_strategy import EpsilonGreedyStrategy
from DRQN.plot import plot, get_moving_average
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%run ./env.ipynb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
df = pd.DataFrame({'iteration': 0,"epsiode_num":0, "reward":0}, index=[0])



In [3]:
def save(model, path):
    torch.save(model.state_dict(), path)    

def load(path,n_actions,state_size, embedding_size):
    model = ADRQN(n_actions,state_size,embedding_size)
    model.load_state_dict(torch.load(path))
    return model   


In [4]:
env = EnvManager(device)
state_size = env.num_state_features()
n_actions = env.num_actions_available()


In [5]:
embedding_size = 16
M_episodes = 2000
memory_size = 1000
sample_length = 100
replay_buffer = ExpBuffer(memory_size, sample_length)
batch_size = 8
eps_start = 0.9
eps = eps_start
eps_end = 0.05
eps_decay = 10
gamma = 0.999
learning_rate = 0.001
blind_prob = 0
EXPLORE = 1000
target_update = 100  # Interval for updating the target net

In [6]:
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
agent = Agent(strategy, n_actions, device)
adrqn = ADRQN(n_actions, state_size, embedding_size).cuda()
adrqn_target = ADRQN(n_actions, state_size, embedding_size).cuda()
adrqn_target.load_state_dict(adrqn.state_dict())
optimizer = torch.optim.Adam(adrqn.parameters(), lr=learning_rate)

In [7]:
episode_rewards = []

for episode in range(M_episodes):
    done = False
    hidden = None
    last_action = 0
    current_return = 0
    env.reset()
    last_observation = env.get_state()
    episode_duration = 0
    while not done:
        action, hidden = agent.act(torch.tensor(last_observation).float().view(1, 1, -1).cuda(), F.one_hot(
            torch.tensor(last_action), n_actions).view(1, 1, -1).float().cuda(), eps, adrqn, hidden=hidden)

        observation, reward, done  = env.take_action(action)
        if np.random.rand() < blind_prob:
            # Induce partial observability
            observation = np.zeros_like(observation)

        reward = np.sign(reward)
        current_return += reward
        episode_duration =  episode_duration +1
        replay_buffer.write_tuple(
            (last_action, last_observation, action, reward, observation, done))

        last_action = action
        last_observation = observation

        # Updating Networks
        if episode > EXPLORE:
            eps = eps_end + (eps_start - eps_end) * \
                math.exp((-1*(episode-EXPLORE))/eps_decay)

            last_actions, last_observations, actions, rewards, observations, dones = replay_buffer.sample(
                batch_size)
            q_values, _ = adrqn.forward(
                last_observations, F.one_hot(last_actions, n_actions).float())
            q_values = torch.gather(
                q_values, -1, actions.unsqueeze(-1)).squeeze(-1)
            predicted_q_values, _ = adrqn_target.forward(
                observations, F.one_hot(actions, n_actions).float())
            target_values = rewards + \
                (gamma * (1 - dones.float()) *
                 torch.max(predicted_q_values, dim=-1)[0])

            # Update network parameters
            optimizer.zero_grad()
            loss = torch.nn.MSELoss()(q_values, target_values.detach())
            loss.backward()
            optimizer.step()
            
        if done:
            episode_rewards.append(current_return/episode_duration)
            new_row = pd.DataFrame({'iteration': episode,"epsiode_num": env.current_episode_index ,"reward":current_return/episode_duration}, index=[0])
            df = pd.concat([df,new_row],ignore_index=True)
            episode_duration = 0

            break

    if episode % target_update == 0:
        print(episode,episode_rewards[-1])
        #print(episode,get_moving_average(100,episode_rewards)[-1])
        adrqn_target.load_state_dict(adrqn.state_dict())

    
    if get_moving_average(100, episode_rewards)[-1] >= 0.95:
        print(episode,get_moving_average(100,episode_rewards)[-1])
        break


0 -0.14285714285714285
100 0.0967741935483871
200 0.28
300 -0.2
400 0.5384615384615384
500 0.25
600 0.09090909090909091
700 -0.1320754716981132
800 -0.07692307692307693
900 0.09333333333333334
1000 0.07692307692307693
1100 0.8888888888888888
1200 0.9145299145299145
1300 -0.92
1400 1.0
1500 -1.0
1600 1.0
1700 0.2222222222222222
1800 -0.9393939393939394
1900 0.9117647058823529


In [8]:
## Save the model
save(adrqn,"./model/model")

In [9]:
# Load the model
model = load("./model/model",n_actions, state_size, embedding_size)


In [24]:
rightClassified = 0
newDF = pd.DataFrame({"epsiode_num":0, "reward":0}, index=[0])

for j in range(113):
    env.reset(j)
    last_observation = env.get_state()
    episode_duration = 0
    last_action = 0
    current_return = 0
    done = False
    while not done:
            action, hidden = agent.act(torch.tensor(last_observation).float().view(1, 1, -1).cuda(), F.one_hot(
                torch.tensor(last_action), n_actions).view(1, 1, -1).float().cuda(), eps, adrqn, hidden=hidden)

            observation, reward, done  = env.take_action(action)
            reward = np.sign(reward)
            current_return += reward
            episode_duration =  episode_duration +1
            last_action = action
            last_observation = observation 
            if done:
                break

    acc = current_return / episode_duration
    new_row = pd.DataFrame({"epsiode_num": j ,"reward":acc}, index=[0])
    newDF = pd.concat([newDF,new_row],ignore_index=True)

    if (acc >= 0.8):
        rightClassified = rightClassified + 1
    else:
        print(env.get_episode_name())


print(rightClassified)

['Special methods should have an expected number of parameters']
['Instance and class methods should have at least one positional parameter']
['Only defined names should be listed in "__all__"']
['Instance and class methods should have at least one positional parameter']
['Statements should be on separate lines']
108


In [19]:
df

Unnamed: 0,iteration,epsiode_num,reward
0,0,0,0.000000
1,0,0,-0.142857
2,1,1,0.357143
3,2,2,0.111111
4,3,3,-0.310345
...,...,...,...
1996,1995,74,0.920000
1997,1996,75,0.833333
1998,1997,76,1.000000
1999,1998,77,-0.333333


In [23]:
df["iteration"].to_csv("data.csv",index=False)

In [26]:
newDF["reward"].to_csv("data1.csv",index=False)