In [1]:
import os, sys

import gym
import numpy as numpy
import numpy as np  # the training cell refers to NumPy through the conventional `np` alias
import torch

# Make the packages below ext/deepFibreTracking/ importable before importing from them.
sys.path.append('ext/deepFibreTracking/')

from agent_pia import Agent, Action_Scheduler
import envs.RLtractEnvironment as RLTe

In [2]:
# ---- Training configuration -------------------------------------------------
# Everything a reader might want to tune lives in this cell.

# Total number of environment steps after which the training loop stops.
max_steps = 30000000
# Size intended for the agent's replay memory (in stored transitions).
replay_memory_size = 20000
# Number of consecutive observations that form one agent input.
agent_history_length = 1
# Number of training steps between two evaluation phases.
evaluate_every = 200000
# Number of episodes played in every evaluation phase.
eval_runs = 5
# Number of optimization steps between two synchronisations of the target
# network with the main network. The training loop reads this name but it was
# never defined; 10000 is a common DQN choice - tune as needed.
network_update_every = 10000

# Upper bound on the number of steps of a single episode.
max_episode_length = 200


In [3]:
# Train the networks on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): the environment is deliberately left on 'cpu' as before, even
# when `device` is a GPU - confirm that this split is intended.
env = RLTe.RLtractEnvironment(device = 'cpu')
n_actions = env.action_space.n

# Build the agent from the configuration constants instead of repeating their
# values as literals, so that editing the configuration cell actually changes
# the agent.
# NOTE(review): inp_size is set to the number of actions - confirm that this
# matches the size of the tensor returned by state.getValue().
agent = Agent(n_actions=n_actions,
              inp_size=n_actions,
              device=device,
              hidden=256,
              agent_history_length=agent_history_length,
              memory_size=replay_memory_size)

# The scheduler picks the actions during training and evaluation.
# replay_memory_start_size previously used the same literal as the replay
# memory size, so it is tied to that constant here.
action_scheduler = Action_Scheduler(num_actions=n_actions,
                                    max_steps=max_steps,
                                    replay_memory_start_size=replay_memory_size,
                                    model=agent.main_dqn)

In [4]:
# Sanity check: reset the environment and inspect the initial state.
state = env.reset()

# Print the coordinate first so that it is still shown if reading the value fails.
start_coordinate = state.getCoordinate()
print(start_coordinate)

start_value = state.getValue()
print(start_value.shape)

[   50 -1000    50]


PointOutsideOfDWIError: While parsing 27 points for further processing, it became apparent that [[   26.5 -1377.    -10.5]
 [   26.5 -1377.     -9.5]
 [   26.5 -1377.     -8.5]
 [   26.5 -1376.    -10.5]
 [   26.5 -1376.     -9.5]
 [   26.5 -1376.     -8.5]
 [   26.5 -1375.    -10.5]
 [   26.5 -1375.     -9.5]
 [   26.5 -1375.     -8.5]
 [   27.5 -1377.    -10.5]
 [   27.5 -1377.     -9.5]
 [   27.5 -1377.     -8.5]
 [   27.5 -1376.    -10.5]
 [   27.5 -1376.     -9.5]
 [   27.5 -1376.     -8.5]
 [   27.5 -1375.    -10.5]
 [   27.5 -1375.     -9.5]
 [   27.5 -1375.     -8.5]
 [   28.5 -1377.    -10.5]
 [   28.5 -1377.     -9.5]
 [   28.5 -1377.     -8.5]
 [   28.5 -1376.    -10.5]
 [   28.5 -1376.     -9.5]
 [   28.5 -1376.     -8.5]
 [   28.5 -1375.    -10.5]
 [   28.5 -1375.     -9.5]
 [   28.5 -1375.     -8.5]] of the points doesn't lay inside of DataContainer 'HCPDataContainer-HCP100307-b0thr-10.0'.

In [5]:
# ---- Training loop ----------------------------------------------------------
# Alternates between a training epoch of `evaluate_every` environment steps
# and an evaluation phase of `eval_runs` episodes until `max_steps` steps have
# been taken.
#
# Requires `network_update_every` (steps between target-network updates) to be
# defined alongside the other configuration values.
#
# NOTE(review): PointOutsideOfDWIError is not imported in any visible cell; it
# has to be imported from the project's exception module, otherwise the
# `except` clauses below raise a NameError as soon as they are evaluated.

step_counter = 0

# Summed reward of every finished training episode, in order of completion.
rewards = []

while step_counter < max_steps:
    # Number of steps taken in the current training epoch. It must only grow
    # inside an epoch, otherwise the evaluation phase is never reached.
    epoch_step = 0

    ######## training epoch: fill the replay memory and optimize
    while epoch_step < evaluate_every:
        # Every episode starts from a freshly reset environment and a fresh
        # reward sum, whether the previous one terminated or was truncated.
        state = env.reset()
        episode_reward_sum = 0

        for _ in range(max_episode_length):
            # get action with epsilon-greedy strategy
            try:
                action = action_scheduler.get_action(step_counter, state.getValue().unsqueeze(0))
            except PointOutsideOfDWIError:
                # The state value cannot be read here; fall back to the last
                # action index (presumably the "stop" action - TODO confirm).
                action = n_actions - 1

            # perform step on environment
            next_state, reward, terminal = env.step(action)

            # increase counters
            step_counter += 1
            epoch_step += 1

            # accumulate reward for current episode
            episode_reward_sum += reward

            # add current state, action, reward and terminal flag to memory
            agent.replay_memory.add_experience(action=action,
                                               state=state,
                                               reward=reward,
                                               terminal=terminal)

            # prepare for next step
            state = next_state

            ####### optimization is happening here
            if step_counter > agent.replay_memory.size:
                loss = agent.optimize()

                ####### target network update
                if step_counter % network_update_every == 0:
                    agent.target_dqn.load_state_dict(agent.main_dqn.state_dict())

            # episode ended before the maximum number of steps
            if terminal:
                break

        # Record the reward before anything resets it.
        rewards.append(episode_reward_sum)

        # Report progress every tenth episode (mean over the last 100 episodes).
        if len(rewards) % 10 == 0:
            print("[{}], {}, {}".format(len(rewards), step_counter, np.mean(rewards[-100:])))

    ########## evaluation starting here
    eval_rewards = []
    for _ in range(eval_runs):
        state = env.reset()
        eval_episode_reward = 0

        for _eval_step in range(max_episode_length):
            # Same call as during training, plus the evaluation flag the
            # original call passed (assumes get_action accepts `evaluation`).
            try:
                action = action_scheduler.get_action(step_counter,
                                                     state.getValue().unsqueeze(0),
                                                     evaluation=True)
            except PointOutsideOfDWIError:
                action = n_actions - 1

            # The chosen action has to be handed to the environment.
            next_state, reward, terminal = env.step(action)

            eval_episode_reward += reward
            state = next_state

            if terminal:
                break

        eval_rewards.append(eval_episode_reward)

    print("Evaluation score:\n", np.mean(eval_rewards))


PointOutsideOfDWIError: While parsing 27 points for further processing, it became apparent that [[   26.5 -1377.    -10.5]
 [   26.5 -1377.     -9.5]
 [   26.5 -1377.     -8.5]
 [   26.5 -1376.    -10.5]
 [   26.5 -1376.     -9.5]
 [   26.5 -1376.     -8.5]
 [   26.5 -1375.    -10.5]
 [   26.5 -1375.     -9.5]
 [   26.5 -1375.     -8.5]
 [   27.5 -1377.    -10.5]
 [   27.5 -1377.     -9.5]
 [   27.5 -1377.     -8.5]
 [   27.5 -1376.    -10.5]
 [   27.5 -1376.     -9.5]
 [   27.5 -1376.     -8.5]
 [   27.5 -1375.    -10.5]
 [   27.5 -1375.     -9.5]
 [   27.5 -1375.     -8.5]
 [   28.5 -1377.    -10.5]
 [   28.5 -1377.     -9.5]
 [   28.5 -1377.     -8.5]
 [   28.5 -1376.    -10.5]
 [   28.5 -1376.     -9.5]
 [   28.5 -1376.     -8.5]
 [   28.5 -1375.    -10.5]
 [   28.5 -1375.     -9.5]
 [   28.5 -1375.     -8.5]] of the points doesn't lay inside of DataContainer 'HCPDataContainer-HCP100307-b0thr-10.0'.