In [1]:
# coding: utf-8
from DQN import *
import gym
from quanser_robots.common import GentlyTerminating


# Load the configuration settings.
# NOTE: print_config, load_config, Policy, np and torch are not imported
# explicitly in this cell; they are expected to come from `from DQN import *`.
config_path = "config.yml"
print_config(config_path)
config = load_config(config_path)
training_config = config["training_config"]
seed = training_config["random_seed"]
n_episodes = training_config["n_episodes"]
max_episode_step = training_config["max_episode_step"]
n_update_target = training_config["n_update_target"]
exp_number = training_config["exp_number"]
save_model_path = training_config["save_model_path"]
render_flag = training_config["render"]
save_best = training_config["save_best"]
save_thres = training_config["save_thres"]

# Exploration schedule: either a fixed epsilon or an exponential decay from
# epsilon_start towards epsilon_final, selected by the config.
if training_config["use_fix_epsilon"]:
    def epsilon_by_frame(frame_idx):
        """Return the constant exploration rate from the config.

        `frame_idx` is accepted for interface compatibility and ignored.
        """
        return training_config["fix_epsilon"]
else:
    epsilon_start = training_config["epsilon_start"]
    epsilon_final = training_config["epsilon_final"]
    epsilon_decay = training_config["epsilon_decay"]

    def epsilon_by_frame(frame_idx):
        """Return the exploration rate for index `frame_idx`.

        Decays exponentially from epsilon_start (at frame_idx == 0) towards
        epsilon_final with time constant epsilon_decay.
        """
        return epsilon_final + (epsilon_start - epsilon_final) * np.exp(-1. * frame_idx / epsilon_decay)

# Seed the torch and NumPy global random number generators.
torch.manual_seed(seed)
np.random.seed(seed)

# Environment initialization.
env_id = "Qube-v0"
env = GentlyTerminating(gym.make(env_id))

# Initialize the DQN algorithm object.
policy = Policy(env, config)


                                                                    


************************
*** model configuration ***
load_model: true
model_path: storage/exp_6.ckpt
n_actions: 9
n_hidden: 1
size_hidden: 256
use_cuda: true

*** train configuration ***
batch_size: 64
epsilon_decay: 1000
epsilon_final: 0.05
epsilon_start: 0.9
exp_number: 66
fix_epsilon: 0.1
gamma: 0.99
learning_rate: 0.0001
max_episode_step: 500
memory_size: 100000
n_episodes: 25000
n_update_target: 6
random_seed: 1234
render: false
save_best: true
save_model_path: storage/exp_6_rr.ckpt
save_thres: 510
use_fix_epsilon: true

************************




Use CUDA


In [2]:
import pickle

# Restore previously stored data from disk.
# The context manager guarantees the file handle is closed even if
# unpickling raises.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
# NOTE(review): the recorded output of this cell shows pickle.load failing with
# "TypeError: _reconstruct: First argument must be a sub-type of ndarray" --
# possibly a library-version mismatch with the environment that wrote the
# file; confirm before relying on `data`.
with open('storage/data_rrr.pkl', 'rb') as f:
    data = pickle.load(f)

TypeError: _reconstruct: First argument must be a sub-type of ndarray

In [None]:
# Pre-fill the policy's replay buffer with the transitions restored from disk.
replay_buffer = policy.replay_buffer

# `data` is the object loaded from the pickle file. The name `data_rr` used
# here before is never defined in this notebook, so a fresh
# Restart & Run All raised NameError.
# Each record supplies the five positional arguments of replay_buffer.push;
# the training loop calls it as (state, action, reward, next_state, done).
for transition in data:
    replay_buffer.push(*transition[:5])

In [None]:

# Statistics collected over the run (used by save_fig).
losses = []        # one entry per optimisation step
all_rewards = []   # scaled total reward of each episode
avg_rewards = []   # mean of the (up to) 10 most recent episode rewards
epsilons = []      # exploration rate used in each episode

# Half-width of the discrete action index range. It maps an action index in
# [0, n_actions - 1] linearly onto a command in [-5, 5]. Loop-invariant.
half_range = (policy.n_actions - 1) / 2

# Train the Q-network for n_episodes episodes.
# try/finally guarantees the environment is closed even when training is
# interrupted (e.g. a kernel interrupt) or an exception is raised.
try:
    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset()
        # Scale observation entries 4 and 5 down by 20 (in place).
        # NOTE(review): presumably these are velocity components -- confirm.
        state[4:6] /= 20
        # The schedule is indexed by episode number, not by environment step.
        epsilon = epsilon_by_frame(i_episode)
        epsilons.append(epsilon)

        for step in range(max_episode_step):
            if render_flag:
                env.render()

            # Choose an action index and convert it to a command in [-5, 5].
            action = policy.act(state, epsilon)
            f_action = 5 * (action - half_range) / half_range
            next_state, reward, done, _ = env.step(f_action)
            reward = 100 * reward   # scale the environment reward
            next_state[4:6] /= 20   # same scaling as applied to `state`
            policy.replay_buffer.push(state, action[0], reward, next_state, done)
            state = next_state
            episode_reward += reward

            if done:
                break

            # Optimise only once the buffer holds more than one batch.
            if len(policy.replay_buffer) > policy.batch_size:
                loss = policy.train()
                losses.append(loss.item())

        all_rewards.append(episode_reward)
        # Mean over the last 10 episodes, including the one just finished.
        recent_mean = np.mean(all_rewards[-10:])
        avg_rewards.append(recent_mean)

        if i_episode % 50 == 0:
            # Save the results figure every 50 episodes.
            save_fig(i_episode, all_rewards, avg_rewards, losses, epsilons, exp_number)

        if i_episode % n_update_target == 0:
            # Update the target network.
            policy.update_target()

        # The model is checkpointed after every episode.
        policy.save_model(save_model_path)
        if save_best and i_episode > 100:
            # Early stop: after episode 100, stop as soon as an episode beats
            # the recent mean by the given ratio. The checkpoint written just
            # above is then the final model.
            ratio = 1.1
            if episode_reward > ratio * recent_mean:
                print("Save model with episode reward %s " % (episode_reward))
                print("Model path: %s " % (save_model_path))
                break
finally:
    env.close()