# Crane V0 Simulations


Adapted from https://github.com/kinwo/deeprl-navigation (MIT License Copyright (c) 2018 Henry Chan)

Start the environment and create the DQN agent

In [1]:
import gym
import numpy as np

# NOTE(review): 'crane-v0' is presumably registered with Gym by a project package
# that is not imported in this cell — confirm the registering import runs first.
env = gym.make('crane-v0') # Load the environment

A Q-network with two fully connected hidden layers of 64 nodes each is created, using the ReLU activation function.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Q-network: a small fully connected net giving one value per action."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        """Seed the torch RNG and create the three linear layers.

        Params
        ======
            state_size (int): Dimension of each state (input width)
            action_size (int): Number of action values produced (output width)
            seed (int): Random seed passed to ``torch.manual_seed``
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super().__init__()
        # Seed first: the layers below draw their initial weights from the
        # global torch RNG, so the seed must be set before they are built.
        self.seed = torch.manual_seed(seed)
        # Layers are created in input -> output order; the attribute names
        # are also the keys of the state_dict used by saved checkpoints.
        self.fc1 = nn.Linear(in_features=state_size, out_features=fc1_units)
        self.fc2 = nn.Linear(in_features=fc1_units, out_features=fc2_units)
        self.fc3 = nn.Linear(in_features=fc2_units, out_features=action_size)

    def forward(self, state):
        """Map a state tensor to action values (ReLU after each hidden layer)."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # The output layer is left linear: no activation on the action values.
        return self.fc3(hidden)

## DQN Agent Training

### Create DQN Agent

In [None]:
import torch
import time
from collections import deque

#from agent_script import Agent   # UNCOMMENT IF YOU ARE NOT IN A JUPYTER NOTEBOOK

import matplotlib.pyplot as plt
%matplotlib inline

# Problem dimensions and RNG seed used to build the agent.
# Observations are indexed as state[0..3] elsewhere in this notebook
# (x, x_dot, theta, theta_dot), hence a state size of 4.
state_size = 4
# Size of the action set handed to the agent — presumably the number of
# discrete actions of the environment; TODO confirm against the env definition.
action_size = 3
seed = 0

# Pass the variables rather than repeating the literals, so changing a value
# above actually changes the agent that gets built.
agent = Agent(state_size=state_size, action_size=action_size, seed=seed)

To know when the environment is solved, we compute the moving average of the score (the total reward per episode) over the last 100 episodes. If the moving average reaches a chosen threshold (target_scores), the model is saved to 'model_weight_name' and training stops.

For the Crane_v0 environment, the target score is 100 000, since it is the reward obtained by the agent when finding the flag.

In [None]:
# File name under which dqn() saves the local Q-network's weights once the
# moving-average score reaches the target.
model_weight_name = 'checkpoint_precise_2.pth'


def dqn(n_episodes=10000, max_t=2000, eps_start=1.0, eps_end=0.01, eps_decay=0.997, target_scores=100000.0):
    """Run epsilon-greedy Deep Q-Learning with the module-level ``agent`` and ``env``.

    Each episode resets the environment, reads the starting state from
    ``env.state`` and steps until the environment reports ``done`` or
    ``max_t`` steps have elapsed. Training stops early, and the local
    Q-network weights are written to ``model_weight_name``, as soon as the
    mean score over the last (up to) 100 episodes reaches ``target_scores``.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        target_scores (float): moving-average score at which training stops and the weights are saved

    Returns
    =======
        list: the total reward of every episode that was run, in order
    """
    t_begin = time.time()
    all_scores = []                       # one entry per episode, returned to the caller
    recent_scores = deque(maxlen=100)     # sliding window used for the moving average
    epsilon = eps_start

    for episode in range(1, n_episodes + 1):
        # The starting observation is taken from env.state, not from reset()'s return value.
        env.reset()
        state = env.state
        episode_score = 0

        for step in range(max_t):
            action = agent.act(state, epsilon)
            # The step result is indexed as (next_state, reward, done, ...).
            outcome = env.step(action)
            next_state, reward, done = outcome[0], outcome[1], outcome[2]

            # Hand the transition to the agent (presumably stored and learned from — see Agent).
            agent.step(state, action, reward, next_state, done)
            state = next_state
            episode_score += reward
            if done:
                print("    Episode finished after {} timesteps".format(step + 1))
                break

        recent_scores.append(episode_score)
        all_scores.append(episode_score)
        epsilon = max(eps_end, eps_decay * epsilon)   # decay exploration, floored at eps_end

        # The window does not change again this episode, so one mean serves every use below.
        moving_avg = np.mean(recent_scores)

        # Progress line is rewritten in place; every 100th episode it is kept on its own line.
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, moving_avg), end="")
        if episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, moving_avg))

        if moving_avg >= target_scores:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode, moving_avg))
            torch.save(agent.qnetwork_local.state_dict(), model_weight_name)
            break

    print("Time Elapse: {:.2f}".format(time.time() - t_begin))

    return all_scores

# Train for at most 2000 episodes of up to 1500 steps each; `scores` receives
# the list of per-episode total rewards returned by dqn().
scores = dqn(n_episodes=2000, max_t=1500, eps_start=1.0, eps_end=0.01, eps_decay=0.997, target_scores=100000.0)




### Score plot for each episode during training

In [None]:
# Plot the total reward of every training episode and save the figure.
import os

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('plots', exist_ok=True)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.savefig('plots/model_training.png', dpi=200)  # comment out to skip saving the plot
plt.show()

## Analysis and inspections

### Watch the agent running (using saved weights)

In [None]:
# Restore saved weights into the agent's local Q-network.
# (Alternative checkpoint used previously: 'checkpoint_precise.pth'.)
agent.qnetwork_local.load_state_dict(torch.load('checkpoint_precise_2.pth'))

# Start a fresh episode; the starting state is read from env.state.
env.reset()
state = env.state
score = 0        # accumulated reward
t = 0            # number of steps taken

# Roll out one rendered episode with the agent's default action selection.
done = False
while not done:
    env.render()
    time.sleep(0.008)                  # slow the animation down so it can be watched
    action = agent.act(state)
    env_info = env.step(action)
    # The step result is indexed as (next_state, reward, done, ...).
    next_state, reward, done = env_info[0], env_info[1], env_info[2]
    score += reward
    state = next_state
    t += 1

# Episode finished: report where and when it ended.
print("final state is :", state)
print("Total steps : ", t)

print("Score: {}".format(score))
env.close()

### Calculate the number of time steps and the time needed to solve the environment

In [None]:
# Choose which checkpoint to evaluate.
agent.qnetwork_local.load_state_dict(torch.load('checkpoint_precise.pth'))

# Start a fresh episode; the starting state is read from env.state.
env.reset()
state = env.state
score = 0        # accumulated reward
t = 0            # number of steps taken

# Roll out one rendered episode without pacing. The original note suggests a
# time.sleep(0.02) here would match the actual time step.
done = False
while not done:
    env.render()
    action = agent.act(state)
    env_info = env.step(action)
    # The step result is indexed as (next_state, reward, done, ...).
    next_state, reward, done = env_info[0], env_info[1], env_info[2]
    score += reward
    state = next_state
    t += 1

# Episode finished: report the end state, the step count and the elapsed
# time computed as steps * env.tau.
print("\r final state is :", state)
print("\r Total steps : ", t)
print("\r Total time is : ", env.tau * t)

print("Score: {}".format(score))
env.close()

### Plot state graphs against time

In [None]:
# Choose which checkpoint to evaluate.
# NOTE: comment out the next line to keep whatever weights the agent currently
# holds (presumably those of the last training run).
agent.qnetwork_local.load_state_dict(torch.load('checkpoint_precise.pth'))


env_info = env.reset()                             # reset the environment
state = env.state                                  # get the current state
score = 0                                          # initialize the score

# Trajectory buffers. The state right after reset is stored as the t = 0
# sample, so index 0 of every trace really is the initial state.
arr_x = [state[0]]
arr_x_dot = [state[1]]
arr_theta = [state[2]]
arr_theta_dot = [state[3]]
arr_t = [0.0]                                      # step index of each sample

t = 0.0
while True:
    env.render()
    action = agent.act(state)                      # select an action
    env_info = env.step(action)                    # send the action to the environment
    next_state = env_info[0]                       # get the next state
    reward = env_info[1]                           # get the reward
    done = env_info[2]                             # see if episode has finished
    score += reward                                # update the score
    state = next_state                             # roll over the state to next time step
    t += 1.0

    # Record the state reached after this step.
    arr_t.append(t)
    arr_x.append(state[0])
    arr_x_dot.append(state[1])
    arr_theta.append(state[2])
    arr_theta_dot.append(state[3])

    if done:                                       # exit loop if episode finished
        print("Total steps : ", t)
        print("Total time is : ", env.tau * t)
        break

# Convert step indices to seconds with the environment's own time step, the
# same factor used for the "Total time" printout above.
arr_t = env.tau * np.array(arr_t)
print("Score: {}".format(score))
env.close()




In [None]:
def _plot_state_trace(times, values, trace_label, y_label, title,
                      goal_value, goal_label, save_path=None):
    """Plot one recorded state component against time with its goal level.

    Params
    ======
        times: time stamps of the samples (x axis, in seconds)
        values: recorded values of the state component (y axis)
        trace_label (str): legend entry for the recorded trace
        y_label (str): y-axis label
        title (str): figure title
        goal_value (float): height of the red horizontal goal line
        goal_label (str): legend entry for the goal line
        save_path (str or None): if given, the figure is also saved there at 200 dpi
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(times, values, label=trace_label)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Time [s]')
    ax.set_title(title)
    # The goal line spans the whole recorded duration.
    ax.hlines(goal_value, 0, times[-1], colors='r', linestyles='solid', label=goal_label)
    ax.legend()
    ax.grid()
    if save_path is not None:
        fig.savefig(save_path, dpi=200)
    plt.show()


# One figure per state component. To save a figure, pass e.g.
# save_path='plots/model_1_x.png' (likewise model_1_x_dot, model_1_theta,
# model_1_theta_dot for the other three).
_plot_state_trace(arr_t, arr_x, 'Cart Position', 'X Position [m]',
                  'Position / Time', 1.0, 'Goal Position')

_plot_state_trace(arr_t, arr_x_dot, 'Cart Velocity', 'X Velocity [m / s]',
                  'Velocity / Time', 0.0, 'Goal Velocity')

_plot_state_trace(arr_t, arr_theta, 'Pole angle', 'Theta [rad]',
                  'Theta / Time', np.pi, 'Goal Angle')

_plot_state_trace(arr_t, arr_theta_dot, 'Pole Anglular Velocity', 'Angular Velocity [rad / s]',
                  'Angular Velocity / Time', 0.0, 'Goal Velocity')

# First recorded sample of each trace.
initial_state = [arr_x[0], arr_x_dot[0], arr_theta[0], arr_theta_dot[0]]
print('initial state is : ', initial_state)