In [5]:
import gym
import numpy as np
import torch
import matplotlib
import matplotlib.pyplot as plt
import time
import tqdm as notebook_tqdm

from collections import deque
from agent import Agent, FloatTensor
# from replay_buffer import ReplayMemory, Transition
# from  torch.autograd import Variable

# set up matplotlib
# is_ipython is True when the name of the active matplotlib backend contains 'inline'.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # NOTE(review): `display` is not referenced by any cell visible in this notebook.
    from IPython import display

# Switch pyplot to interactive mode.
plt.ion()

use_cuda = torch.cuda.is_available()
# FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
# NOTE(review): `device` is not referenced by any cell visible here; FloatTensor is
# imported from the `agent` module instead — presumably device handling lives there, confirm.
device = torch.device("cuda" if use_cuda else "cpu")

# --- Hyper-parameters -------------------------------------------------------
BATCH_SIZE = 64  # transitions sampled from the replay memory per learning step (run_episode)
# Soft-update rate for target parameters (alternative value noted: 1e-3).
# NOTE(review): not referenced by the visible cells; train() hard-copies q_local
# into q_target every TARGET_UPDATE episodes instead.
TAU = 0.005
gamma = 0.99  # passed to agent.learn() in run_episode
LEARNING_RATE = 0.001  # NOTE(review): not referenced by the visible cells — confirm where it is consumed
TARGET_UPDATE = 50  # episodes between copies of q_local's weights into q_target (train)

num_episodes = 2**12  # upper bound on training episodes (4096)
print_every = 10  # progress line is printed every `print_every` episodes
hidden_dim = 16  # third argument to Agent(); other values noted by the author: 12, 32, 16, 64, 16
min_eps = 0.01  # lower bound returned by epsilon_annealing()
max_eps_episode = 50  # passed as `max_episode` to epsilon_annealing(): episodes per decay step

env = gym.make('CartPole-v1')

# Sizes are read from the environment and handed to the Agent constructor.
space_dim =  env.observation_space.shape[0] # n_spaces
action_dim = env.action_space.n # n_actions
print('input_dim: ', space_dim, ', output_dim: ', action_dim, ', hidden_dim: ', hidden_dim)

# train() stops early once the mean score over the last 100 episodes reaches this value.
threshold = env.spec.reward_threshold
print('threshold: ', threshold)

agent = Agent(space_dim, action_dim, hidden_dim)

input_dim:  4 , output_dim:  2 , hidden_dim:  16
threshold:  475.0


In [6]:
def epsilon_annealing(i_epsiode, max_episode, min_eps: float):
    """Return the exploration rate for a given episode index.

    The rate decays in steps: with ``stage = i_epsiode // max_episode`` the
    candidate value is ``1 / (stage + 1)``, so it stays constant for
    ``max_episode`` consecutive episodes and then drops (1, 1/2, 1/3, ...).
    The result is never smaller than ``min_eps``.

    Args:
        i_epsiode: Index of the current episode.
        max_episode: Number of episodes per decay stage (used as the divisor).
        min_eps: Lower bound for the returned value.

    Returns:
        The larger of ``1 / (stage + 1)`` and ``min_eps``.
    """
    stage = i_epsiode // max_episode
    decayed = 1 / (stage + 1)
    # Same selection rule as max(decayed, min_eps): the floor wins only when strictly larger.
    return min_eps if min_eps > decayed else decayed

def save(directory, filename):
    """Write the weights of the global ``agent``'s two Q-networks to disk.

    ``agent.q_local.state_dict()`` is stored as ``<directory>/<filename>_local.pth``
    and ``agent.q_target.state_dict()`` as ``<directory>/<filename>_target.pth``.
    The output directory is created first when it is missing, so a finished
    training run is not lost to a missing folder.

    Args:
        directory: Folder to write into; created (with parents) if absent.
        filename: Prefix shared by both checkpoint files.
    """
    import os  # local import keeps this cell runnable without touching the import cell

    # An empty string is left alone: os.makedirs('') raises, and the path
    # formatting below is unchanged from the original for that case.
    if directory:
        os.makedirs(directory, exist_ok=True)

    torch.save(agent.q_local.state_dict(), '%s/%s_local.pth' % (directory, filename))
    torch.save(agent.q_target.state_dict(), '%s/%s_target.pth' % (directory, filename))

In [7]:
def run_episode(env, agent, eps):
    """Play one episode with ``agent`` in ``env`` and learn along the way.

    Every step asks the agent for an action (passing ``eps`` through to
    ``agent.get_action``), applies it to the environment, stores the transition
    in ``agent.replay_memory`` and, once the memory holds more than
    ``BATCH_SIZE`` entries, samples a batch and calls ``agent.learn`` on it
    with the global ``gamma``.

    Args:
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step()`` returns a 4-tuple ``(next_state, reward, done, info)``.
        agent: Object providing ``get_action``, ``replay_memory`` and ``learn``.
        eps: Value forwarded to ``agent.get_action`` for this episode.

    Returns:
        The sum of the rewards reported by the environment during the episode.
    """
    observation = env.reset()
    episode_return = 0
    finished = False

    while not finished:
        action = agent.get_action(FloatTensor([observation]), eps)
        next_observation, reward, finished, _ = env.step(action.item())

        # The returned score accumulates the environment's own reward ...
        episode_return += reward

        # ... while the transition that ends the episode is stored with reward -1.
        stored_reward = -1 if finished else reward

        # Store the transition in memory
        transition = (
            FloatTensor([observation]),
            action,
            FloatTensor([stored_reward]),
            FloatTensor([next_observation]),
            FloatTensor([finished]),
        )
        agent.replay_memory.push(transition)

        # Learn only once the memory holds strictly more than one batch.
        if len(agent.replay_memory) > BATCH_SIZE:
            batch = agent.replay_memory.sample(BATCH_SIZE)
            agent.learn(batch, gamma)

        observation = next_observation

    return episode_return

In [None]:
def train():
    """Run the training loop on the notebook-level ``env`` and ``agent``.

    For up to ``num_episodes`` episodes: compute the exploration rate with
    ``epsilon_annealing``, play one episode with ``run_episode``, and track the
    score together with its mean over the last 100 episodes. A progress line is
    printed every ``print_every`` episodes (episode 0 excluded). The loop stops
    early once the 100-episode window is full and its mean reaches ``threshold``.

    Returns:
        A pair ``(scores, running_means)``: the score of every played episode
        and the windowed mean recorded after each of them.
    """
    recent_scores = deque(maxlen=100)  # sliding window behind the running mean
    all_scores = []
    running_means = []

    started_at = time.time()

    for i_episode in range(num_episodes):
        eps = epsilon_annealing(i_episode, max_eps_episode, min_eps)
        score = run_episode(env, agent, eps)

        recent_scores.append(score)
        all_scores.append(score)

        avg_score = np.mean(recent_scores)
        running_means.append(avg_score)

        # Whole seconds since training started, split into h / m / s for the log line.
        elapsed = int(time.time() - started_at)
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)

        if i_episode % print_every == 0 and i_episode > 0:
            progress_line = 'Episode: {:5} Score: {:5}  Avg.Score: {:.2f}, eps-greedy: {:5.2f} Time: {:02}:{:02}:{:02}'
            print(progress_line.format(i_episode, score, avg_score, eps, hours, minutes, seconds))

        # Early stop: only judged once a full 100-episode window is available.
        window_full = len(recent_scores) == recent_scores.maxlen
        if window_full and avg_score >= threshold:
            solved_line = '\n Environment solved in {:d} episodes!\tAverage Score: {:.2f}'
            print(solved_line.format(i_episode, avg_score))
            break

        # Copy q_local's weights into q_target every TARGET_UPDATE episodes (episode 0 included).
        if i_episode % TARGET_UPDATE == 0:
            agent.q_target.load_state_dict(agent.q_local.state_dict())

        # Reached only when the loop runs to its very last episode; the early-stop
        # break above skips this call.
        if i_episode == num_episodes - 1:
            agent.save_model('models/q_learn')

    return all_scores, running_means

# Run the training loop; `scores` holds every episode's score and `avg_scores`
# the mean over the last 100 episodes recorded after each episode.
scores, avg_scores = train()
# Persist the weights of both Q-networks under 'models/' with the file prefix 'dq'.
save('models/', 'dq')

  action = agent.get_action(FloatTensor([state]) , eps)


Episode:    10 Score:  27.0  Avg.Score: 27.91, eps-greedy:  1.00 Time: 00:00:00
Episode:    20 Score:  34.0  Avg.Score: 25.43, eps-greedy:  1.00 Time: 00:00:00
Episode:    30 Score:  18.0  Avg.Score: 27.74, eps-greedy:  1.00 Time: 00:00:00
Episode:    40 Score:  33.0  Avg.Score: 25.20, eps-greedy:  1.00 Time: 00:00:00
Episode:    50 Score:  15.0  Avg.Score: 23.06, eps-greedy:  0.50 Time: 00:00:01
Episode:    60 Score:  17.0  Avg.Score: 21.85, eps-greedy:  0.50 Time: 00:00:01
Episode:    70 Score:  84.0  Avg.Score: 23.83, eps-greedy:  0.50 Time: 00:00:01
Episode:    80 Score:  37.0  Avg.Score: 27.72, eps-greedy:  0.50 Time: 00:00:01
Episode:    90 Score:  95.0  Avg.Score: 29.36, eps-greedy:  0.50 Time: 00:00:02
Episode:   100 Score:  86.0  Avg.Score: 32.51, eps-greedy:  0.33 Time: 00:00:02
Episode:   110 Score: 147.0  Avg.Score: 35.38, eps-greedy:  0.33 Time: 00:00:03
Episode:   120 Score: 148.0  Avg.Score: 42.57, eps-greedy:  0.33 Time: 00:00:04
Episode:   130 Score: 107.0  Avg.Score: 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

print('length of scores: ', len(scores), ', len of avg_scores: ', len(avg_scores))

# Learning curve: per-episode score next to its 100-episode running mean.
# Everything is drawn through the Axes object; the previous version created
# `ax` but then ignored it in favour of pyplot's implicit current axes.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores, label="Score")
ax.plot(np.arange(1, len(avg_scores) + 1), avg_scores, label="Avg on 100 episodes")
# Anchor the legend just outside the right edge so it does not cover the curves.
ax.legend(bbox_to_anchor=(1.05, 1))
ax.set_ylabel('Score')
ax.set_xlabel('Episodes #')
ax.set_title('CartPole-v1 training scores')
plt.show()