In [None]:
# Import the classes
from Agent import Agent
from model import DQN
from ReplayBuffer import ReplayBuffer

from unityagents import UnityEnvironment
import matplotlib.pyplot as plt
from collections import deque
import numpy as np
import torch

In [None]:
# Pick the first CUDA device when PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the selected device.
device

In [None]:
# Initialize the environment.
# The class imported at the top of the notebook is `UnityEnvironment`, and its
# constructor takes the executable path through the `file_name` keyword.
env = UnityEnvironment(file_name="/data/Banana_Linux_NoVis/Banana.x86_64")

# Use the first brain exposed by the environment; `brain_name` keys the
# per-brain results returned by `env.reset` / `env.step` in later cells.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
# Reset the environment with train_mode=True and keep the info record of the selected brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the quantities of interest first, then report them together.
action_size = brain.vector_action_space_size   # number of actions
state = env_info.vector_observations[0]        # observation of the first agent
state_size = len(state)                        # length of one observation vector

# Summary of the environment: agent count, action count and state layout.
print('Number of agents:', len(env_info.agents))
print('Number of actions:', action_size)
print('States look like:', state)
print('States have length:', state_size)

In [None]:
# Hyperparameters
BUFFER_SIZE = 100_000    # capacity of the replay buffer
BATCH_SIZE = 64          # size of each sampled minibatch
GAMMA = 0.99             # discount factor applied to rewards
LR = 0.005               # learning rate handed to the agent
UPDATE_EVERY = 5         # how often the network is updated
TAU = 0.001              # interpolation factor for the soft update of target parameters

In [1]:
# Train loop

def train_agent(n_episodes, max_t, epsilon_start, epsilon_end, epsilon_decay):
    """Train a freshly created Agent on the Unity environment.

    Relies on notebook globals defined in earlier cells: ``env``,
    ``brain_name``, ``device`` and the hyperparameter constants
    (``BUFFER_SIZE``, ``GAMMA``, ``TAU``, ``LR``, ``UPDATE_EVERY``).

    Side effects: prints progress, overwrites ``weights.pt`` after every
    episode that does not end training, and writes ``checkpoint.pth`` when
    the solved criterion is met.

    Args:
        n_episodes: Maximum number of episodes to run.
        max_t: Maximum number of time steps per episode.
        epsilon_start: Initial value of epsilon passed to ``agent.act``.
        epsilon_end: Lower bound that epsilon is never decayed below.
        epsilon_decay: Factor epsilon is multiplied by after each episode.

    Returns:
        list: Score (sum of rewards) of every episode that was run, in order.
    """
    # Initialize the agent.
    # NOTE(review): 37 / 4 are presumably the observation length and action count
    # reported by the environment-inspection cell (`state_size`, `action_size`) -- confirm.
    # NOTE(review): BATCH_SIZE is defined with the other hyperparameters but is not
    # passed here; check whether Agent accepts a batch-size argument.
    agent = Agent(state_size=37, action_size=4, device=device,
                  buffer_size=BUFFER_SIZE, gamma=GAMMA, tau=TAU, lr=LR,
                  update_every=UPDATE_EVERY, fc1=128, fc2=256, fc3=64)
    # Initialize the scores list and the rolling window of the last 100 scores
    scores = []
    scores_window = deque(maxlen=100)
    epsilon = epsilon_start
    # Episodes are numbered from 1; the upper bound is inclusive so that exactly
    # n_episodes episodes are run.
    for episode in range(1, n_episodes + 1):
        # Reset the environment and take the first agent's observation
        state = env.reset(train_mode=True)[brain_name].vector_observations[0]
        score = 0
        # Step through the episode for at most max_t time steps
        for _ in range(max_t):
            # Select the action
            action = agent.act(state, epsilon)
            # Get the next state, reward and done flag for the first agent
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            # Hand the transition to the agent
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        # Decay epsilon once per episode, never going below epsilon_end
        epsilon = max(epsilon_end, epsilon_decay * epsilon)
        avg_score = np.mean(scores_window)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, avg_score), end="")
        if episode % 100 == 0:
            # Same text without end="" so every 100th report stays on its own line
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, avg_score))
        # Only test the criterion once the window holds its full 100 scores;
        # the message reports `episode - 100`, which is meaningless before that.
        window_full = len(scores_window) == scores_window.maxlen
        if window_full and avg_score >= 13.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode - 100, avg_score))
            # NOTE(review): the original mixed `agent.qnetwork_local` and `agent.local_nw`;
            # `local_nw` is used for both saves -- confirm it is the attribute Agent defines.
            torch.save(agent.local_nw.state_dict(), 'checkpoint.pth')
            break

        # Save the model weights after every episode that did not end training
        torch.save(agent.local_nw.state_dict(), "weights.pt")
    return scores

In [None]:
# Run training and keep the per-episode scores for plotting.
res_scores = train_agent(
    n_episodes=1_800,
    max_t=1_000,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=0.995,
)

In [None]:
# Plot the scores.
# `scores` is local to train_agent; the list it returned is stored in
# `res_scores` by the training cell, so that is what gets plotted.
fig, ax = plt.subplots()
ax.plot(np.arange(len(res_scores)), res_scores)
ax.set_ylabel('Score')
ax.set_xlabel("Episode")
plt.show()