# Continuous Control

---


### Start the Environment

Run the next code cell to install a few packages. This might take a few minutes to run.

In [1]:
import numpy as np
import gym
import random
import torch
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

from ddpg_agent import Agent

In [2]:

import numpy as np


# select this option to load version 1 (with a single agent) of the environment
# env = UnityEnvironment(file_name='/data/Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64')

# select this option to load version 2 (with 20 agents) of the environment
env = gym.make("BipedalWalker-v3", render_mode='human')

state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
num_agents = 1



Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [3]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)

# size of each action
action_size = brain.vector_action_space_size

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]

TypeError: reset() got an unexpected keyword argument 'train_mode'

### Train a DDPG agent

Run the code cells below to train the agent.

In [3]:
agent = Agent(state_size=state_size, action_size=action_size, num_agents=num_agents, random_seed=np.random.randint(100))

In [7]:
def ddpg(n_episodes=5000, max_t=10000, print_every=30):
    scores_deque = deque(maxlen=print_every)
    scores = []
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = np.zeros(num_agents)
        for t in range(max_t):
            actual_state = state[0] if isinstance(state, tuple) else state
            state = actual_state
            action = agent.act(actual_state)
   
            next_state, reward, done, info,_ = env.step(action)
             

            
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break 
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        print('\rEpisode {}\t Score: {:.2f}'.format(i_episode, np.mean(score)))
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            
    return scores

scores = ddpg()

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

Episode 1	 Score: -101.22-101.22
Episode 2	 Score: -102.08-101.65
Episode 3	 Score: -101.28-101.53
Episode 4	 Score: -101.80-101.59
Episode 5	 Score: -101.23-101.52
Episode 6	 Score: -101.38-101.50
Episode 7	 Score: -101.39-101.48
Episode 8	 Score: -102.13-101.56
Episode 9	 Score: -101.82-101.59
Episode 10	 Score: -101.93-101.62
Episode 11	 Score: -105.86-102.01
Episode 12	 Score: -105.77-102.32
Episode 13	 Score: -105.97-102.60
Episode 14	 Score: -106.19-102.86
Episode 15	 Score: -105.98-103.07
Episode 16	 Score: -789.20-145.95
Episode 17	 Score: -102.15-143.37
Episode 18	 Score: -107.30-141.37
Episode 19	 Score: -103.40-139.37
Episode 20	 Score: -103.30-137.57
Episode 21	 Score: -102.81-135.91
Episode 22	 Score: -102.88-134.41
Episode 23	 Score: -102.45-133.02
Episode 24	 Score: -128.79-132.85
Episode 25	 Score: -121.38-132.39
Episode 26	 Score: -134.73-132.48
Episode 27	 Score: -109.60-131.63
Episode 28	 Score: -112.85-130.96
Episode 29	 Score: -105.38-130.08
Episode 30	 Score: -106

When finished, you can close the environment.

In [12]:
env.close()