### 1. Initialize project environment and create the agents

In [None]:
from unityagents import UnityEnvironment
import numpy as np

# Start the Unity environment from the 'Tennis.app' build; the resulting `env`
# handle is reused by every later cell.
env = UnityEnvironment(file_name='Tennis.app')

Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [None]:
# get the default brain: take the first name the environment lists, then look
# up the brain object registered under that name
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In this environment, two agents control rackets to bounce a ball over a net. Thus the observation space consists of 8 variables corresponding to the position and velocity of the ball and racket. Two continuous actions are available, corresponding to movement toward (or away from) the net, and jumping.

In [None]:
# Reset the environment in training mode and keep the info for our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Gather the quantities of interest first ...
states = env_info.vector_observations            # one observation row per agent
num_agents = len(env_info.agents)                # number of agents
action_size = brain.vector_action_space_size     # size of each action
state_size = states.shape[1]                     # length of a single agent's observation

# ... then report them.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent:', states[0])

In [None]:
import torch
from hyperparameters  import *
from agent import Agent

def seeding(seed=1):
    """Seed the NumPy and PyTorch global random number generators.

    Args:
        seed: integer seed handed to both ``np.random.seed`` and
            ``torch.manual_seed``.
    """
    # Same order as before: NumPy first, then PyTorch.
    for seed_rng in (np.random.seed, torch.manual_seed):
        seed_rng(seed)

# Seed the global RNGs before building the agents, then create one Agent per
# racket. Both agents are constructed with the same seed value.
# NOTE(review): RANDOM_SEED is presumably provided by the `hyperparameters`
# star import above -- confirm.
seeding(RANDOM_SEED)
agent_0 = Agent(state_size, action_size, random_seed=RANDOM_SEED)
agent_1 = Agent(state_size, action_size, random_seed=RANDOM_SEED)

### 2. Train the agents and store the trained model weights in checkpoint files.

FYI: Instead of training the agents from scratch in the project environment, it is possible to load the stored model weights of already trained agents. In that case, please skip steps 2 and 3 and jump directly to step 4.

In [None]:
from collections import deque

def maddpg(n_episodes=2000, max_t=1000, print_every=10, solved_score=0.5):
    """Train ``agent_0`` and ``agent_1`` against each other in the Unity environment.

    Uses the notebook-level ``env``, ``brain_name``, ``num_agents``,
    ``agent_0`` and ``agent_1``. Each agent acts on its own observation and
    learns from its own (state, action, reward, next_state, done) tuple.

    Args:
        n_episodes: maximum number of training episodes.
        max_t: maximum number of time steps per episode.
        print_every: a progress line is kept on screen every this many episodes.
        solved_score: training stops and the actor/critic weights of both
            agents are saved once the windowed average of the per-episode
            maximum score reaches this value. The Tennis task counts as solved
            at +0.5; the previous hard-coded 30 could not be reached, so the
            checkpoints were never written.

    Returns:
        Tuple ``(scores, avg_scores)``: the per-episode score (maximum over
        the agents) and the running average over the last (up to) 100 episodes.
    """
    window = 100                         # number of episodes in the evaluation window
    scores_deque = deque(maxlen=window)
    scores = []
    avg_scores = []
    for episode_id in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent_0.reset()
        agent_1.reset()
        agent_scores = np.zeros(num_agents)  # accumulated reward of each agent in this episode

        for t in range(max_t):
            # every agent picks its action from its own observation
            action_0 = agent_0.act(states[0], add_noise=True)
            action_1 = agent_1.act(states[1], add_noise=True)
            env_info = env.step([action_0, action_1])[brain_name]  # send actions of both agents to the environment
            next_states = env_info.vector_observations             # get next state (for each agent)
            rewards = env_info.rewards                             # get reward (for each agent)
            dones = env_info.local_done                            # see if episode finished

            agent_0.step(states[0], action_0, rewards[0], next_states[0], dones[0])
            agent_1.step(states[1], action_1, rewards[1], next_states[1], dones[1])

            states = next_states
            # accumulate into the per-episode array, not the history list
            agent_scores += rewards
            if np.any(dones):
                break

        episode_score = np.max(agent_scores)  # the episode counts with the best agent's score
        scores_deque.append(episode_score)
        scores.append(episode_score)
        eval_window_avg_score = np.mean(scores_deque)
        avg_scores.append(eval_window_avg_score)
        total_episodes = min(episode_id, window)

        progress = '\rEpisode {} average score: {:.2f}, Average score over last {} episodes: {:.2f}'.format(
            episode_id, np.mean(agent_scores), total_episodes, eval_window_avg_score)
        print(progress, end="")
        if episode_id % print_every == 0:
            # finish the line so this report is not overwritten by the next one
            print(progress)
        if eval_window_avg_score >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\t Average score over last {} episodes: {:.2f}'.format(episode_id, total_episodes, eval_window_avg_score))
            torch.save(agent_0.actor_local.state_dict(), 'checkpoint_actor_0.pth')
            torch.save(agent_0.critic_local.state_dict(), 'checkpoint_critic_0.pth')
            torch.save(agent_1.actor_local.state_dict(), 'checkpoint_actor_1.pth')
            torch.save(agent_1.critic_local.state_dict(), 'checkpoint_critic_1.pth')
            break
    return scores, avg_scores

# run the training loop defined above (the function is named `maddpg`)
scores, avg_scores = maddpg()

### 4. Run the Agents with pretrained model weights

Once this cell is executed, a Unity window with the loaded project environment should pop up where the trained Agents can be observed in action, as they operate inside the environment. The 'experience' of the Agents is loaded from the actor checkpoint files stored in step 2 ('checkpoint_actor_0.pth' and 'checkpoint_actor_1.pth'). These files contain the weights of the Agents' neural network models learned during the training phase, therefore there is no need to train the Agents from scratch every time.

In [None]:
# Watch the two trained agents play one episode without exploration noise.
env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
states = env_info.vector_observations                    # get the current states
trained_agents = (agent_0, agent_1)
for agent_idx, trained_agent in enumerate(trained_agents):
    trained_agent.reset()
    # load the actor weights the training cell saved for this agent
    trained_agent.actor_local.load_state_dict(torch.load('checkpoint_actor_{}.pth'.format(agent_idx)))
while True:
    # each agent selects its action from its own observation (don't add noise)
    actions = [trained_agent.act(state, add_noise=False)
               for trained_agent, state in zip(trained_agents, states)]
    env_info = env.step(actions)[brain_name]      # execute the selected actions and save the new information about the environment
    next_states = env_info.vector_observations    # get the resulting states
    dones = env_info.local_done                   # check whether episodes have finished
    states = next_states
    if np.any(dones):
        break
env.close()

To stop the game, close the Unity window with the project environment.

In [None]:
# close the Unity environment handle created in step 1
env.close()