## Train the multi-agent system

In [None]:
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
from importlib import reload  # Python 3.4+ only.

import maddpg_agent

In [None]:
# Echo the hyperparameter constants exposed by maddpg_agent, one per line,
# so the settings used for this run are visible in the notebook output.
for hp_name in (
    'BUFFER_SIZE',
    'BATCH_SIZE',
    'GAMMA',
    'TAU',
    'LR_ACTOR',
    'LR_CRITIC',
    'WEIGHT_DECAY',
):
    print(hp_name + ': ', getattr(maddpg_agent, hp_name))

In [None]:
# Construct the agent object used by the training loop.
# state_size, action_size and num_agents must already exist in the
# notebook namespace; the random seed is fixed at 2.
agent = maddpg_agent.Agent(
    state_size=state_size,
    action_size=action_size,
    num_agents=num_agents,
    random_seed=2,
)

In [None]:
def train_maddpg(n_episodes=2000, max_t=1000, print_every=100, *,
                 target_score=0.5, min_episodes=1550, early_stop_score=1.5,
                 actor_checkpoint='checkpoint_actor_128_256.pth',
                 critic_checkpoint='checkpoint_critic_128_256.pth'):
    """Run the training loop and return the score of every episode played.

    Reads ``env``, ``brain_name``, ``agent`` and ``num_agents`` from the
    enclosing (notebook) namespace.

    Training stops early as soon as the rolling average score either
    exceeds ``target_score`` after at least ``min_episodes`` episodes, or
    exceeds ``early_stop_score`` at any episode. When that happens the
    state dicts of ``agent.actor_local`` and ``agent.critic_local`` are
    written with ``torch.save``. If the criterion is never met, no
    checkpoint is written.

    Args:
        n_episodes: Maximum number of episodes to run.
        max_t: Maximum number of environment steps per episode.
        print_every: Size of the rolling-average window, and the interval
            (in episodes) at which a progress line is kept on screen.
        target_score: Rolling-average threshold that counts as solved once
            ``min_episodes`` has been reached.
        min_episodes: Earliest episode at which ``target_score`` may end
            training.
        early_stop_score: Rolling-average threshold that ends training
            regardless of the episode number.
        actor_checkpoint: Output path for the actor weights.
        critic_checkpoint: Output path for the critic weights.

    Returns:
        A list with one score per completed episode, in order.
    """
    scores_deque = deque(maxlen=print_every)   # rolling window for the average
    scores_list = []                           # full history, returned to caller

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]   # reset environment
        states = env_info.vector_observations

        agent.reset()
        scores = np.zeros(num_agents)
        for _ in range(max_t):
            # agent.act is expected to return actions already clipped to [-1, 1].
            actions = agent.act(states)

            env_info = env.step(actions)[brain_name]    # send all actions to the environment
            next_states = env_info.vector_observations  # next state (for each agent)
            rewards = env_info.rewards                  # reward (for each agent)
            dones = env_info.local_done                 # episode-finished flags

            agent.step(states, actions, rewards, next_states, dones)
            # Every agent's running score is increased by the largest reward
            # of this step, so all entries of `scores` stay equal.
            # NOTE(review): this is the sum of per-step maxima, which can be
            # larger than the maximum of the per-agent episode returns
            # (accumulate `rewards`, then take np.max(scores)). Confirm which
            # metric is intended before comparing against a benchmark.
            scores += np.max(rewards)
            states = next_states

            if np.any(dones):
                break

        score = np.mean(scores)
        scores_deque.append(score)
        scores_list.append(score)

        # Compute the rolling average once and reuse it below.
        avg_score = np.mean(scores_deque)

        # '\r' with end="" overwrites the same console line every episode;
        # every `print_every` episodes the line is re-printed with a newline
        # so it stays visible.
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score), end="")
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))

        reached_target = avg_score > target_score and i_episode >= min_episodes
        if reached_target or avg_score > early_stop_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, avg_score))
            torch.save(agent.actor_local.state_dict(), actor_checkpoint)
            torch.save(agent.critic_local.state_dict(), critic_checkpoint)
            break

    return scores_list

In [None]:
# Run training with the default settings and keep the per-episode scores.
scores = train_maddpg()

# Plot score against 1-based episode number on a single set of axes.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# Close the environment; `env` is created outside this section.
env.close()