# Collaboration and Competition

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the third project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
import pandas as pd
# `IPython.core.display` is a deprecated import location for these helpers;
# `IPython.display` is the public module that exposes `display` and `HTML`.
from IPython.display import display, HTML

# Inject a CSS rule so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display limits for this notebook.
# The original cell set `max_colwidth` and `expand_frame_repr` twice with
# conflicting values; only the last call took effect, so just the effective
# values are kept here (max_colwidth=500, expand_frame_repr=True).
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.expand_frame_repr', True)


In [2]:
import os
import sys

# Extend the import search path with three directories before importing `unityagents`.
# NOTE(review): these are absolute paths under /home/ubuntu on one machine; they must be
# adjusted (or the packages installed properly) before the notebook can run elsewhere.
sys.path.append('/home/ubuntu/udacity/deep-reinforcement-learning/p3_collab-compet/Tennis_Linux_NoVis/')
sys.path.append('/home/ubuntu/Unity/ml-agents/ml-agents')
sys.path.append("/home/ubuntu/udacity/deep-reinforcement-learning/python/")

from unityagents import UnityEnvironment

import math
import random
import copy

from collections import namedtuple, deque

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# NOTE(review): hard-coded site-packages directory of a specific conda env (python3.8);
# presumably added so that `tensorboardX` can be imported from this kernel — confirm.
sys.path.append("/home/ubuntu/anaconda3/envs/gymenv/lib/python3.8/site-packages")
from tensorboardX import SummaryWriter
# Summary writer created with comment="MADDPG Tennis".
# `writer` is not referenced again in the cells visible here.
writer = SummaryWriter(comment="MADDPG Tennis")

Next, we will start the environment!  **_Before running the code cell below_**, change the `file_name` parameter to match the location of the Unity environment that you downloaded.

- **Mac**: `"path/to/Tennis.app"`
- **Windows** (x86): `"path/to/Tennis_Windows_x86/Tennis.exe"`
- **Windows** (x86_64): `"path/to/Tennis_Windows_x86_64/Tennis.exe"`
- **Linux** (x86): `"path/to/Tennis_Linux/Tennis.x86"`
- **Linux** (x86_64): `"path/to/Tennis_Linux/Tennis.x86_64"`
- **Linux** (x86, headless): `"path/to/Tennis_Linux_NoVis/Tennis.x86"`
- **Linux** (x86_64, headless): `"path/to/Tennis_Linux_NoVis/Tennis.x86_64"`

For instance, if you are using a Mac, then you downloaded `Tennis.app`.  If this file is in the same folder as the notebook, then the line below should appear as follows:
```
env = UnityEnvironment(file_name="Tennis.app")
```

In [4]:
# Start the Unity Tennis environment from the given executable.
# NOTE(review): this is an absolute, machine-specific path to the `Tennis_Linux` build, while the
# earlier `sys.path.append` points at `Tennis_Linux_NoVis` — confirm which build is intended.
env = UnityEnvironment(file_name="/home/ubuntu/udacity/deep-reinforcement-learning/p3_collab-compet/Tennis_Linux/Tennis.x86_64")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [5]:
# Get the default brain: take the first entry of `env.brain_names` and look up
# the matching object in `env.brains`. `brain_name` is later used to index the
# results of `env.reset(...)` and `env.step(...)`.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [6]:
# Select the compute device.
# The original fell back to "cuda:0" when CUDA was *not* available (which fails
# as soon as a tensor is moved there) and chose "cuda:1" even on single-GPU
# machines. Keep the second GPU when one exists, otherwise use the first GPU,
# otherwise run on the CPU.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=1)

### Continuous Control with Deep Deterministic Policy Gradients

In [7]:
# Hyperparameters and shared settings, looked up by key elsewhere in the notebook
# (e.g. args['num_agents'] inside MADDPG, and args['action_size'] / args['BUFFER_SIZE'] /
# args['BATCH_SIZE'] / args['seed'] when the ReplayBuffer is built). The whole mapping
# is also passed to MCritic(...) and Agent(...).
# NOTE(review): the three *_path entries are absolute paths to `.pth.bak` files on one
# machine; how they are consumed is not visible in this notebook.
args = dict(
    BUFFER_SIZE=int(1e6),
    BATCH_SIZE=1024,
    GAMMA=0.99,
    TAU=2e-3,
    LR_ACTOR=1e-3,
    LR_CRITIC=1.1e-3,
    WEIGHT_DECAY=0.0001,
    UPDATE_EVERY=5,
    EXPLORE_NOISE=0.05,
    FC1_UNITS=64,
    FC2_UNITS=512,
    FC3_UNITS=32,
    seed=0,
    state_size=24,
    action_size=2,
    num_agents=2,
    device=device,
    mcritic_path='/home/ubuntu/udacity/deep-reinforcement-learning/p3_collab-compet/checkpoint_mCritic.pth.bak',
    agent_p0_path='/home/ubuntu/udacity/deep-reinforcement-learning/p3_collab-compet/checkpoint_p0.pth.bak',
    agent_p1_path='/home/ubuntu/udacity/deep-reinforcement-learning/p3_collab-compet/checkpoint_p1.pth.bak',
)

In [8]:
import math
import random
import copy

from collections import namedtuple, deque
import matplotlib.pyplot as plt
import numpy as np

import model
from agent import Agent
from model import ActorNetwork, CriticNetwork, MCritic
from replaybuffer import ReplayBuffer


# Number of most recent episodes averaged by MADDPG() when it reports progress
# and checks whether the environment is solved; also scales the `scores_window` deque.
WINDOW_SIZE    = 100

# Global counter that MADDPG() increments once per episode (declared `global` there).
NUM_ITER       = 0
# NOTE(review): MAX_T is not referenced in any cell visible here.
MAX_T          = 1000


def MADDPG(n_episodes=20000, train_mode=True, solved_score=1.0):
    """Run the multi-agent training loop on the Unity Tennis environment.

    Relies on notebook globals: `env`, `brain_name`, `args`, `agents`
    (mapping agent index -> agent), `mCritic`, `scores_window`, `WINDOW_SIZE`
    and `NUM_ITER`.

    Args:
        n_episodes: Maximum number of episodes to play (episodes 1..n_episodes).
        train_mode: Forwarded to `env.reset(train_mode=...)`. The original
            hard-coded `False`. NOTE(review): the Udacity project notes say to
            use `train_mode=True` when training — pass `False` to restore the
            old behaviour.
        solved_score: Threshold on the average (over the last `WINDOW_SIZE`
            episodes) of the per-episode score at which training stops.

    Side effects:
        * Appends one per-agent score array per episode to `scores_window`.
        * Leaves the last episode's per-agent scores in the global `scores`.
        * Increments the global `NUM_ITER` once per episode.
        * When solved, saves both actors and the critic to `checkpoint_*.pth`
          in the current working directory.

    Fixes relative to the original:
        * `range(1, n_episodes)` played one episode too few.
        * The score added `max(rewards)` of every step to *both* agents, which
          inflates the episode score; rewards are now accumulated per agent and
          the episode score is the max over agents of those sums.
        * The solve check tested `n_episodes > WINDOW_SIZE` (always true for the
          default), so it could fire after a single episode and report a
          negative episode count; it now requires `WINDOW_SIZE` played episodes.
    """
    global scores, NUM_ITER

    for i in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=train_mode)[brain_name]   # reset the environment
        states = env_info.vector_observations                     # current state of each agent
        scores = np.zeros(args['num_agents'])                     # per-agent return for this episode

        NUM_ITER += 1

        while True:
            # One action per agent, in agent-index order, clipped to [-1, 1].
            actions = [agents[idx].act(states[idx]) for idx in sorted(agents)]
            actions = np.clip(actions, -1, 1)

            env_info = env.step(actions)[brain_name]              # send all actions to the environment
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # Hand every agent its own transition.
            for idx, agent in agents.items():
                agent.step(states[idx], actions[idx], rewards[idx], next_states[idx], dones[idx])

            scores += np.asarray(rewards, dtype=float)            # accumulate each agent's own reward
            states = next_states                                  # roll over states to next time step

            if np.any(dones):                                     # episode finished
                scores_window.append(scores)
                break

        # Episode score = best agent's return; average it over the recent window.
        recent = list(scores_window)[-WINDOW_SIZE:]
        avg_score = np.mean([np.max(episode_scores) for episode_scores in recent])

        if i % WINDOW_SIZE == 0:
            print('\rEpisode {}\tIterations: {}\tAverage Score: {:.2f}'.format(i, NUM_ITER, avg_score))

        # Only declare success once a full window of episodes has been played.
        if i >= WINDOW_SIZE and avg_score >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i-WINDOW_SIZE, avg_score))
            torch.save(agents[0].actor_network.state_dict(), 'checkpoint_p0.pth')
            torch.save(agents[1].actor_network.state_dict(), 'checkpoint_p1.pth')
            torch.save(mCritic.network.state_dict(), 'checkpoint_mCritic.pth')
            break

# Replay buffer built from the action size, capacity, batch size and seed in `args`.
# It is not passed to MCritic/Agent in this cell; how it is consumed is not visible here.
SharedBuffer = ReplayBuffer(
    args['action_size'],
    args['BUFFER_SIZE'],
    args['BATCH_SIZE'],
    args['seed'],
)

# One critic object and two agents (indices 0 and 1), all configured from `args`.
mCritic = MCritic(args['state_size'], args['action_size'], args)
agent_p0 = Agent(args['state_size'], args['action_size'], 0, args)
agent_p1 = Agent(args['state_size'], args['action_size'], 1, args)

# Print the online and target networks of each agent and of the critic.
print("===============Agent0 NETS =================", agent_p0.actor_network, agent_p0.actor_target, sep="\n")
print("===============Agent1 NETS =================", agent_p1.actor_network, agent_p1.actor_target, sep="\n")
print("===============CRITIC NETS ================", mCritic.network, mCritic.target, sep="\n")
print("===========================================")

# Agents keyed by their index, as expected by the training loop.
agents = dict(enumerate((agent_p0, agent_p1)))

# Rolling history of per-episode scores, capped at ten windows' worth of episodes.
scores_window = deque(maxlen=10 * WINDOW_SIZE)

# Start training with the default number of episodes.
MADDPG()

ActorNetwork(
  (fc1): Linear(in_features=24, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=2, bias=True)
)
ActorNetwork(
  (fc1): Linear(in_features=24, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=2, bias=True)
)
ActorNetwork(
  (fc1): Linear(in_features=24, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=2, bias=True)
)
ActorNetwork(
  (fc1): Linear(in_features=24, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=2, bias=True)
)
CriticNetwork(
  (fc1): Linear(in_features=26, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=1, bias=True)
)
CriticNetwork(

In [9]:
# Collapse each stored per-agent score entry to its maximum: one value per recorded episode.
scores_ = list(map(np.max, np.array(scores_window)))

In [10]:
# Number of episodes shown on the plot and the length of the averaging window.
MAX_WINDOW = 1000
WINDOW = 100
# X-axis values: the MAX_WINDOW consecutive episode numbers ending just before 3288.
# NOTE(review): 3288 is hard-coded — presumably the episode count of the recorded run; confirm.
episodes = list(range(3288 - MAX_WINDOW, 3288))

In [11]:
# Trailing average of the per-episode scores, one value per plotted episode.
# Fixes relative to the original:
#   * for the first WINDOW points the slice start went negative and wrapped
#     around to the end of the list, producing an empty slice and an average of 0;
#   * the window stopped just before the current episode instead of including it;
#   * partial windows were divided by WINDOW rather than by their real length.
avg_scores = []
for score_idx in range(min(MAX_WINDOW, len(scores_))):
    start_idx = max(0, score_idx - WINDOW + 1)        # clamp so the slice never wraps
    window_scores = scores_[start_idx:score_idx + 1]  # up to WINDOW scores ending at this episode
    avg_scores.append(sum(window_scores) / len(window_scores))

# Reversed copy, kept because the original cell also left this name defined.
avg_scores_ = avg_scores[::-1]

In [None]:
import matplotlib.pyplot as plt

# Plot the per-episode score (`scores_`, the max over agents) rather than the raw
# `scores_window` deque of per-agent arrays, and label every series so the figure
# can be read on its own.
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(episodes, scores_, label='Episode score (max over agents)')
ax.plot(episodes, avg_scores, 'b--', label='Average over {} episodes'.format(WINDOW))
ax.axhline(0.5, color='gray', linestyle=':', label='Score = 0.5')
ax.set_xlabel("Episodes")
ax.set_ylabel('Scores')
ax.set_title('Tennis: scores per episode')
ax.legend()
plt.show()