# Continuous Control
---

In [None]:
%load_ext autoreload
%autoreload 2

from unityagents import UnityEnvironment
import numpy as np
from agent.agent import Agent
import torch
import time

from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Create the UnityEnvironment handle for the Reacher build at this path.
# NOTE(review): the path looks relative to the notebook's working directory
# and names a Windows build — adjust when running elsewhere.
env = UnityEnvironment(file_name='Reacher20_Windows_x86_64/Reacher')

In [None]:
# Use the first brain the environment lists: take its name, then look up
# the matching brain object. `brain_name` is also the key used to index the
# results of env.reset() / env.step() in the cells below.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
# Reset the environment in training mode and keep this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the sizes needed to build the agent: how many agents are
# simulated, the width of one action, and the length of one observation.
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

# Report what was found (same output, same order as before).
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

In [None]:
# Build the agent from the state/action sizes and agent count read from the
# environment. The seed is fixed at 42 (presumably for reproducible runs —
# confirm in agent/agent.py).
agent = Agent(state_size, action_size, num_agents=num_agents,seed=42)

In [None]:
from agent.agent import OUNoise

# Draw 1001 consecutive samples from a single OUNoise instance and plot them
# against the sample index to eyeball the noise produced with these
# theta/sigma settings.
ounoise = OUNoise(4, 42, theta=0.15, sigma=0.01)
agent.reset()
x = list(range(1001))
noise = [ounoise.sample() for _ in x]
plt.plot(x, noise)
plt.show()

In [None]:
def ddpg(n_episodes=500, print_every=10):
    """Run the training loop for the notebook-level ``agent`` on ``env``.

    Relies on the globals ``env``, ``brain_name``, ``agent`` and
    ``num_agents`` defined in earlier cells. Each episode resets the
    environment and the agent, then alternates ``agent.act`` /
    ``env.step`` / ``agent.step`` until any agent reports ``done``.

    Args:
        n_episodes: Number of episodes to run.
        print_every: Print an extra summary line (moving average, max score,
            ``agent.epsilon``) every this many episodes. A falsy value
            disables the summary.

    Returns:
        A tuple ``(scores_avg, moving_averages)``: the per-episode score
        averaged over all agents, and the mean of the last (up to) 100 such
        scores after each episode.
    """
    scores_deque = deque(maxlen=100)  # sliding window for the moving average
    scores_avg = []                   # per-episode score, averaged over agents
    moving_averages = []

    for i_episode in range(1, n_episodes + 1):
        start_time = time.time()                               # timing is per episode
        env_info = env.reset(train_mode=True)[brain_name]      # reset the environment
        states = env_info.vector_observations                  # current state (for each agent)
        agent.reset()
        scores = np.zeros(num_agents)                          # running score (for each agent)
        num_iter = 0
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]           # send all actions to the environment
            next_states = env_info.vector_observations
            rewards = env_info.rewards                         # reward (for each agent)
            dones = env_info.local_done                        # episode-finished flags
            agent.step(states, actions, rewards, next_states, dones)

            states = next_states
            scores += rewards
            # Count environment steps so the log line reports the real
            # episode length (this counter was previously never advanced).
            num_iter += 1
            if np.any(dones):
                break

        score = np.mean(scores)
        scores_deque.append(score)
        scores_avg.append(score)
        moving_avg = np.mean(scores_deque)
        moving_averages.append(moving_avg)
        elapsed = time.time() - start_time
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f} \tTime: {:.2f} \tNum iter: {}'.format(i_episode, moving_avg, score, elapsed, num_iter))
        if print_every and i_episode % print_every == 0:
            # Checkpointing is currently disabled; uncomment to save weights.
            # torch.save(agent.actor_local.state_dict(), 'checkpoint_actor_{}.pth'.format(i_episode))
            # torch.save(agent.critic_local.state_dict(), 'checkpoint_critic_{}.pth'.format(i_episode))
            print('\rEpisode {}\tAverage Score: {:.2f} \tMax score: {} \tEpsilon: {:.2f}\tTime: {:.2f}/per episode'.format(i_episode, moving_avg, np.max(scores_deque), agent.epsilon, elapsed))
    return scores_avg, moving_averages

# Train with the default settings and keep both result series.
scores, moving_averages = ddpg()

# Chart the per-episode score against the 1-based episode number, drawing
# through the Axes object rather than the implicit pyplot state.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# Re-draw the per-episode score curve from the `scores` returned by ddpg().
# The previous version referenced `scores_list` / `score_list`, which are
# never defined in this notebook and raised NameError.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

In [None]:
# Close the Unity environment.
# NOTE(review): the scratch cells below call env.step(); they presumably need
# the environment still open — run them before this cell.
env.close()

In [None]:
# Scratch: take the observations from the most recent env_info as the
# current states.
states = env_info.vector_observations

In [None]:
# Scratch: perform a single interaction step by hand, mirroring the body of
# the training loop (without calling agent.step).
actions = agent.act(states)
env_info = env.step(actions)[brain_name]           # send all actions to the environment
next_states = env_info.vector_observations
rewards = env_info.rewards                         # get reward (for each agent)
dones = env_info.local_done                       # see if episode finished
states = next_states                               # advance to the new observations

In [None]:
# Scratch: store the latest transition in `memory`.
# NOTE(review): `memory` is only created in a later cell
# (memory = ReplayBuffer(...)); run that cell first.
memory.add(states,actions,rewards, next_states, dones)

In [None]:
# Scratch: show the shape of whatever `states` is currently bound to.
states.shape

In [None]:
# Scratch: view the rewards as a column with 20 rows.
# NOTE(review): 20 presumably equals num_agents — reshape raises if the
# number of rewards differs; confirm before reusing.
np.array(rewards).reshape(20,1)

In [None]:
# Scratch: draw a sample from `memory` and unpack it into its five parts.
# Note this rebinds states/actions/rewards/next_states/dones, replacing the
# values taken from the environment in the earlier cells.
experiences = memory.sample()
states, actions, rewards, next_states, dones = experiences 

In [None]:
# Scratch: shape of `states` as currently bound (the sampled batch, if the
# sampling cell above was run last).
states.shape

In [None]:
# Scratch: shape of `actions` as currently bound.
actions.shape

In [None]:
# Scratch: shape of `rewards` as currently bound.
# NOTE(review): this only works once `rewards` is an array/tensor (e.g. after
# the sampling cell); the list assigned from env_info.rewards may have no
# `.shape` — confirm.
rewards.shape

In [None]:
# Scratch: shape of `dones` as currently bound.
dones.shape

In [None]:
# Scratch: compute target values by hand from the agent's target networks.
# The target actor proposes next actions, the target critic scores them, and
# the result is added to the rewards, masked out where dones == 1.
# NOTE(review): 0.99 is presumably the discount factor used by the agent —
# confirm against agent/agent.py.
actions_next = agent.actor_target(next_states)
Q_targets_next = agent.critic_target(next_states, actions_next)
Q_targets = rewards + (0.99 * Q_targets_next * (1 - dones))

In [None]:
# Scratch: shape of the target critic's output.
Q_targets_next.shape

In [None]:
from agent.agent import ReplayBuffer 

In [None]:
# Scratch: a tiny ReplayBuffer for experimenting with add()/sample().
# NOTE(review): the meaning of the positional arguments is not visible here —
# presumably (action_size, buffer_size, batch_size, seed); confirm against
# agent/agent.py.
memory = ReplayBuffer(4, 2, 2, 42)

In [None]:
from agent.agent import OUNoise
# Scratch: build 20 OUNoise instances, all constructed with the same
# arguments (4, 42). NOTE(review): presumably one process per agent with
# action size 4 and seed 42 — confirm.
noise = [OUNoise(4, 42) for i in range(20)]

In [None]:
# Scratch: draw one sample from every noise process and stack the results
# into a single array. Iterates the processes directly instead of indexing
# back into the list through enumerate().
# Note: this rebinds `noise` from the list of processes to the sampled array,
# so the cell cannot be re-run without rebuilding the list first.
noise = np.array([process.sample() for process in noise])

In [None]:
# Scratch: draw one sample from the agent's own `noise` attribute.
agent.noise.sample()

In [None]:
# Scratch: echo the current value of `noise`.
noise

In [None]:
# Scratch: echo `noise` multiplied by 2.
# NOTE(review): this is element-wise scaling only if `noise` is the NumPy
# array from the sampling cell; on the original list it would repeat the list.
noise*2