<b>Weight decay, taken from <a href="https://stackoverflow.com/questions/44452571/what-is-the-proper-way-to-weight-decay-for-adam-optimizer">here</a></b>

When using pure SGD (without momentum) as an optimizer, weight decay is the same thing as adding a L2-regularization term to the loss. When using any other optimizer, this is not true.

In [1]:
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
%matplotlib inline
from ddpg_agent import Agent 

In [2]:
# = = = = = = = = = Enviroment initialization = = = = = = = = = # 
env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86')
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1] 

print('Number of agents:    ', num_agents)  
print('Size of each action: ', action_size) 
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size)) 
print('The state for the first agent looks like:  ', states) 

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents:     1
Size of each action:  4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like:   [[ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
  -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
   1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
   5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  -1.68164849e-01]]


In [3]:
agent = Agent(state_size=len(states[0]), action_size=action_size, random_seed=10) 
agent_num = 0 
print("Rewards:   ",env_info.rewards[agent_num]) 
print("Observations:   ",env_info.vector_observations[agent_num] ) 
print("Done status:    ",env_info.local_done[agent_num] ) 

Rewards:    0.0
Observations:    [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]
Done status:     False


In [None]:
def ddpg(num_episodes, max_timesteps=500):
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf 
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    
    for ith_episode in range(1, num_episodes+1): 
        agent.reset() 
        score = 0 

        for timestep in range(max_timesteps): 
            action = agent.act(state) 

            env_info = env.step(action) 
            next_state = env_info[brain_name].vector_observations[agent_num]
            reward = env_info[brain_name].rewards[agent_num]
            done = env_info[brain_name].local_done[agent_num] 
            
            agent.step(state, action, reward, next_state, done) 

            state = next_state 
            score += reward
            if done:
                break
        
        scores_deque.append(score) 
        scores.append(score) 
        
        if ith_episode % 10 == 0:        
            print("Episode: {}\t Average score: {}\t Score: {}".format(ith_episode, np.mean(scores_deque), score)) 

        if ith_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), "checkpoint_actor.pth")   
            torch.save(agent.critic_local.state_dict(), "checkpoint_critic.pth") 
            print("Episode: {}\t Average score: {}".format(ith_episode, np.mean(scores_deque))) 

    return scores 

scores = ddpg(num_episodes=2000) 

fig = plt.figure() 
ax = fig.add_subplot(111) 
plt.plot(np.arange(1, len(scores)+1), scores) 
plt.ylabel('Score') 
plt.xlabel('Episode #') 
plt.show() 



Episode: 10	 Average score: 0.09299999792128802	 Score: 0.7099999841302633
Episode: 20	 Average score: 0.14999999664723873	 Score: 0.07999999821186066
Episode: 30	 Average score: 0.19499999564141035	 Score: 0.0
Episode: 40	 Average score: 0.1677499962504953	 Score: 0.0
Episode: 50	 Average score: 0.1599999964237213	 Score: 0.0
Episode: 60	 Average score: 0.1333333303531011	 Score: 0.0
Episode: 70	 Average score: 0.11585714026753391	 Score: 0.1099999975413084
Episode: 80	 Average score: 0.10137499773409217	 Score: 0.0
Episode: 90	 Average score: 0.09011110909697083	 Score: 0.0
Episode: 100	 Average score: 0.08109999818727374	 Score: 0.0
Episode: 100	 Average score: 0.08109999818727374
Episode: 110	 Average score: 0.07179999839514493	 Score: 0.0
Episode: 120	 Average score: 0.05329999880865216	 Score: 0.0
Episode: 130	 Average score: 0.024799999445676804	 Score: 0.0
Episode: 140	 Average score: 0.016799999624490736	 Score: 0.0
Episode: 150	 Average score: 0.0048999998904764655	 Score: 0.

### Default values: 
- BUFFER_SIZE = int(1e6)  # replay buffer size
- BATCH_SIZE = 128        # minibatch size
- GAMMA = 0.99            # discount factor
- TAU = 1e-3              # for soft update of target parameters
- LR_ACTOR = 1e-4         # learning rate of the actor 
- LR_CRITIC = 3e-4        # learning rate of the critic
- WEIGHT_DECAY = 0.0001   # L2 weight decay

1. Cancelled after 700 timesteps, no change

2. Increased max_timesteps to 1000. No change after 130 episodes 

3. Changed: max_timesteps=500, LR_actor = 1e-3, LR_CRITIC=3e-3. Switched to GPU

4. Default values, 1200 episides no change, 1000 timesteps 

5. Episodes: 2000, max_steps=500. Score shows up, avg hovers at 0.1 to 0.13
    BUFFER_SIZE = int(1e6)  # replay buffer size
    BATCH_SIZE = 128        # minibatch size
    GAMMA = 0.99            # discount factor
    TAU = 1e-3              # for soft update of target parameters
    LR_ACTOR = 1e-4         # learning rate of the actor 
    LR_CRITIC = 3e-4        # learning rate of the critic
    WEIGHT_DECAY = 0.01   # L2 weight decay
    
6. 1000 episodes, maxsteps=500 
    No change in the average score after increasing the learning rates from 1e-4 to 1e-3 
    
7. 1000 episodes, maxsteps=700, learning rates are both 1e-3. Apparent increase 

8. 2000 episodes, max_timesteps=500
- BUFFER_SIZE = int(1e6)  # replay buffer size
- BATCH_SIZE = 128        # minibatch size
- GAMMA = 0.99            # discount factor
- TAU = 1e-3              # for soft update of target parameters
- LR_ACTOR = 1e-4         # learning rate of the actor 
- LR_CRITIC = 3e-4        # learning rate of the critic
- WEIGHT_DECAY = 0.1   # L2 weight decay

9. Episodes=2000, max_timesteps=500. avg_score ends at 0.-14
    BUFFER_SIZE = int(1e6)  # replay buffer size
    BATCH_SIZE = 128        # minibatch size
    GAMMA = 0.99            # discount factor
    TAU = 1e-3              # for soft update of target parameters
    LR_ACTOR = 1e-3         # learning rate of the actor 
    LR_CRITIC = 3e-3        # learning rate of the critic
    WEIGHT_DECAY = 0.1   # L2 weight decay
    
10. Episodes: 600, max_timesteps=500. Increased from 0.08 to 0.12. Non-steady increase, peak in the middle at 0.15 
    BUFFER_SIZE = int(1e6)  # replay buffer size
    BATCH_SIZE = 128        # minibatch size
    GAMMA = 0.99            # discount factor
    TAU = 1e-3              # for soft update of target parameters
    LR_ACTOR = 1e-4         # learning rate of the actor 
    LR_CRITIC = 3e-4        # learning rate of the critic
    WEIGHT_DECAY = 0.1   # L2 weight decay 
    
11. episodes: 5000, max_timesteps=500. No change 
    BUFFER_SIZE = int(1e6)  # replay buffer size
    BATCH_SIZE = 128        # minibatch size
    GAMMA = 0.999            # discount factor
    TAU = 0.001              # for soft update of target parameters
    LR_ACTOR = 1e-5         # learning rate of the actor 
    LR_CRITIC = 3e-5        # learning rate of the critic
    WEIGHT_DECAY = 0.1   # L2 weight decay 
    



12. Very low average values that change slowly. Learning rate appears to be very slow
    episodes: 800 
    max_timesteps=500. 
    BUFFER_SIZE = int(1000000)  # replay buffer size
    BATCH_SIZE = 128        # minibatch size
    GAMMA = 0.999            # discount factor
    TAU = 0.001              # for soft update of target parameters
    LR_ACTOR = 1e-5         # learning rate of the actor 
    LR_CRITIC = 3e-4        # learning rate of the critic
    WEIGHT_DECAY = 0.001   # L2 weight decay 




13. Intermittent negligible scores (0.003 avg) 
    episodes: 800 
    max_timesteps=500. 
    BUFFER_SIZE = int(1000000)  # replay buffer size
    BATCH_SIZE = 128        # minibatch size
    GAMMA = 0.99            # discount factor
    TAU = 0.001              # for soft update of target parameters
    LR_ACTOR = 1e-5         # learning rate of the actor 
    LR_CRITIC = 3e-4        # learning rate of the critic
    WEIGHT_DECAY = 0.001   # L2 weight decay 




14  Training crashed, kernel dead twice after 400 episodes. No learning above 0.004
    max_timesteps=500. 
    BUFFER_SIZE = int(1000000)  # replay buffer size
    BATCH_SIZE = 128        # minibatch size
    GAMMA = 0.99            # discount factor
    TAU = 0.01              # for soft update of target parameters
    LR_ACTOR = 1e-4         # learning rate of the actor 
    LR_CRITIC = 3e-3        # learning rate of the critic
    WEIGHT_DECAY = 0.1   # L2 weight decay 



15 No learning above 0.0006, repeats of same values 10 episodes in a row. Stopped after 350 episodes 
    BUFFER_SIZE = int(1000000)  # replay buffer size
    BATCH_SIZE = 128        # minibatch size
    GAMMA = 0.95            # discount factor
    TAU = 0.1              # for soft update of target parameters
    LR_ACTOR = 1e-4         # learning rate of the actor 
    LR_CRITIC = 3e-3        # learning rate of the critic
    WEIGHT_DECAY = 0.1   # L2 weight decay 



16 Removed custom weight decay for Adam. Reached .19 after 30 episodes, dropped to 0.004 afterwards 
BUFFER_SIZE = int(1000000)  # replay buffer size
BATCH_SIZE = 128        # minibatch size
GAMMA = 0.995            # discount factor
TAU = 1e-2              # for soft update of target parameters
LR_ACTOR = 1e-3         # learning rate of the actor 
LR_CRITIC = 3e-3        # learning rate of the critic


17. Trained for 9000 episodes. Initial bump at episode 30, not above 0.007 for several thousand. 
    BUFFER_SIZE = int(1000000)  # replay buffer size
    BATCH_SIZE = 128        # minibatch size
    GAMMA = 0.995            # discount factor
    TAU = 1e-2              # for soft update of target parameters
    LR_ACTOR = 1e-3         # learning rate of the actor 
    LR_CRITIC = 3e-3        # learning rate of the critic


18  Trained for 1200 episodes. Increase to .5 at 300 episodes, then steady drop. 
    BUFFER_SIZE = int(1000000)  # replay buffer size
    BATCH_SIZE = 256        # minibatch size
    GAMMA = 0.99            # discount factor
    TAU = 1e-3              # for soft update of target parameters
    LR_ACTOR = 1e-4         # learning rate of the actor 
    LR_CRITIC = 3e-3        # learning rate of the critic
    WEIGHT_DECAY = 0   # L2 weight decay 


19  INCREASED BATCH SIZE TO 1000Trained for 1200 episodes. Increase to .5 at 300 episodes, then steady drop.


20  Increased to .7 halfway, then descended to 0.05 by episode 1000, max_timesteps=2000 by accident 
    BUFFER_SIZE = int(1000000)  # replay buffer size
    BATCH_SIZE = 512        # minibatch size
    GAMMA = 0.99            # discount factor
    TAU = 1e-3              # for soft update of target parameters
    LR_ACTOR = 1e-4         # learning rate of the actor 
    LR_CRITIC = 3e-3        # learning rate of the critic
    WEIGHT_DECAY = 0   # L2 weight decay 


21  Highest score yet. Increased to avg of 2.0 at 230 episodes, then quickly fell to 0.1 
    BUFFER_SIZE = int(1000000)  # replay buffer size
    BATCH_SIZE = 1024        # minibatch size
    GAMMA = 0.99            # discount factor
    TAU = 1e-3              # for soft update of target parameters
    LR_ACTOR = 1e-4         # learning rate of the actor 
    LR_CRITIC = 3e-3        # learning rate of the critic
    WEIGHT_DECAY = 0   # L2 weight decay 


22 othing after 130 episodes except a slow increase from 0.007 to 0.059
max_timesteps=1000, episodes=1000
BUFFER_SIZE = int(1000000)  # replay buffer size
BATCH_SIZE = 1024        # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR_ACTOR = 1e-5         # learning rate of the actor 
LR_CRITIC = 3e-4        # learning rate of the critic
WEIGHT_DECAY = 0   # L2 weight decay 


23  Decreasing critic network size, removed third layer fc layer out of 4. Changing both learning rates to 1e-4. Killed at 100 episodes, too random
max_timesteps=1000, episodes=1000
BUFFER_SIZE = int(1000000)  # replay buffer size
BATCH_SIZE = 1024        # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR_ACTOR = 1e-5         # learning rate of the actor 
LR_CRITIC = 3e-4        # learning rate of the critic
WEIGHT_DECAY = 0   # L2 weight decay 


24  LR for both =1e-3. No change or convergence
max_timesteps=1000, episodes=1000
BUFFER_SIZE = int(1000000)  # replay buffer size
BATCH_SIZE = 1024        # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR_ACTOR = 1e-3         # learning rate of the actor 
LR_CRITIC = 3e-3        # learning rate of the critic
WEIGHT_DECAY = 0   # L2 weight decay 

25 No luck, no increase of average score over 1. Switching to weights and biases
    "BUFFER_SIZE": int(1000000),     # replay buffer size 
    "BATCH_SIZE" : 1024,             # minibatch size 
    "GAMMA" : 0.99,                  # discount factor 
    "TAU" : 1e-3,                    # for soft update of target parameters 
    "LR_ACTOR" : 1e-4,               # learning rate of the actor   
    "LR_CRITIC" : 3e-3,              # learning rate of the critic  
    "WEIGHT_DECAY": 0 ,              # L2 weight decay
    "num_episodes": 1000,
    "max_timesteps": 1000

### Session 21
Episode: 10	 Average score: 0.2679999940097332	 Score: 0.0
Episode: 20	 Average score: 0.4489999899640679	 Score: 0.6199999861419201
Episode: 30	 Average score: 0.5416666545594732	 Score: 0.6099999863654375
Episode: 40	 Average score: 0.7479999832808971	 Score: 1.219999972730875
Episode: 50	 Average score: 0.8393999812379479	 Score: 1.0099999774247408
Episode: 60	 Average score: 0.987499977927655	 Score: 0.23999999463558197
Episode: 70	 Average score: 1.026714262765433	 Score: 1.339999970048666
Episode: 80	 Average score: 1.1418749744771048	 Score: 1.2999999709427357
Episode: 90	 Average score: 1.2231110837724475	 Score: 0.5099999886006117
Episode: 100	 Average score: 1.2948999710567295	 Score: 4.619999896734953
Episode: 100	 Average score: 1.2948999710567295
Episode: 110	 Average score: 1.4751999670267104	 Score: 1.409999968484044
Episode: 120	 Average score: 1.594299964364618	 Score: 3.05999993160367
Episode: 130	 Average score: 1.663799962811172	 Score: 1.3799999691545963
Episode: 140	 Average score: 1.7327999612689018	 Score: 2.0199999548494816
Episode: 150	 Average score: 1.7736999603547157	 Score: 1.4199999682605267
Episode: 160	 Average score: 1.726399961411953	 Score: 1.289999971166253
Episode: 170	 Average score: 1.7759999603033065	 Score: 1.1599999740719795
Episode: 180	 Average score: 1.8208999592997133	 Score: 1.50999996624887
Episode: 190	 Average score: 1.8731999581307173	 Score: 2.229999950155616
Episode: 200	 Average score: 1.8926999576948582	 Score: 0.9199999794363976
Episode: 200	 Average score: 1.8926999576948582
Episode: 210	 Average score: 1.920399957075715	 Score: 1.389999968931079
Episode: 220	 Average score: 1.9735999558866024	 Score: 3.6199999190866947
Episode: 230	 Average score: 2.027599954679608	 Score: 0.3499999921768904
Episode: 240	 Average score: 1.9000999575294555	 Score: 0.8499999810010195
Episode: 250	 Average score: 1.812499959487468	 Score: 1.0799999758601189
Episode: 260	 Average score: 1.734899961221963	 Score: 0.0
Episode: 270	 Average score: 1.602999964170158	 Score: 0.06999999843537807
Episode: 280	 Average score: 1.4343999679386616	 Score: 0.2699999939650297
Episode: 290	 Average score: 1.2123999729007482	 Score: 0.3199999928474426
Episode: 300	 Average score: 1.010499977413565	 Score: 0.3899999912828207
Episode: 300	 Average score: 1.010499977413565
Episode: 310	 Average score: 0.8032999820448459	 Score: 0.04999999888241291
Episode: 320	 Average score: 0.6005999865755439	 Score: 0.03999999910593033
Episode: 330	 Average score: 0.41679999068379403	 Score: 0.0
Episode: 340	 Average score: 0.34829999221488833	 Score: 0.0
Episode: 350	 Average score: 0.2804999937303364	 Score: 0.35999999195337296
Episode: 360	 Average score: 0.237699994686991	 Score: 0.0
Episode: 370	 Average score: 0.19489999564364552	 Score: 0.0
Episode: 380	 Average score: 0.13519999697804452	 Score: 0.0
Episode: 390	 Average score: 0.12949999710544943	 Score: 0.0
Episode: 400	 Average score: 0.12959999710321427	 Score: 0.0
Episode: 400	 Average score: 0.12959999710321427
Episode: 410	 Average score: 0.13269999703392388	 Score: 0.019999999552965164
Episode: 420	 Average score: 0.10439999766647816	 Score: 0.0
Episode: 430	 Average score: 0.10359999768435955	 Score: 0.0
Episode: 440	 Average score: 0.09789999781176448	 Score: 0.0
Episode: 450	 Average score: 0.1006999977491796	 Score: 0.0
Episode: 460	 Average score: 0.10409999767318368	 Score: 0.0
Episode: 470	 Average score: 0.12559999719262124	 Score: 1.1199999749660492
Episode: 480	 Average score: 0.14619999673217535	 Score: 0.17999999597668648
Episode: 490	 Average score: 0.18189999593421816	 Score: 0.4899999890476465
Episode: 500	 Average score: 0.2118999952636659	 Score: 0.25999999418854713
Episode: 500	 Average score: 0.2118999952636659
Episode: 510	 Average score: 0.1982999955676496	 Score: 0.7099999841302633
Episode: 520	 Average score: 0.21289999524131417	 Score: 0.0
Episode: 530	 Average score: 0.23559999473392965	 Score: 0.0
Episode: 540	 Average score: 0.23329999478533864	 Score: 0.0
Episode: 550	 Average score: 0.23959999464452267	 Score: 0.0
Episode: 560	 Average score: 0.31049999305978415	 Score: 0.6499999854713678
Episode: 570	 Average score: 0.3582999919913709	 Score: 0.3799999915063381
Episode: 580	 Average score: 0.43089999036863447	 Score: 0.8599999807775021
Episode: 590	 Average score: 0.509799988605082	 Score: 0.9299999792128801
Episode: 600	 Average score: 0.4960999889113009	 Score: 0.0
Episode: 600	 Average score: 0.4960999889113009
Episode: 610	 Average score: 0.5112999885715545	 Score: 0.0
Episode: 620	 Average score: 0.5495999877154827	 Score: 0.0
Episode: 630	 Average score: 0.5696999872662127	 Score: 1.1399999745190144
Episode: 640	 Average score: 0.6477999855205416	 Score: 1.579999964684248
Episode: 650	 Average score: 0.7190999839268625	 Score: 1.219999972730875
Episode: 660	 Average score: 0.7593999830260872	 Score: 1.1399999745190144
Episode: 670	 Average score: 0.7678999828360975	 Score: 0.3399999924004078
Episode: 680	 Average score: 0.7648999829031528	 Score: 1.649999963119626
Episode: 690	 Average score: 0.7514999832026661	 Score: 1.529999965801835
Episode: 700	 Average score: 0.8196999816782773	 Score: 1.649999963119626
Episode: 700	 Average score: 0.8196999816782773
Episode: 710	 Average score: 0.8417999811843038	 Score: 0.7499999832361937
Episode: 720	 Average score: 0.8165999817475676	 Score: 0.18999999575316906
Episode: 730	 Average score: 0.803199982047081	 Score: 1.1799999736249447
Episode: 740	 Average score: 0.7572999830730259	 Score: 0.3199999928474426
Episode: 750	 Average score: 0.6740999849326909	 Score: 0.09999999776482582
Episode: 760	 Average score: 0.5576999875344336	 Score: 0.0
Episode: 770	 Average score: 0.478799989297986	 Score: 0.0
Episode: 780	 Average score: 0.3917999912425876	 Score: 0.07999999821186066
Episode: 790	 Average score: 0.3113999930396676	 Score: 0.1099999975413084
Episode: 800	 Average score: 0.23299999479204417	 Score: 0.0
Episode: 800	 Average score: 0.23299999479204417
Episode: 810	 Average score: 0.1978999955765903	 Score: 0.1099999975413084
Episode: 820	 Average score: 0.17729999603703617	 Score: 0.8599999807775021
Episode: 830	 Average score: 0.14389999678358437	 Score: 0.08999999798834324
Episode: 840	 Average score: 0.1374999969266355	 Score: 0.0
Episode: 850	 Average score: 0.15539999652653932	 Score: 0.0
Episode: 860	 Average score: 0.18179999593645335	 Score: 0.0
Episode: 870	 Average score: 0.18419999588280916	 Score: 0.0
