In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym
import numpy as np
import random
from collections import namedtuple, deque
import wandb

# Discount factor; passed as `gamma` to TutorialAgent1.learn from TutorialAgent1.step.
GAMMA=0.99

class QNetwork1(nn.Module):
    """Dueling Q-network: a two-layer ReLU trunk feeding a value and an advantage head.

    The heads are merged as ``Q = V + (A - max_a A)``, so for every input row
    the largest Q-value equals the value head's output.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
        """Build the layers.

        Args:
            state_size: Number of input features per state.
            action_size: Number of actions (width of the advantage head).
            seed: Passed to ``torch.manual_seed`` before the layers are created.
            fc1_units: Width of the first hidden layer.
            fc2_units: Width of the second hidden layer.
        """
        super().__init__()
        # torch.manual_seed reseeds the global RNG and returns its Generator.
        self.seed = torch.manual_seed(seed)
        # Creation order is fixed so that a given seed yields the same weights.
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc_advantage = nn.Linear(fc2_units, action_size)
        self.fc_value = nn.Linear(fc2_units, 1)

    def forward(self, state):
        """Return Q-values for ``state``; the max is taken over dim 1, so a 2-D batch is expected."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(state))))
        state_value = self.fc_value(hidden)
        advantages = self.fc_advantage(hidden)
        best_advantage, _ = advantages.max(dim=1, keepdim=True)
        return state_value + (advantages - best_advantage)
    
# Use the first CUDA GPU when torch reports one, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer.

        Args:
            action_size: Number of actions; stored but not otherwise used here.
            buffer_size: Maximum number of transitions kept. Coerced to ``int``
                because configs in this file supply values such as ``1e5``
                (a float), which ``deque(maxlen=...)`` rejects with TypeError.
            batch_size: Number of transitions returned by ``sample``.
            seed: Seed for Python's global ``random`` module.
        """
        self.action_size = action_size
        self.memory = deque(maxlen=int(buffer_size))
        self.batch_size = int(batch_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so keep the seed itself for reference.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Append one transition; the oldest one is dropped once the buffer is full."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            on ``device``. ``actions`` is int64; the others are float32.
            Each field is stacked row-wise with ``np.vstack``.

        Raises:
            ValueError: If fewer than ``batch_size`` transitions are stored
                (raised by ``random.sample``).
        """
        batch = random.sample(self.memory, k=self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        def as_float(rows):
            return torch.from_numpy(np.vstack(rows)).float().to(device)

        return (
            as_float(states),
            torch.from_numpy(np.vstack(actions)).long().to(device),
            as_float(rewards),
            as_float(next_states),
            # Booleans become 0/1 so the caller can use (1 - dones) as a mask.
            as_float(np.vstack(dones).astype(np.uint8)),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

class TutorialAgent1:
    """DQN agent with an online network and a periodically synced target network.

    Every ``step`` stores the transition and, once the buffer holds at least one
    batch, performs one gradient update. The target network is overwritten with
    the online weights every ``update_every`` steps.
    """

    def __init__(self, state_size, action_size, seed, lr, update_every, buffer_size, batch_size):
        """Build both networks, the optimizer and the replay buffer.

        Args:
            state_size: Number of features in a state vector.
            action_size: Number of discrete actions.
            seed: Seed for Python's ``random`` and for both networks.
            lr: Adam learning rate for the online network.
            update_every: Number of steps between target-network syncs.
            buffer_size: Replay capacity; coerced to ``int`` because configs in
                this file supply floats such as ``1e5``.
            batch_size: Minibatch size used for each update.
        """
        self.state_size = state_size
        print(update_every)
        self.action_size = action_size
        # random.seed() returns None, so keep the seed itself for reference.
        random.seed(seed)
        self.seed = seed
        # Both networks receive the same seed, so they start with identical weights.
        self.qnetwork_local = QNetwork1(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork1(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        if batch_size > buffer_size:
            # step() only learns once len(memory) >= batch_size, which can never
            # happen here; say so instead of silently training nothing.
            import warnings
            warnings.warn(
                f"batch_size ({batch_size}) exceeds buffer_size ({buffer_size}); "
                "the replay buffer can never hold a full batch, so the agent will never learn."
            )
        self.memory = ReplayBuffer(action_size, int(buffer_size), batch_size, seed)
        self.t_step = 0
        self.update_every = update_every
        self.batch_size = batch_size

    def step(self, state, action, reward, next_state, done):
        """Record one transition, learn from a sampled batch, and maybe sync the target."""
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) >= self.batch_size:
            self.learn(self.memory.sample(), GAMMA)
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def act(self, state, eps=0.):
        """Choose an action epsilon-greedily.

        Args:
            state: NumPy array holding a single state.
            eps: Probability threshold for taking a uniformly random action.

        Returns:
            The chosen action index (a NumPy integer).
        """
        # Decide first: an exploratory step does not need a forward pass.
        if random.random() <= eps:
            return random.choice(np.arange(self.action_size))
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """Take one optimizer step on the MSE between predicted and target Q-values.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                of tensors as produced by ``ReplayBuffer.sample``.
            gamma: Discount factor applied to the bootstrapped next-state value.
        """
        states, actions, rewards, next_states, dones = experiences
        # Targets are constants for the loss: evaluate the target network once,
        # without recording an autograd graph.
        with torch.no_grad():
            next_state_values = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
            Q_targets = rewards + (gamma * next_state_values * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
import matplotlib.pyplot as plt
def dqn(agent, env, n_episodes=10000, max_t=500, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
        max_episodes=250):
    """Train ``agent`` on ``env`` with a decaying epsilon-greedy policy.

    Args:
        agent: Object exposing ``act(state, eps)`` and
            ``step(state, action, reward, next_state, done)``.
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns ``(next_state, reward, done, truncated, info)``.
        n_episodes: Requested number of training episodes.
        max_t: Maximum number of steps per episode.
        eps_start: Epsilon used for the first episode.
        eps_end: Lower bound for epsilon.
        eps_decay: Factor applied to epsilon after each episode.
        max_episodes: Hard cap on the number of episodes actually run. The
            default of 250 keeps the historical behaviour, where training
            always stopped after episode 250; pass ``None`` to honour
            ``n_episodes`` alone.

    Returns:
        ``(moving_avg_scores, True)``, where ``moving_avg_scores`` is a float
        array holding, per episode, the mean score over the last (up to) 100
        episodes.
    """
    if max_episodes is not None:
        n_episodes = min(n_episodes, max_episodes)

    scores_window = deque(maxlen=100)
    moving_avg_scores = []  # list append is O(1); np.append copied the array each time
    eps = eps_start
    for _ in range(n_episodes):
        state, _ = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, truncated, _ = env.step(action)
            # Only `done` is stored: a truncated episode is not marked terminal.
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done or truncated:
                break
        scores_window.append(score)
        moving_avg_scores.append(np.mean(scores_window))
        eps = max(eps_end, eps_decay * eps)
    return np.array(moving_avg_scores, dtype=float), True

def train_agent(lr, update_every, buffer_size, batch_size):
    """Train a fresh ``TutorialAgent1`` on Acrobot-v1 and summarise the run.

    Args:
        lr: Learning rate forwarded to the agent.
        update_every: Target-network sync interval forwarded to the agent.
        buffer_size: Replay-buffer capacity forwarded to the agent.
        batch_size: Minibatch size forwarded to the agent.

    Returns:
        ``(regret, scores)``, where ``scores`` is the per-episode moving-average
        array returned by ``dqn`` and ``regret`` is the negated sum of that array.
    """
    env = gym.make('Acrobot-v1')
    try:
        print(update_every)
        print(buffer_size)
        print(batch_size)
        state_shape = env.observation_space.shape[0]
        action_shape = env.action_space.n
        agent = TutorialAgent1(state_size=state_shape, action_size=action_shape, seed=0, lr=lr,
                               update_every=update_every, buffer_size=buffer_size, batch_size=batch_size)
        all_scores_1, _ = dqn(agent, env)
    finally:
        # A sweep calls this once per run in the same process; release the
        # environment even if training fails.
        env.close()
    regret = -np.sum(all_scores_1)
    return regret, all_scores_1

def run_training():
    """Run one W&B-tracked training job and log its results.

    Hyperparameters are read from the run's config, which is seeded with the
    defaults below. Each entry of the score array returned by ``train_agent``
    is logged as ``mean_scores``, followed by a single ``regret`` entry.
    """
    run = wandb.init(
        config={
            "lr": 5e-4,
            "update_every": 50,
            "buffer_size": 1e5,
            "batch_size": 64,
        },
        project="Acrobot_ddqn_max_new",
    )
    hyperparams = run.config
    regret, all_scores = train_agent(
        hyperparams["lr"],
        hyperparams['update_every'],
        hyperparams["buffer_size"],
        hyperparams["batch_size"],
    )
    for mean_score in all_scores:
        wandb.log({"mean_scores": mean_score})
    wandb.log({"regret": regret})

# Bayesian hyperparameter search that minimises the "regret" value logged by
# run_training().
sweep_config = dict(
    method="bayes",
    metric=dict(name="regret", goal="minimize"),
    parameters=dict(
        lr=dict(min=1e-5, max=1e-2),
        update_every=dict(values=[5, 10, 20, 50]),
        buffer_size=dict(values=[1e2, 1e3, 1e5]),
        batch_size=dict(values=[32, 64, 128]),
    ),
    project="Acrobot_ddqn_max",
    early_terminate=dict(type="hyperband", min_iter=3, max_iter=100),
)

# Register the sweep, then start an agent that calls run_training for each job.
sweep_id = wandb.sweep(sweep_config)
wandb.agent(sweep_id, function=run_training)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: s411rkxy
Sweep URL: https://wandb.ai/rl_shobhith/Acrobot_ddqn_max/sweeps/s411rkxy


[34m[1mwandb[0m: Agent Starting Run: qoicshkd with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.0006086513456470216
[34m[1mwandb[0m: 	update_every: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshobhith-v[0m ([33mrl_shobhith[0m). Use [1m`wandb login --relogin`[0m to force relogin


100
100
64
100


  if not isinstance(terminated, (bool, np.bool8)):


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▂▂▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▇▇▇▇▇▇▇█████████
regret,▁

0,1
mean_scores,-158.7
regret,79840.13361


[34m[1mwandb[0m: Agent Starting Run: 2n9lpq7r with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.006149920612765936
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


50
1000
32
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▅▅▆▆▆▇▇██████████
regret,▁

0,1
mean_scores,-166.0
regret,89481.86035


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: u7r2ezt0 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.008870462243433639
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


20
100
32
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▂▃▄▃▃▃▄▅▅▅▅▅▅▅▆▆▅▆▇▇▇▇▇▆▆▆▇▇▇█
regret,▁

0,1
mean_scores,-457.73
regret,120085.62143


[34m[1mwandb[0m: Agent Starting Run: 9tv9tr3k with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 100000
[34m[1mwandb[0m: 	lr: 0.0067483972033951034
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01127777777777131, max=1.0)…

75
100000
64
75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▃▄▄▄▅▅▆▆▇▇▇███████
regret,▁

0,1
mean_scores,-177.96
regret,96148.52612


[34m[1mwandb[0m: Agent Starting Run: gk9gulke with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100000
[34m[1mwandb[0m: 	lr: 0.006163369038579299
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


50
100000
32
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▅▅▆▆▆▇██
regret,▁

0,1
mean_scores,-235.13
regret,111426.40167


[34m[1mwandb[0m: Agent Starting Run: 7evfr0fu with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 100000
[34m[1mwandb[0m: 	lr: 0.005446725663226798
[34m[1mwandb[0m: 	update_every: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888844262, max=1.0…

100
100000
64
100


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▄▄▅▅▅▆▆▇▇▇▇██
regret,▁

0,1
mean_scores,-140.56
regret,101446.32074


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 11xp4aqq with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.009142461378378534
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


50
1000
32
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▄▄▅▅▅▆▆▇▇▇███████
regret,▁

0,1
mean_scores,-179.6
regret,95616.46093


[34m[1mwandb[0m: Agent Starting Run: 9kdreg3j with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 100000
[34m[1mwandb[0m: 	lr: 0.002940405009594966
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


20
100000
128
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇██████████
regret,▁

0,1
mean_scores,-129.76
regret,70852.76765


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: brax0ula with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.003963908174197348
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888844262, max=1.0…

50
100
32
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇▇█████
regret,▁

0,1
mean_scores,-196.72
regret,99346.50773


[34m[1mwandb[0m: Agent Starting Run: l6stbrtu with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.008044479597970394
[34m[1mwandb[0m: 	update_every: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


100
100
128
100


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▂▂▂▃▃▃▃▄▄▆▆▆▇▇▇▇▇▇█████▇▆▅▆▆▇▇▇▇██
regret,▁

0,1
mean_scores,-490.1
regret,123339.64701


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: u7kytcof with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.0060262006909323216
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


20
100
64
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▄▄▄▅▅▅▆▇▇▇▇██
regret,▁

0,1
mean_scores,-329.37
regret,113681.50462


[34m[1mwandb[0m: Agent Starting Run: f1z8sr5i with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 100000
[34m[1mwandb[0m: 	lr: 0.00303952366824442
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

20
100000
128
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇██████████
regret,▁

0,1
mean_scores,-126.89
regret,72170.56598


[34m[1mwandb[0m: Agent Starting Run: c7nmfxop with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 100000
[34m[1mwandb[0m: 	lr: 0.0022768400364785433
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


20
100000
128
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▄▄▄▅▅▆▆▇▇▇▇████
regret,▁

0,1
mean_scores,-153.0
regret,98240.75407


[34m[1mwandb[0m: Agent Starting Run: wvf4ubjo with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.005768829983908218
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01127777777777131, max=1.0)…

75
1000
128
75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇███████████
regret,▁

0,1
mean_scores,-163.5
regret,87806.43092


[34m[1mwandb[0m: Agent Starting Run: 44wslxat with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.006337483914626873
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


50
100
128
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▂▂▂▂▄▄▄▅▅▅▅▆▇▇▇▇████▇▇▇▇▆▆▆▅▅▅▆▆▄
regret,▁

0,1
mean_scores,-493.84
regret,123292.58489


[34m[1mwandb[0m: Agent Starting Run: ia6omk5e with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100000
[34m[1mwandb[0m: 	lr: 0.0075886243628161395
[34m[1mwandb[0m: 	update_every: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


100
100000
32
100


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▄▅▅▅▅▅▅▆▆▇▇██
regret,▁

0,1
mean_scores,-244.58
regret,108574.51684


[34m[1mwandb[0m: Agent Starting Run: pn2ynrpc with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.00809550471156636
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


75
1000
64
75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▆▆▇▇▇▇▇▇▇███████
regret,▁

0,1
mean_scores,-157.14
regret,85792.58858


[34m[1mwandb[0m: Agent Starting Run: s7wgx9fe with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.0005561953486946128
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


75
100
64
75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇█████████
regret,▁

0,1
mean_scores,-142.73
regret,77678.82345


[34m[1mwandb[0m: Agent Starting Run: t8uvwis0 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.006604529194741235
[34m[1mwandb[0m: 	update_every: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


100
1000
32
100


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇▇▇▇▇▇██████
regret,▁

0,1
mean_scores,-175.24
regret,87205.65471


[34m[1mwandb[0m: Agent Starting Run: x0qzxn5o with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.00830988976308639
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


20
100
32
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▃▄▄▅▆▆▇▇▆▆▆▆▆▇▇█
regret,▁

0,1
mean_scores,-478.72
regret,123235.88


[34m[1mwandb[0m: Agent Starting Run: hdj4eqn2 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.0012283437006114263
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


50
1000
32
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▂▂▃▃▃▃▄▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇█████████
regret,▁

0,1
mean_scores,-139.33
regret,72096.85546


[34m[1mwandb[0m: Agent Starting Run: rpk2jlii with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.008343862385155798
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


20
100
64
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▄▅▅▆▇▇▇▇▇▇▇████▇
regret,▁

0,1
mean_scores,-437.12
regret,118073.18121


[34m[1mwandb[0m: Agent Starting Run: vellh9xs with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.0011124114168549078
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


20
100
32
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇██████
regret,▁

0,1
mean_scores,-164.14
regret,86669.15465


[34m[1mwandb[0m: Agent Starting Run: omoleyb8 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.008754723709478843
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


75
100
32
75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▅▅▆▆▇▇▇███
regret,▁

0,1
mean_scores,-328.48
regret,113108.97736


[34m[1mwandb[0m: Agent Starting Run: b7xkgiug with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.007599940463873099
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

20
1000
128
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇███████
regret,▁

0,1
mean_scores,-156.11
regret,84029.52158


[34m[1mwandb[0m: Agent Starting Run: zh618f5s with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.0046034986545850426
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011466666666739104, max=1.0…

20
100
128
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▃▂▂▂▃▄▄▄▃▃▄▅▆▅▅▅▅▅▆▆▆▆▆▇▇▇▆▆▆▆▆▆▇▇██▇▇▇
regret,▁

0,1
mean_scores,-489.42
regret,123180.7145


[34m[1mwandb[0m: Agent Starting Run: 061ts4ld with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.005526809058085539
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


50
100
128
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▅▄▄▄▃▄▅▅▅▅▅▅▆▆▆▆▅▅▅▅▅▅▄▅▆▅▅▆▆▇███▇▇
regret,▁

0,1
mean_scores,-491.06
regret,123688.7699


[34m[1mwandb[0m: Agent Starting Run: xtx0v8xw with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.008521382268929605
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


50
100
32
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▇▆▆▆██▇
regret,▁

0,1
mean_scores,-468.5
regret,121933.55614


[34m[1mwandb[0m: Agent Starting Run: r10q7sj0 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.005249838120527297
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


20
1000
64
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇▇▇▇█████████
regret,▁

0,1
mean_scores,-156.5
regret,78782.71493


[34m[1mwandb[0m: Agent Starting Run: r08cxrhi with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.0056247065880243585
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

50
100
32
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▄▄▅▆▆▆▇▇███████▇▇▇▇▇
regret,▁

0,1
mean_scores,-354.04
regret,105497.47847


[34m[1mwandb[0m: Agent Starting Run: kobh4i6g with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.007016333039463426
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


75
1000
32
75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▄▄▄▅▅▆▆▆▇▇▇▇▇▇████████
regret,▁

0,1
mean_scores,-163.15
regret,85578.49293


[34m[1mwandb[0m: Agent Starting Run: mrx1h2d8 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.009270782930014996
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888925108, max=1.0…

75
1000
32
75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▄▄▄▅▅▆▆▆▇▇▇█████████
regret,▁

0,1
mean_scores,-168.45
regret,88851.96182


[34m[1mwandb[0m: Agent Starting Run: 100popeg with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.0009394688774065316
[34m[1mwandb[0m: 	update_every: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


100
100
64
100


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇█████
regret,▁

0,1
mean_scores,-164.64
regret,80098.25449


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7m1ylbu0 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 100000
[34m[1mwandb[0m: 	lr: 0.009987019489281528
[34m[1mwandb[0m: 	update_every: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


20
100000
64
20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇█████
regret,▁

0,1
mean_scores,-417.68
regret,117228.76625


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: zpvqms8q with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.008324345061929203
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


50
1000
64
50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▅▅▆▆▆▇▇▇▇███
regret,▁

0,1
mean_scores,-166.14
regret,99900.45588


[34m[1mwandb[0m: Agent Starting Run: zlmfshwb with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.0056321196670273
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

75
1000
32
75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇▇███████
regret,▁

0,1
mean_scores,-162.99
regret,92614.83096


[34m[1mwandb[0m: Agent Starting Run: li4q2dsi with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 100
[34m[1mwandb[0m: 	lr: 0.009514795749931306
[34m[1mwandb[0m: 	update_every: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


50
100
32
50
