In [1]:
import math
import time
import wandb
import rpnn
import pprint
import gym

In [2]:
# W&B project that the sweep and all of its runs are created under.
WANDB_PROJECT = 'cartpole'
# Authenticate with W&B; as the last expression of the cell, the result is displayed.
wandb.login()

wandb: Currently logged in as: lubomirkurcak (use `wandb login --relogin` to force relogin)


True

In [3]:
# Root of the W&B sweep definition; 'random' selects random search over the
# parameter space that later cells attach to this dict.
sweep_config = dict(method='random')

In [4]:
# Objective of the sweep: maximise the 'averaged_timesteps' value that
# train() logs after every finished episode.
metric = dict(name='averaged_timesteps', goal='maximize')
sweep_config.update(metric=metric)

# Hyperparameter definitions; the following cells add their entries to it.
parameters_dict = dict()

In [5]:
# numerical precision params
# NOTE(review): the CPFLOAT_* strings are passed through to rpnn.set_precision
# unchanged; they look like CPFloat option names — confirm against rpnn.
parameters_dict.update(
    # Swept: quantised uniform draws with step q=1.
    fp_precision=dict(distribution='q_uniform', q=1, min=4, max=24),
    fp_emax=dict(distribution='q_uniform', q=1, min=4, max=127),

    # Fixed for every run.
    fp_subnormal=dict(value='CPFLOAT_SUBN_USE'),

    # Swept: one of the two listed rounding modes per run.
    fp_round=dict(values=['CPFLOAT_RND_SP', 'CPFLOAT_RND_SE']),

    # Fixed for every run.
    fp_flip=dict(value='CPFLOAT_NO_SOFTERR'),
    fp_explim=dict(value='CPFLOAT_EXPRANGE_TARG'),
    fp_p=dict(value=0.0),
)

In [6]:
# reinforcement learning (RL) params
# Every entry is a constant (not swept); each is wrapped as {'value': ...}.
# NOTE(review): train() stops after roughly 100000 environment steps, which is
# far below exploration_steps and only twice replay_start_size — confirm these
# values are intended for that budget.
parameters_dict.update({
    rl_name: {'value': rl_constant}
    for rl_name, rl_constant in (
        ('epsilon0', 1.0),
        ('epsilon1', 0.1),
        ('exploration_steps', 1000000),
        ('epsilon', 1.0),
        ('gamma', 0.99),
        ('alpha', 1.0),
        ('target_update_frequency', 10000),
        ('replay_start_size', 50000),
        ('action_repeat', 1),
    )
})

In [7]:
# neural network params
# All constants (not swept). train() converts layer_sizes back to a tuple
# before handing it to rpnn.
parameters_dict.update(
    layer_sizes=dict(value=(4, 16, 16, 16, 2)),
    epochs=dict(value=50000),
    minibatch_size=dict(value=32),
    learning_rate=dict(value=0.1),
    weight_decay=dict(value=0.001),
    dropout_keep_p=dict(value=1),
    momentum_coefficient=dict(value=0),
)

In [8]:
# Attach the parameter definitions collected in parameters_dict to the sweep.
sweep_config['parameters'] = parameters_dict

In [9]:
def train():
    """Run one sweep trial: configure rpnn from the W&B config and play CartPole.

    Starts a W&B run, forwards the run's config values to rpnn (precision,
    reinforcement-learning and neural-network settings), then plays
    'CartPole-v1' episodes until finished episodes have consumed at least
    100000 environment steps.

    After every finished episode it logs:
      * 'timesteps'          - length of that episode in steps
      * 'averaged_timesteps' - exponential moving average of episode length
                               (0.95 * previous + 0.05 * latest, starting at 0)

    Takes no arguments and returns nothing, as required for a function passed
    to wandb.agent. The environment is closed and the W&B run is finished even
    if the trial raises.
    """
    # Using the run as a context manager finishes it on every exit path, so a
    # crashing trial cannot leave an open run behind for the next agent trial.
    with wandb.init() as run:
        config = run.config

        rpnn.set_precision(
            fp_precision=config["fp_precision"],
            fp_emax=config["fp_emax"],
            fp_subnormal=config["fp_subnormal"],
            fp_round=config["fp_round"],
            fp_flip=config["fp_flip"],
            fp_p=config["fp_p"],
            fp_explim=config["fp_explim"])

        rpnn.set_reinforcement_params(
            epsilon0=config["epsilon0"],
            epsilon1=config["epsilon1"],
            exploration_steps=config["exploration_steps"],
            epsilon=config["epsilon"],
            gamma=config["gamma"],
            alpha=config["alpha"],
            target_update_frequency=config["target_update_frequency"],
            replay_start_size=config["replay_start_size"],
            action_repeat=config["action_repeat"])

        rpnn.set_neuralnet_params(
            # The logged config shows layer_sizes as a list; convert it back.
            layer_sizes=tuple(config["layer_sizes"]),
            epochs=config["epochs"],
            minibatch_size=config["minibatch_size"],
            learning_rate=config["learning_rate"],
            weight_decay=config["weight_decay"],
            dropout_keep_p=config["dropout_keep_p"],
            momentum_coefficient=config["momentum_coefficient"])

        rpnn.cartpole_init()

        # Total environment-step budget for this trial.
        timesteps_left = 100000
        running_average = 0

        env = gym.make('CartPole-v1')
        try:
            while timesteps_left > 0:
                observation = env.reset()
                reward = 0
                done = False
                # Upper bound on episode length; the episode normally ends via `done`.
                for t in range(100000):
                    # NOTE(review): the meaning of the fourth argument is not
                    # visible here — confirm against rpnn.cartpole.
                    action = rpnn.cartpole(observation.tolist(), reward, done, False)
                    observation, reward, done, info = env.step(action)

                    if done:
                        # NOTE(review): the loop breaks right after the final
                        # env.step, and reward/done are reset before the next
                        # rpnn.cartpole call, so rpnn never receives the
                        # episode-ending transition — confirm this is intended.
                        run_score = t+1
                        running_average = 0.95*running_average + 0.05*run_score
                        wandb.log({'timesteps': run_score, 'averaged_timesteps': running_average})
                        timesteps_left -= run_score
                        break
        finally:
            # Release the environment even when the trial fails part-way.
            env.close()

In [None]:
# Register the sweep definition with W&B and keep the ID it returns.
sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT)
# Run trials of that sweep in this kernel, calling train() for each one; count=100 caps the number of runs.
wandb.agent(sweep_id=sweep_id, project=WANDB_PROJECT, function=train, count=100)

Create sweep with ID: g1i7wgy6
Sweep URL: https://wandb.ai/lubomirkurcak/cartpole/sweeps/g1i7wgy6


wandb: Agent Starting Run: 6c1t6k8h with config:
wandb: 	action_repeat: 1
wandb: 	alpha: 1
wandb: 	dropout_keep_p: 1
wandb: 	epochs: 50000
wandb: 	epsilon: 1
wandb: 	epsilon0: 1
wandb: 	epsilon1: 0.1
wandb: 	exploration_steps: 1000000
wandb: 	fp_emax: 109
wandb: 	fp_explim: CPFLOAT_EXPRANGE_TARG
wandb: 	fp_flip: CPFLOAT_NO_SOFTERR
wandb: 	fp_p: 0
wandb: 	fp_precision: 21
wandb: 	fp_round: CPFLOAT_RND_SP
wandb: 	fp_subnormal: CPFLOAT_SUBN_USE
wandb: 	gamma: 0.99
wandb: 	layer_sizes: [4, 16, 16, 16, 2]
wandb: 	learning_rate: 0.1
wandb: 	minibatch_size: 32
wandb: 	momentum_coefficient: 0
wandb: 	replay_start_size: 50000
wandb: 	target_update_frequency: 10000
wandb: 	weight_decay: 0.001
wandb: wandb version 0.12.11 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
timesteps,18.0
averaged_timesteps,19.72271
_runtime,169.0
_timestamp,1646777048.0
_step,4488.0


0,1
timesteps,▃█▂▁▂▁▂▄▂▇▁▂▂▁▃▂▂▂▁▂▃▄▂▄▁▁▄▁▃▂▁▃▃▂▂▃▂▂▂▄
averaged_timesteps,▃▆▃▄▅▅▄▅▄▆▃▂▃▅▄▄▅▆▃▄▅▅▆█▁▅▄▂▄▃▅█▄▇▅▆▅▃▃▄
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███


wandb: Agent Starting Run: z4vbek90 with config:
wandb: 	action_repeat: 1
wandb: 	alpha: 1
wandb: 	dropout_keep_p: 1
wandb: 	epochs: 50000
wandb: 	epsilon: 1
wandb: 	epsilon0: 1
wandb: 	epsilon1: 0.1
wandb: 	exploration_steps: 1000000
wandb: 	fp_emax: 116
wandb: 	fp_explim: CPFLOAT_EXPRANGE_TARG
wandb: 	fp_flip: CPFLOAT_NO_SOFTERR
wandb: 	fp_p: 0
wandb: 	fp_precision: 17
wandb: 	fp_round: CPFLOAT_RND_SP
wandb: 	fp_subnormal: CPFLOAT_SUBN_USE
wandb: 	gamma: 0.99
wandb: 	layer_sizes: [4, 16, 16, 16, 2]
wandb: 	learning_rate: 0.1
wandb: 	minibatch_size: 32
wandb: 	momentum_coefficient: 0
wandb: 	replay_start_size: 50000
wandb: 	target_update_frequency: 10000
wandb: 	weight_decay: 0.001
wandb: wandb version 0.12.11 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
timesteps,34.0
averaged_timesteps,21.83702
_runtime,343.0
_timestamp,1646777396.0
_step,4522.0


0,1
timesteps,▁▃▂▂▂▂▂▂▃▂▂▂▂▂▁▂▂▃▂▂▁▂▁▇▁▁▂▁▂█▃▁▂▂▂▂▃▂▅▃
averaged_timesteps,▂▅▇▃▄▅▅▄▄█▅▅▂▂▃▅▅▄▄▃▂▆▂▅▄▃▆▆▃▆▅▅▃▃▁▄▇▄▄▆
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███


wandb: Agent Starting Run: d9lnbuxs with config:
wandb: 	action_repeat: 1
wandb: 	alpha: 1
wandb: 	dropout_keep_p: 1
wandb: 	epochs: 50000
wandb: 	epsilon: 1
wandb: 	epsilon0: 1
wandb: 	epsilon1: 0.1
wandb: 	exploration_steps: 1000000
wandb: 	fp_emax: 85
wandb: 	fp_explim: CPFLOAT_EXPRANGE_TARG
wandb: 	fp_flip: CPFLOAT_NO_SOFTERR
wandb: 	fp_p: 0
wandb: 	fp_precision: 20
wandb: 	fp_round: CPFLOAT_RND_SE
wandb: 	fp_subnormal: CPFLOAT_SUBN_USE
wandb: 	gamma: 0.99
wandb: 	layer_sizes: [4, 16, 16, 16, 2]
wandb: 	learning_rate: 0.1
wandb: 	minibatch_size: 32
wandb: 	momentum_coefficient: 0
wandb: 	replay_start_size: 50000
wandb: 	target_update_frequency: 10000
wandb: 	weight_decay: 0.001
wandb: wandb version 0.12.11 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
