In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%cd ..

/home/nassim/dev/conrl


In [3]:
from src.conrl import ConRL
from src.qlearning import QLearningAgent
from src.utils import *

import time
import sys
from collections import defaultdict
import itertools

import numpy as np
import pandas as pd
import wandb
import gym

import matplotlib.pyplot as plt
import matplotlib.collections as mc

%matplotlib inline
plt.style.use('seaborn')
np.set_printoptions(precision=3, linewidth=80)

In [4]:
wandb.login()

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mdodicin[0m (use `wandb login --relogin` to force relogin)


True

In [5]:
env = gym.make('MountainCar-v0')
state_size = (10, 10)
window_size = (env.observation_space.high - env.observation_space.low)/state_size
num_episodes = 500
max_step = 1000
env._max_episode_steps = max_step

q_params = {
    "gamma": 0.9,
    "alpha": 0.1,
    "alpha_decay_rate": 0,
    "min_alpha": 0.1,
    "epsilon": 1.0,
    "epsilon_decay_rate": 0,
    "min_epsilon": 0.1
}

q_params["epsilon_decay_rate"] = (q_params["epsilon"] - q_params["min_epsilon"])/(num_episodes//2)
q_params["alpha_decay_rate"] = (q_params["alpha"] - q_params["min_alpha"])/(num_episodes//2)

mlgng_params = {
    "ndim": 2, 
    "e_w":0.5, 
    "e_n":0.1, 
    "l":10, 
    "a":0.5, 
    "b":1-0.05, # Java impl. does it like this
    "k":1000.0, 
    "max_nodes": 10, 
    "max_age": 10
}

In [6]:
# For plotting
act_dict = {
    0: "Push left",
    1: "No push",
    2: "Push right"
}

act_symbol = {
    0:  "o",
    1: "^",
    2: "s",
}

act_color = cm.Dark2(np.linspace(0.1, 1, 3, endpoint=False))

In [11]:
run = wandb.init(
  entity="dodicin",
  project="con-rl",
  notes="test",
  tags=["q-learning", "mlgng"],
  config={"q_params": q_params,
        "mlgng_params": mlgng_params})

def wandb_log():
   wandb.log({
        'reward': stats_cr.episode_rewards[episode], 
        'steps': stats_cr.episode_lengths[episode],
        'selector': np.mean(stats_cr.selector_dist[episode]),
        'global_error': conrl.mlgng.get_last_stat_tuple("global_error")
        })

    data = conrl.mlgng.get_nodes()
    if data.shape[1]>1:
        table = wandb.Table(data=data.T.tolist(), columns = ["position", "velocity", "action"])
        wandb.log({"nodes" : wandb.plot.scatter(table, "position", "velocity", "action")})

    length = state_size[0]*state_size[1]
    conrl_state_actions = np.zeros((length, 3))

    for idx in range(length):
        state = np.unravel_index(idx, state_size)
        best_a, _, _, _ = conrl._simple_action_selector(state)
        conrl_state_actions[idx] = state + (best_a, )

    table = wandb.Table(data=conrl_state_actions.tolist(), columns = ["position", "velocity", "action"])
    wandb.log({"best_actions" : wandb.plot.scatter(table, "position", "velocity", "action")})

[34m[1mwandb[0m: wandb version 0.10.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [12]:
stats_cr = EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        selector_dist=np.zeros((num_episodes, max_step)).astype(int))

conrl = ConRL(action_size=env.action_space.n, state_size=state_size, update_threshold=10)
conrl.init_support(**q_params)
conrl.init_mlgng(**mlgng_params)

for episode in range(num_episodes):
    done = False
    success = False
    step = 0

    start = time.time()
    obs = env.reset()

    state = get_discrete_state(obs, window_size, env)
    while not done:
        next_state, reward, done, selected = conrl.step(state, env, window_size=window_size, discretize=get_discrete_state)
        state = next_state
        
        # Stats logging
        stats_cr.episode_rewards[episode] += reward
        stats_cr.episode_lengths[episode] = step
        stats_cr.selector_dist[episode][step] = selected

        step+=1
        if step >= max_step:
            break
    
    conrl.support.decay_epsilon(episode)

    # Wandb logging
    # wandb_log()

    if episode % 100 == 0:
        print("Episode {}/{}, Reward {}, Total steps {}, Epsilon: {:.2f}, Alpha: {:.2f}, Time {:.3f}".format(episode, num_episodes, stats_cr.episode_rewards[episode], stats_cr.episode_lengths[episode], conrl.support.epsilon, conrl.support.alpha, time.time()-start))
        conrl.mlgng.print_stats(one_line=True)

Episode 0/500, Reward -1000.0, Total steps 999.0, Epsilon: 1.00, Alpha: 0.10, Time 0.211
	 MLGNG nodes per action layer: 0 0 0
Episode 100/500, Reward -1000.0, Total steps 999.0, Epsilon: 0.64, Alpha: 0.10, Time 1.051
	 MLGNG nodes per action layer: 4 3 3
Episode 200/500, Reward -450.0, Total steps 449.0, Epsilon: 0.28, Alpha: 0.10, Time 0.597
	 MLGNG nodes per action layer: 10 10 10
Episode 300/500, Reward -160.0, Total steps 159.0, Epsilon: 0.10, Alpha: 0.10, Time 0.395
	 MLGNG nodes per action layer: 10 10 9
Episode 400/500, Reward -391.0, Total steps 390.0, Epsilon: 0.10, Alpha: 0.10, Time 0.823
	 MLGNG nodes per action layer: 10 10 10


In [13]:
 run.finish()

0,1
reward,-233.0
steps,232.0
selector,0.233
_step,1488.0
_runtime,457.0
_timestamp,1605010997.0


0,1
reward,▁▁▁▁▁▁▁▁▁█▁▁▁▁▆▆▇▇▁▆▇▆▄▁▇▆██▇▅▇█▅█▁█▆▇▇█
steps,█████████▁████▃▃▂▂█▃▂▃▅█▂▃▁▁▂▄▂▁▄▁█▁▃▂▂▁
selector,▁████████▂████▄▄▃▃█▄▃▄▅█▃▃▂▂▃▅▃▂▅▂█▂▄▃▃▂
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
