In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import gym
import numpy as np
from tqdm.auto import tqdm, trange

from car import Agent, BasisFunction, Hyperparameters, train_episode, WeightedFunction

<IPython.core.display.Javascript object>

In [3]:
# Single Gym environment instance shared by every cell below: it is handed to
# BasisFunction, Agent and train_episode.
environment = gym.make("MountainCar-v0")

<IPython.core.display.Javascript object>

In [4]:
def make_weighted(coef, weight):
    """Build one WeightedFunction term using the notebook's fixed hyperparameters.

    Args:
        coef: Sequence of basis-function coefficients; it is converted with
            ``np.array`` before being passed to ``BasisFunction``.
        weight: Value passed through as the term's ``weight``.

    Returns:
        A new ``WeightedFunction`` bound to the shared ``environment`` with its
        ``eligibility`` set to 0.
    """
    # NOTE(review): these hyperparameter values are repeated verbatim in
    # make_guess; presumably the two copies are meant to stay in sync.
    return WeightedFunction(
        basis_function=BasisFunction(
            environment=environment, coefficients=np.array(coef)
        ),
        hyperparameters=Hyperparameters(
            discount=0.99,
            forgetting=0.99,
            learning_rate=0.5,
        ),
        weight=weight,
        eligibility=0,
    )

def make_guess(offset, skew):
    """Build an Agent whose weights are a hand-written guess set by two numbers.

    Actions 0 and 2 receive mirrored terms, and action 1 receives none:

    * ``[0, 0]`` term: ``max(offset, 0)`` for action 0 and ``max(-offset, 0)``
      for action 2, so only one of the two is non-zero for a given ``offset``.
    * ``[1, 0]`` term: ``skew`` for action 0 and ``-skew`` for action 2.
    * ``[0, 1]`` term: fixed at ``-1`` for action 0 and ``1`` for action 2.

    Args:
        offset: Signed bias routed to the ``[0, 0]`` term of action 0 (when
            positive) or action 2 (when negative).
        skew: Weight of the ``[1, 0]`` term for action 0; action 2 gets its
            negation.

    Returns:
        A new ``Agent`` bound to the shared ``environment`` with
        ``random_action_probability=0.05``.
    """
    # NOTE(review): the dict keys are presumably MountainCar-v0 action indices
    # (0 = push left, 1 = no push, 2 = push right) and the coefficient pairs
    # presumably address (position, velocity) -- confirm against the `car`
    # module and the Gym documentation.
    return Agent(
        environment=environment,
        hyperparameters=Hyperparameters(
            discount=0.99,
            forgetting=0.99,
            learning_rate=0.5,
        ),
        random_action_probability=0.05,
        weighted_functions={
            0: [
                make_weighted([0, 0], max(offset, 0)),
                make_weighted([1, 0], skew),
                make_weighted([0, 1], -1),
            ],
            1: [],
            2: [
                make_weighted([0, 0], max(-offset, 0)),
                make_weighted([1, 0], -skew),
                make_weighted([0, 1], 1),
            ],
        },
    )

<IPython.core.display.Javascript object>

In [5]:
# Coarse screen over (offset, skew): keep every pair whose freshly built agent
# finishes one training episode with a total reward above -200.
# Each pair is judged on a single episode, so this is only a noisy pre-filter;
# the next cell re-scores the survivors over 100 episodes each.
# NOTE(review): -200 is presumably the return of an episode that runs into
# MountainCar-v0's step limit without reaching the goal -- confirm.
search_grid = np.linspace(-0.5, 0.5)  # numpy's default of 50 evenly spaced points
viable = [
    [offset, skew]
    for offset in tqdm(search_grid)
    for skew in search_grid
    # train_episode returns (agent, total_reward); only the reward is needed.
    if train_episode(environment, agent=make_guess(offset, skew))[1] > -200
]

viable

  0%|          | 0/50 [00:00<?, ?it/s]

[[-0.47959183673469385, 0.33673469387755095],
 [-0.35714285714285715, 0.2142857142857142],
 [-0.2755102040816327, 0.4591836734693877],
 [-0.1938775510204082, 0.15306122448979587],
 [-0.15306122448979592, -0.030612244897959218],
 [-0.11224489795918369, -0.05102040816326536],
 [-0.0918367346938776, -0.010204081632653073],
 [-0.07142857142857145, 0.15306122448979587],
 [-0.030612244897959218, -0.05102040816326536],
 [-0.030612244897959218, 0.29591836734693866],
 [-0.030612244897959218, 0.3163265306122448],
 [0.2142857142857142, -0.1938775510204082],
 [0.23469387755102034, 0.15306122448979587],
 [0.29591836734693866, 0.15306122448979587],
 [0.3571428571428571, 0.3979591836734693],
 [0.3979591836734693, 0.11224489795918358],
 [0.4183673469387754, -0.17346938775510207],
 [0.4183673469387754, 0.0714285714285714],
 [0.4183673469387754, 0.19387755102040816],
 [0.43877551020408156, -0.0918367346938776],
 [0.47959183673469385, -0.3163265306122449],
 [0.5, 0.2551020408163265]]

<IPython.core.display.Javascript object>

In [6]:
# Re-score every surviving (offset, skew) pair: average the total reward of 100
# independent single-episode runs, each starting from a freshly built agent.
scores = {
    (offset, skew): np.mean(
        [
            train_episode(environment, agent=make_guess(offset, skew))[1]
            for _ in range(100)
        ]
    )
    for offset, skew in tqdm(viable)
}
# Pick the pair with the highest mean reward (first one wins on ties).
best_params = max(scores, key=scores.get)
print(
    f"Best offset={best_params[0]}, skew={best_params[1]}, score={scores[best_params]}"
)

  0%|          | 0/22 [00:00<?, ?it/s]

Best offset=0.5, skew=0.2551020408163265, score=-199.31


<IPython.core.display.Javascript object>

In [7]:
# Estimate how often the best guess beats the -200 reward threshold: run 1000
# single-episode trials from fresh agents and average the boolean outcomes.
episode_succeeded = [
    train_episode(environment, agent=make_guess(*best_params))[1] > -200
    for _ in trange(1000)
]
success_frequency = np.mean(episode_succeeded)
success_frequency

  0%|          | 0/1000 [00:00<?, ?it/s]

0.007

<IPython.core.display.Javascript object>