In [1]:
import sys
sys.path.append('..')

import gym
import numpy as np
from sklearn.linear_model import SGDRegressor

import rlutils

In [2]:
env_cp = gym.make('CartPole-v0')

# Random Search

In [10]:
class LinearBinAgent:
    
    def __init__(self, params):
        self.params = params
        
    def __call__(self, s, _):
        return int(s @ self.params > 0)

def random_search(env, nevals, niters):
    space_len = len(env.observation_space.low)
    best_params = None
    best_val = float('-inf')
    agent = LinearBinAgent(best_params)
    
    for _ in range(niters):
        agent.params = np.random.randn(space_len)
        val = rlutils.run_nepisodes(env, agent, nevals).mean()
        if val > best_val:
            best_val = val
            best_params = agent.params
    
    return best_params
    

rand_params = random_search(env_cp, 20, 20)
agent = LinearBinAgent(rand_params)
scores = rlutils.run_nepisodes(env_cp, agent, 1000)
print('CP: min = {}, avg = {}, max = {}'.format(scores.min(),
                                               scores.mean(),
                                               scores.max()))

CP: min = 35.0, avg = 86.185, max = 200.0


# Value function (prediction) approximation

Function approximators of any kind, must have a few properties:
- diferentiable
- suitable for non-stationary, non-iid data

Let's define $J(w)$ a differentable function of parameters $w$, that is the mean squared error between our approximator $\hat{v}(s,w)$ and the true value $v_\pi(s)$:
$$J(w) = \mathbb{E}_\pi[(v_\pi(S) - \hat{v}(S,w))^2]$$

We represent the current state $S$ by a feature vector of size $p$

We use mini-batch versions of gradient descent, where we compute $J(w)$ for only one example and update $w$.  
But we don't have the true value $v_\pi(S)$, so we use instead another target, depending on the algorithm

## Monte-Carlo Policy Evaluation

For Monte-Carlo, the target is the return $G_t$.  
Converges to a local optimum, even with a non-linear model (MC target is unbiased)

In [14]:
def mc_policy_eval(env, agent, ngames):
    
    model = SGDRegressor()
    errs = []
    t = 0
    
    for _ in range(ngames):
        hist = []
        s = env.reset()
    
        while True:
            a = agent(s, env)
            s2, r, done, _ = env.step(a)
            hist.append((s, r))
            if done: break
            s = s2
        
        gt = 0
        for h in reversed(hist):
            t += 1
            gt += h[1]
            X = h[0].reshape(1, -1)
            y = np.array(gt).reshape(1)
            model.partial_fit(X, y)
            
            err = (model.predict(X) - y)**2
            errs.insert(0, err)
            if len(errs) > 1000:
                errs = errs[:-1]
                    
            if t % 2500 == 0:
                avg_err = np.average(errs)
                print(avg_err)
            
            
            
    return model
    
mc_policy_eval(env_cp, agent, 1000)

2829.758752735302
2685.9851884703457
1835.917598220721
3410.1158538863015
1747.4012468590981
1493.0163352441248
924.1118637504383
3325.9360769412006
3371.96418963111
1010.8292515726602
3796.1706153056157
2043.0087477062614
2673.6764985688746
2673.5953357922813
2786.8431350028413
2873.1682410692465
2704.785889418083
2979.576458885066
1965.3967306509744
1767.2035303637804
1800.7121455442812
3484.811832800219
3408.2853806922417
1516.114140470016
2069.7024665759845
1241.6684554080075
1733.8290322347623
2053.947021865621
2057.208426759222
2144.441439891937
3672.199326209127
2734.7904070852446
2145.163943531712
2272.385197598346
3241.992166084059


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

## TD(0) Policy Evaluation

The target is $R_{t+1} + \gamma \hat{v}(S_{t+1}, w)$
The TD target is biased.  
Linear TD(0) converges (close) to a global optimum

In [15]:
def td0_policy_eval(env, agent, ngames, gamma):
    
    model = SGDRegressor()
    errs = []
    t = 0
    
    for _ in range(ngames):
        s = env.reset()
    
        while True:
            a = agent(s, env)
            s2, r, done, _ = env.step(a)

            X = s.reshape(1, -1)
            y = r + gamma * model.predict(s2)
            model.partial_fit(X, y)
            
            err = (model.predict(X) - y)**2
            errs.insert(0, err)
            if len(errs) > 1000:
                errs = errs[:-1]   
            t += 1
            if t % 2500 == 0:
                avg_err = np.average(errs)
                print(avg_err)
            
            if done: break
            s = s2
            
            
    return model
    
mc_policy_eval(env_cp, agent, 1000)

1701.134816367266
3646.335627123368
839.5511643556789
2154.7679940988382
1027.9382945818902
2269.6368781070537
3974.9207544332353
1233.242841629993
1841.0933387292162
2800.099859596742
3295.129335320453
2130.9714386493956
2924.741234742127
1562.9061642140964
2156.2298001248373
1137.373371447198
3391.2598715362287
1787.46950022757
2314.708545602369
2026.2536344007476
2382.971007198548
1739.0297597911301
1555.8770066061688
1947.3102022324301
2187.5696471780816
3159.152537429964
3055.3431110988013
3314.0073389017007
3284.803197901781
1105.6177850455088
3693.09590479369
1971.6611633987106
664.9511510224081
2637.992298591326


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

## TD($\lambda$) policy evaluation

The target is the $\lambda$-return $G^\lambda_t$

The backward view can also be used with eligibilaty traces

# Control with function approximation

Repeat:
- Evaluation: Approximate $q_\pi$ with $\hat{q}(s,a,w)$
- Improvement: act with this new policy to chose actions

The error function now becomes:

$$J(w) = \mathbb{E}_\pi[(q_\pi(S,A) - \hat{q}(S,A,w))^2]$$

We represent the state and action (S,A) by a feature vector.

We replace $q_\pi(S,A)$ by another target depending on the algorithm.  

Most control algorithms have no theoric garanties of convergence, you may end up turn around the optimal policy, or even diverges from it.

# Batch Methods

## Least Squares Prediction

Let $\mathcal{D}$ our experience consisting of (state, value) pairs:
$$\mathcal{D} = \{ (s_1, v_1^\pi), (s_2, v_2^\pi), \text{...}, (s_T, v_T^\pi) \}$$

We can find the least squares solution that minimizes the mse between our prediction and the true state values:

$$LS(w) = \sum_{t=1}^T (v_t^\pi - \hat{v}(s_t, w))^2$$

Experience Replay:  
Repeat:
- Store the new visited (state, value) pair in $\mathcal{D}$
- sample one $(s, v^\pi) \sim \mathcal{D}$ and update $\hat{v}$ with gradient descent.

Helps decorelate things, experiences seen in random order. Converges to least squares solution

# Deep Q Network

Combine 2 ideas:

- Experience replay:  decorelates the trajectories, get much more stable updates.  
- fixed Q-targets: Have 2 networks, freeze the old network weights, bootstrap toward these fixed targets. Every few thousands iteration, the old network weights are updated with the new one. Actions are chosen according to the latest targets.

Repeat:
- Take action $a_t$ according to $\epsilon$-greedy policy
- Store transition $(s_t, a_t, r_{t+1}, s_{t+1}$ in $\mathcal{D}$
- Sample mini-batch $(s, a, r, s') \sim \mathcal{D}$ (eg size 64)
- Optimise MSE between prediction of frozen network and current prediction:
    $$L(w) = \mathbb{E}[(r + \gamma \max_{a'} Q(s',a',w^-) - Q(s,a,w))^2]$$

# Linear Least Squares Prediction

Experiences replay find least squares solution on many iterations, but we can solve in closed form.

The solution is the same than for classical least squares:

$$w = (S^TS)^{-1} S^Tv_\pi$$

with $S$ a matrix whose rows are state-vectors, and $v_\pi$ a vector of corresponding value functions.

# Least Squares Policy Iteration

Repeat:
- Policy evalutation: fit $q(S,A)$ using least squares solution
- Policy improvement: greedy policy