In [1]:
import sys
sys.path.append('../../pyutils')

import gym
import numpy as np
from sklearn.linear_model import SGDRegressor

import rlutils

In [2]:
env_cp = gym.make('CartPole-v0')

# Random Search

In [3]:
class LinearBinAgent:
    
    def __init__(self, params):
        self.params = params
        
    def __call__(self, s, _):
        return int(s @ self.params > 0)

def random_search(env, nevals, niters):
    space_len = len(env.observation_space.low)
    best_params = None
    best_val = float('-inf')
    agent = LinearBinAgent(best_params)
    
    for _ in range(niters):
        agent.params = np.random.randn(space_len)
        val = rlutils.run_nepisodes(env, agent, nevals).mean()
        if val > best_val:
            best_val = val
            best_params = agent.params
    
    return best_params
    

rand_params = random_search(env_cp, 20, 20)
agent = LinearBinAgent(rand_params)
scores = rlutils.run_nepisodes(env_cp, agent, 1000)
print('CP: min = {}, avg = {}, max = {}'.format(scores.min(),
                                               scores.mean(),
                                               scores.max()))

CP: min = 87.0, avg = 171.307, max = 200.0


# Value function (prediction) approximation

Function approximators of any kind, must have a few properties:
- diferentiable
- suitable for non-stationary, non-iid data

Let's define $J(w)$ a differentable function of parameters $w$, that is the mean squared error between our approximator $\hat{v}(s,w)$ and the true value $v_\pi(s)$:
$$J(w) = \mathbb{E}_\pi[(v_\pi(S) - \hat{v}(S,w))^2]$$

We represent the current state $S$ by a feature vector of size $p$

We use mini-batch versions of gradient descent, where we compute $J(w)$ for only one example and update $w$.  
But we don't have the true value $v_\pi(S)$, so we use instead another target, depending on the algorithm

## Monte-Carlo Policy Evaluation

For Monte-Carlo, the target is the return $G_t$.  
Converges to a local optimum, even with a non-linear model (MC target is unbiased)

In [4]:
def mc_policy_eval(env, agent, ngames):
    
    model = SGDRegressor()
    errs = []
    t = 0
    
    for _ in range(ngames):
        hist = []
        s = env.reset()
    
        while True:
            a = agent(s, env)
            s2, r, done, _ = env.step(a)
            hist.append((s, r))
            if done: break
            s = s2
        
        gt = 0
        for h in reversed(hist):
            t += 1
            gt += h[1]
            X = h[0].reshape(1, -1)
            y = np.array(gt).reshape(1)
            model.partial_fit(X, y)
            
            err = (model.predict(X) - y)**2
            errs.insert(0, err)
            if len(errs) > 1000:
                errs = errs[:-1]
                    
            if t % 2500 == 0:
                avg_err = np.average(errs)
                print(avg_err)
            
            
            
    return model
    
mc_policy_eval(env_cp, agent, 1000)

2222.9839788157083
2668.7041224608897
2455.020153469477
2915.9490496099256
2685.224383716276
1859.5986423923582
3302.362172150254
2931.7617508853823
2952.5983221632086
3068.6553344258436
2744.6489113341922
2381.9597832069917
2532.4456062821523
3003.8020752743646
2656.779099059067
2502.250387491962
2200.384130671096
2676.3129997521937
2821.45419561345
2708.7754689923127
2735.6461625692846
2853.1262552445146
2595.6876204834657
3126.4357920243683
2419.029046890921
2900.285692075311
2510.351061496944
2949.523381632543
2751.023871900214
3172.4880944726624
3029.145226140337
2894.52079625702
2848.699167040335
3053.6215376844184
2904.504598180444
3224.403642074639
2806.677350343172
2793.7601456128373
2783.0157752818286
2689.4701124823705
2986.037927008752
2878.628764174217
2958.810034855643
2886.193644493987
2566.217547400328
2188.264772624148
3012.3383780057998
2702.709973465776
2940.009285085867
1915.0527416449168
1963.0576184094054
3005.7025805025683
2737.7190329642744
2585.9505406365342
30

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

## TD(0) Policy Evaluation

The target is $R_{t+1} + \gamma \hat{v}(S_{t+1}, w)$
The TD target is biased.  
Linear TD(0) converges (close) to a global optimum

In [5]:
def td0_policy_eval(env, agent, ngames, gamma):
    
    model = SGDRegressor()
    errs = []
    t = 0
    
    for _ in range(ngames):
        s = env.reset()
    
        while True:
            a = agent(s, env)
            s2, r, done, _ = env.step(a)

            X = s.reshape(1, -1)
            y = r + gamma * model.predict(s2)
            model.partial_fit(X, y)
            
            err = (model.predict(X) - y)**2
            errs.insert(0, err)
            if len(errs) > 1000:
                errs = errs[:-1]   
            t += 1
            if t % 2500 == 0:
                avg_err = np.average(errs)
                print(avg_err)
            
            if done: break
            s = s2
            
            
    return model
    
mc_policy_eval(env_cp, agent, 1000)

2929.3806627094577
2536.069613915791
2716.9955856298675
2659.467106828778
3097.8476502904955
2685.2702800208676
3059.4171491642055
2941.128630659144
2706.5979398089053
2835.3461886174373
2764.4115695295773
2708.51249682176
2716.940152961865
2682.3102400146463
2193.845104487695
2971.5180241913886
3083.6943362615257
3109.3212484194655
3134.8465989047213
2861.682462079065
2648.3090971474703
3099.1919961558374
3163.758287217965
2314.532254367175
3029.907757051923
2905.28825745116
3077.177161614406
2212.0198864359677
3104.0254175961686
3432.6158533678235
2649.624086969636
3189.5057555684
2879.5580969364296
2834.3751645614657
3490.4156355703985
3093.590711235646
2364.7727391396566
2379.5455121755617
2564.6332956206006
2226.573904261301
3237.4471613068204
3168.064927317947
2659.740145081059
2704.7619903393415
2969.9549880612353
2589.176294028946
3151.644104341038
3117.8428806035004
2355.7625943214184
2825.629133007976
2663.154494282099
3180.0152240084553
2941.929359821294
3147.7635246222812
3

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

## TD($\lambda$) policy evaluation

The target is the $\lambda$-return $G^\lambda_t$

The backward view can also be used with eligibilaty traces

# Control with function approximation

Repeat:
- Evaluation: Approximate $q_\pi$ with $\hat{q}(s,a,w)$
- Improvement: act with this new policy to chose actions

The error function now becomes:

$$J(w) = \mathbb{E}_\pi[(q_\pi(S,A) - \hat{q}(S,A,w))^2]$$

We represent the state and action (S,A) by a feature vector.

We replace $q_\pi(S,A)$ by another target depending on the algorithm.  

Most control algorithms have no theoric garanties of convergence, you may end up turn around the optimal policy, or even diverges from it.

# Batch Methods

## Least Squares Prediction

Let $\mathcal{D}$ our experience consisting of (state, value) pairs:
$$\mathcal{D} = \{ (s_1, v_1^\pi), (s_2, v_2^\pi), \text{...}, (s_T, v_T^\pi) \}$$

We can find the least squares solution that minimizes the mse between our prediction and the true state values:

$$LS(w) = \sum_{t=1}^T (v_t^\pi - \hat{v}(s_t, w))^2$$

Experience Replay:  
Repeat:
- Store the new visited (state, value) pair in $\mathcal{D}$
- sample one $(s, v^\pi) \sim \mathcal{D}$ and update $\hat{v}$ with gradient descent.

Helps decorelate things, experiences seen in random order. Converges to least squares solution

# Deep Q Network

Combine 2 ideas:

- Experience replay:  decorelates the trajectories, get much more stable updates.  
- fixed Q-targets: Have 2 networks, freeze the old network weights, bootstrap toward these fixed targets. Every few thousands iteration, the old network weights are updated with the new one. Actions are chosen according to the latest targets.

Repeat:
- Take action $a_t$ according to $\epsilon$-greedy policy
- Store transition $(s_t, a_t, r_{t+1}, s_{t+1}$ in $\mathcal{D}$
- Sample mini-batch $(s, a, r, s') \sim \mathcal{D}$ (eg size 64)
- Optimise MSE between prediction of frozen network and current prediction:
    $$L(w) = \mathbb{E}[(r + \gamma \max_{a'} Q(s',a',w^-) - Q(s,a,w))^2]$$

# Linear Least Squares Prediction

Experiences replay find least squares solution on many iterations, but we can solve in closed form.

The solution is the same than for classical least squares:

$$w = (S^TS)^{-1} S^Tv_\pi$$

with $S$ a matrix whose rows are state-vectors, and $v_\pi$ a vector of corresponding value functions.

# Least Squares Policy Iteration

Repeat:
- Policy evalutation: fit $q(S,A)$ using least squares solution
- Policy improvement: greedy policy