# Value Function Approximation

In [1]:
import gym
import numpy as np
from sklearn.linear_model import SGDRegressor

In [2]:
# Shared CartPole environment instance, reused by the cells below.
env_cp = gym.make('CartPole-v0')

### Random Search

In [3]:
def eval_episode(env, params):
    """Play one episode with a linear threshold policy and return its total reward.

    At every step the action is 1 when dot(state, params) is strictly
    positive and 0 otherwise. The episode runs until `env.step` reports
    that it is done.

    Args:
        env: object exposing `reset()` -> state and
            `step(action)` -> (state, reward, done, info).
        params: weight vector that is dot-compatible with the states.

    Returns:
        Sum of the rewards collected during the episode.
    """
    obs = env.reset()
    total = 0
    finished = False

    while not finished:
        # Threshold the linear score to pick one of the two actions.
        action = int(np.dot(obs, params) > 0)
        obs, reward, finished, _ = env.step(action)
        total += reward

    return total

def eval_episodes(env, params, n):
    """Return the mean score of `params` over `n` calls to `eval_episode`.

    Args:
        env: environment forwarded unchanged to `eval_episode`.
        params: policy weights forwarded unchanged to `eval_episode`.
        n: number of episodes to play; the summed score is divided by it.
    """
    total = sum(eval_episode(env, params) for _ in range(n))
    return total / n

def random_search(env, nevals, niters):
    """Return the best of `niters` randomly sampled linear policies.

    Each candidate is a weight vector drawn from a standard normal
    distribution and scored with `eval_episodes` (mean score over
    `nevals` episodes). The candidate with the strictly highest score
    wins; on a tie the earlier candidate is kept.

    Args:
        env: environment to search on. Its `observation_space.low`
            determines the length of the weight vector.
        nevals: number of episodes used to score each candidate.
        niters: number of candidates to sample.

    Returns:
        The best weight vector found, or None when `niters` is 0.
    """
    # Size the weight vector from the environment that was actually passed
    # in, not from a notebook-level global, so the search works for any env.
    space_len = len(env.observation_space.low)
    best_params = None
    best_val = float('-inf')

    for _ in range(niters):
        params = np.random.randn(space_len)
        val = eval_episodes(env, params, nevals)
        if val > best_val:
            best_val = val
            best_params = params

    return best_params
    

# Random search: 20 candidate weight vectors, each scored by its mean score over 100 episodes.
rand_params = random_search(env_cp, 100, 20)
# Re-score the selected weights over 1000 further episodes and print the mean.
print(eval_episodes(env_cp, rand_params, 1000))

173.193


In [4]:
class LinearAgent:
    """Deterministic policy that thresholds a linear score of the state."""

    def __init__(self, params):
        # Weight vector; must be dot-compatible with the states given to get_action.
        self.params = params

    def get_action(self, state):
        """Return 1 when dot(state, params) is strictly positive, otherwise 0."""
        activation = np.dot(state, self.params)
        return int(activation > 0)

# Wrap the weights found by random search in an agent object.
rand_agent = LinearAgent(rand_params)

### Monte-Carlo Policy Evaluation

In [5]:
def mc_policy_eval(env, agent, ngames, model=None, window=1000, report_every=2500):
    """Fit a regression model of state -> Monte-Carlo return for `agent`.

    For each of `ngames` episodes the agent is rolled out in `env`, then the
    visited states are replayed in reverse order so the undiscounted
    reward-to-go can be accumulated incrementally. Every (state, return)
    pair is fed to the model with `partial_fit`, one sample at a time.

    Once the model has been fitted at least once (it exposes `coef_`), the
    squared prediction error of each sample is recorded *before* the model
    is updated on that sample. The mean of the most recent `window` errors
    is printed every `report_every` processed samples.

    Args:
        env: object exposing `reset()` -> state and
            `step(action)` -> (state, reward, done, info).
        agent: object exposing `get_action(state)`.
        ngames: number of episodes to roll out.
        model: optional estimator exposing `partial_fit(X, y)` and
            `predict(X)`; a fresh `SGDRegressor()` is created when omitted.
        window: number of most recent squared errors kept for the report.
        report_every: print the running error every this many samples.

    Returns:
        The fitted model.

    Raises:
        ValueError: if `window` or `report_every` is smaller than 1.
    """
    from collections import deque

    if window < 1:
        raise ValueError("window must be at least 1")
    if report_every < 1:
        raise ValueError("report_every must be at least 1")

    if model is None:
        model = SGDRegressor()

    # Bounded buffer: the oldest error is dropped automatically once full.
    recent_errs = deque(maxlen=window)
    t = 0  # total number of (state, return) samples processed so far

    for _ in range(ngames):
        hist = []
        s = env.reset()

        # Roll out one episode, remembering each state and the reward that followed it.
        while True:
            a = agent.get_action(s)
            s2, r, done, _ = env.step(a)
            hist.append((s, r))
            if done:
                break
            s = s2

        # Walk the episode backwards so the return can be built up step by step.
        gt = 0
        for state, reward in reversed(hist):
            t += 1
            gt += reward
            X = np.asarray(state).reshape(1, -1)
            y = np.array(gt).reshape(1)

            # Only score once the model has been fitted at least once.
            if hasattr(model, 'coef_'):
                pred = float(model.predict(X)[0])
                recent_errs.append((pred - gt) ** 2)

                if t % report_every == 0:
                    print(np.mean(list(recent_errs)))

            model.partial_fit(X, y)

    return model
    
# Fit a model for rand_agent from 1000 episodes; as the last expression of the cell,
# the returned model is also displayed.
mc_policy_eval(env_cp, rand_agent, 1000)

2581.1303337730683
2979.6153222180064
2916.907838353903
3168.62935977055
2544.119967973882
2497.628668954892
3007.1863764520526
3041.5115940508626
2617.0269577897407
2326.8572142882504
2339.346790498743
2980.76642087632
3293.7857722817575
2776.696410838806
2768.510575879815
2390.3746318024228
2746.7058138191533
2440.213038679841
2707.3482849840343
2448.0002729960943
2346.0587353579126
2573.4307564497103
3128.2397105527402
2590.458034702129
3262.5552125326562
2853.164975976686
2686.217303295113
2704.4355294467377
2731.4641719718857
2511.3752339445114
2692.1491297501207
2767.523700423666
2836.8820328576653
2865.798935943203
3069.0103344934237
2825.15269942996
2872.3911674859187
2919.9872568055694
2985.4696217113724
2864.439122448238
2310.3512011114694
2651.274735135973
2635.570201959676
2485.0916654924386
2687.7401538396684
2718.6698954374547
2991.136290135467
2624.196095880515
2644.431129169888
2309.087876197769
2834.1214207452285
2756.4271485843983
2094.3853093429984
2897.0662621176607

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)