#### RL Course by David Silver

## Lecture 5 Model-Free Control

- On-Policy Monte-Carlo Control
- On-Policy Temporal-Difference Learning
- Off-Policy Learning

In [2]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

## 1 Monte-Carlo Control

<img src="img/04_MC control.png" width=60%>

采用$\epsilon$-greedy方式学习得到的最终策略, 依然是一个$\epsilon$-greedy的策略. 如何保证不会丢掉一个更好的策略, 同时当我们得到当前的最好策略、不想继续探索的时候, 这个策略是一个确定性策略, 而不包含随机行为? 由此引出了GLIE的概念.

GLIE (Greedy in the Limit with Infinite Exploration) 要求学习方法满足两个条件:
- 充分探索: 当迭代次数趋向无穷时, 所有状态-动作对都被探索无数次
- 最优收敛: 策略收敛在一个贪婪策略,不再具有随机性

满足GLIE的方法, 就能够达到我们上述的学习目的. 
对于$\epsilon$-greedy方法而言, 当$\epsilon_k = \frac{1}{k}$ (其中$k$为当前episode的序号) 时, 就满足GLIE.

In [9]:
def improve_greedy_policy(Q, epsilon, nA):
    '''Build an epsilon-greedy policy on top of an action-value table.

    Params:
        Q: mapping from state to an array of action values; it is read at
           call time, so later changes to Q are seen by the returned policy
        epsilon: the greedy action is taken when a uniform draw is below
                 1 - epsilon; otherwise an action is sampled uniformly
        nA: number of actions; random actions are drawn from range(nA)

    Return:
        policy: a function mapping a state to an action index
    '''
    def policy(state):
        # One uniform draw decides between exploiting and exploring.
        draw = np.random.uniform()
        exploit = draw < 1 - epsilon
        if not exploit:
            # Explore: any action, the greedy one included, may be picked.
            return np.random.randint(nA)
        return np.argmax(Q[state])
    return policy

In [None]:
# GLIE Monte-Carlo Control
from collections import deque, Counter, defaultdict


def GLIE_MC(MDP, episods, gamma = 1.):
    '''GLIE Monte-Carlo control using an epsilon-greedy policy with decaying epsilon.

    Episode k is generated with epsilon = 1/k, then every (state, action)
    visit in that episode updates Q with an incremental mean of the observed
    returns (every-visit Monte-Carlo).

    Params:
        MDP: environment object; this function uses len(MDP.action_space),
             MDP.reset() -> state and MDP.step(action) -> (state, reward, done, _).
             States are used as dictionary keys, so they must be hashable.
        episods: number of episodes to run
        gamma: discount factor applied to future rewards

    Return:
        Q, a defaultdict(np.ndarray)
        Q[state][action]: estimated expected return of the state-action pair
    '''
    # The action count is invariant across episodes, so look it up once.
    n_actions = len(MDP.action_space)
    N = defaultdict(lambda: np.zeros(n_actions))  # visit counts per (s, a)
    Q = defaultdict(lambda: np.zeros(n_actions))  # action-value estimates

    for i_episod in range(1, episods+1):
        if i_episod % 100000 == 0:
            print("i_episod:%d/%d"%(i_episod, episods))

        # epsilon_k = 1/k is the GLIE schedule. The policy reads Q by
        # reference, so rebuilding it here (once per episode) is enough to
        # pick up both the new epsilon and the latest value estimates.
        policy = improve_greedy_policy(Q, 1./i_episod, n_actions)

        # Roll out one full episode under the current policy.
        trajectory = []
        s = MDP.reset()
        done = False
        while not done:
            a = policy(s)
            s_n, r, done, _ = MDP.step(a)
            trajectory.append((s, a, r))
            s = s_n

        # Sweep backwards so the return G can be accumulated incrementally.
        G = 0
        for s, a, r in reversed(trajectory):
            G = r + gamma * G
            N[s][a] += 1
            Q[s][a] += (G - Q[s][a])/N[s][a]  # running mean of returns
    return Q

In [6]:
# Sarsa Algorithm
from collections import defaultdict

def Sarsa(MDP, episods, gamma = 1, alpha = 0.1, epsilon = 0.3):
    '''Sarsa algorithm for on-policy TD control.

    After every step the update
        Q(s, a) += alpha * (r + gamma * Q(s', a') - Q(s, a))
    is applied, where a' is the action the epsilon-greedy policy selects in
    s'. On the terminating step the bootstrap term is taken to be 0.

    Params:
        MDP: environment object; this function uses len(MDP.action_space),
             MDP.reset() -> state and MDP.step(action) -> (state, reward, done, _).
             States are used as dictionary keys, so they must be hashable.
        episods: number of episodes to run
        gamma: discount factor applied to the bootstrapped next-step value
        alpha: TD learning rate
        epsilon: exploration parameter of the epsilon-greedy policy

    Return:
        Q, a defaultdict(np.ndarray)
        Q[state][action]: estimated expected return of the state-action pair
    '''
    n_actions = len(MDP.action_space)
    Q = defaultdict(lambda: np.zeros(n_actions))
    # The policy reads Q by reference, so it follows the estimates as they
    # are updated; epsilon stays fixed for the whole run.
    policy = improve_greedy_policy(Q, epsilon, n_actions)

    for i_episod in range(1, episods+1):
        if i_episod % 100000 == 0:
            print("\ri_episod:%d/%d"%(i_episod, episods), end="")
        s = MDP.reset()
        a = policy(s)
        done = False
        while not done:
            s_next, r, done, _ = MDP.step(a)
            if done:
                # Q(terminal, .) = 0, so the target is just the reward.
                td_target = r
            else:
                a_next = policy(s_next)
                # gamma discounts the bootstrapped value; the previous code
                # omitted it, silently ignoring the `gamma` argument.
                td_target = r + gamma * Q[s_next][a_next]
            Q[s][a] += alpha * (td_target - Q[s][a])
            if not done:
                s, a = s_next, a_next
    return Q