# Approximate Temporal Difference

Here we approximate the action-value function with a model that is linear in a feature vector of the state $x$ and the action $u$:

$$
Q(x, u) = \phi^T \mu(x, u)
$$

- To estimate the weight vector $\phi$, we use stochastic gradient descent (SGD).
- For the feature map $\mu$, we use `RBFSampler` from `sklearn.kernel_approximation`.

**Note:** In its current form, this approach does not seem to work.

In [None]:
from typing import Any

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
from sklearn.kernel_approximation import RBFSampler
from tqdm import tqdm

from Lib.BasicPolicyRelatedTools import EpsilonGreedyPolicy

In [None]:
class QLinModel:
    """Linear action-value approximator ``Q(x, u) = phi^T mu(x, u)``.

    ``mu`` is an ``RBFSampler`` feature map fitted on state-action vectors
    (the state with the action index appended) collected by acting randomly
    in the environment. ``phi`` is the weight vector, initialised to zero and
    adjusted by ``update_step``.
    """

    def __init__(self, env: "gym.Env", policy: Any, alpha=0.05,
                 n_sample_episodes=10000):
        """Fit the feature map on random-play samples and zero the weights.

        Args:
            env: Environment with a discrete action space (``action_space.n``
                is read) and array-like 1-D observations.
            policy: Object exposing ``get_action(action_values)``; it receives
                the list of Q-values, one per action.
            alpha: Step size used by ``update_step``.
            n_sample_episodes: Number of random-action episodes used to
                collect the samples the feature map is fitted on.
        """
        samples = self.gather_samples(env, n_sample_episodes)
        self.mu = RBFSampler()
        self.mu.fit(samples)

        self.n_mu = self.mu.n_components
        self.n_action = env.action_space.n

        self.phi = np.zeros(self.n_mu)
        self.policy = policy
        self.alpha = alpha

    def _features(self, state, action):
        """Return ``mu(state, action)`` as a 1-D feature vector."""
        state_action_vec = np.concatenate((state, [action]))
        return self.mu.transform([state_action_vec])[0]

    def _all_action_features(self, state):
        """Return the feature matrix with one row per action index."""
        actions = np.arange(self.n_action)
        tiled_state = np.tile(np.asarray(state), (self.n_action, 1))
        # One batched transform instead of one transform call per action.
        return self.mu.transform(np.column_stack((tiled_state, actions)))

    def get_action_values(self, state):
        """Return a list holding Q(state, a) for every action index ``a``."""
        return list(self._all_action_features(state) @ self.phi)

    def get_action_value(self, state, action):
        """Return the scalar estimate Q(state, action)."""
        return self.phi.dot(self._features(state, action))

    def get_max_action_value(self, state):
        """Return the largest Q(state, a) over all actions.

        Returns ``-inf`` when the model has no actions.
        """
        return max(self.get_action_values(state), default=-np.inf)

    def get_action(self, state):
        """Let the policy choose an action from the Q-values of ``state``."""
        return self.policy.get_action(self.get_action_values(state))

    def update_step(self, state, action, target):
        """Move ``phi`` one step of size ``alpha`` towards ``target``.

        ``target`` is supplied by the caller, e.g.
        ``r + gamma * max_a Q(x', a)``.
        """
        # The model is linear in phi, so the gradient of Q with respect to
        # phi is the feature vector itself; compute it once and reuse it.
        features = self._features(state, action)
        td_error = target - self.phi.dot(features)
        self.phi += self.alpha * td_error * features

    def gather_samples(self, env, n_episodes=1000):
        """Collect state-action vectors by playing random actions.

        Args:
            env: Environment to sample from.
            n_episodes: Number of episodes to play.

        Returns:
            A list with one ``[state..., action]`` array per step taken.
        """
        samples = []
        for _ in tqdm(range(n_episodes), desc="Gathering samples...   "):
            s, _ = env.reset()
            terminated = truncated = False
            # An episode ends on either flag returned by ``env.step``;
            # checking only ``terminated`` would keep stepping an episode
            # that was cut off by truncation.
            while not (terminated or truncated):
                a = env.action_space.sample()
                samples.append(np.concatenate((s, [a])))
                s, _, terminated, truncated, _ = env.step(a)
        return samples

    def get_grad(self, state, action):
        """Return the gradient of Q(state, action) with respect to ``phi``."""
        return self._features(state, action)

In [None]:
# Training environment plus the linear Q-model that learns on it.
# The policy argument 0.1 is presumably the exploration rate (epsilon);
# verify against Lib.BasicPolicyRelatedTools.
env = gym.make("CartPole-v1", render_mode="rgb_array")
Q = QLinModel(env, policy=EpsilonGreedyPolicy(0.1), alpha=0.2)

In [None]:
def train_the_model(model: "QLinModel", env, n_iter, gamma=0.9):
    """Train ``model`` with Q-learning for ``n_iter`` episodes.

    Each step uses the target ``r + gamma * max_a Q(s', a)``, or just ``r``
    when the step terminated the episode.

    Args:
        model: Model providing ``get_action``, ``get_max_action_value`` and
            ``update_step``.
        env: Environment whose ``step`` returns
            ``(obs, reward, terminated, truncated, info)``.
        n_iter: Number of episodes to run.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        Tuple ``(model, episode_reward, episode_time_steps)``; the two arrays
        have length ``n_iter`` and hold the summed reward and the number of
        steps of each episode.
    """
    max_steps = 10000  # hard cap on the length of a single episode

    episode_reward = np.zeros(n_iter)
    episode_time_steps = np.zeros(n_iter)

    for episode in tqdm(range(n_iter), desc="Training...   "):
        s, _ = env.reset()
        done = False
        trunc = False
        while not done and not trunc and episode_time_steps[episode] < max_steps:
            a = model.get_action(s)
            s_next, r, done, trunc, _ = env.step(a)

            if done:
                # A terminal state has no future return to bootstrap from.
                target = r
            else:
                # A truncated episode is not terminal, so it still bootstraps.
                target = r + gamma * model.get_max_action_value(s_next)
            model.update_step(s, a, target)

            episode_reward[episode] += r
            episode_time_steps[episode] += 1

            s = s_next

    return model, episode_reward, episode_time_steps

def run_simulation(model: "QLinModel", env):
    """Run one episode of ``env`` driven by ``model``, rendering every step.

    Args:
        model: Object whose ``get_action(state)`` returns the action to take.
        env: Environment whose ``step`` returns
            ``(obs, reward, terminated, truncated, info)``.
    """
    s, _ = env.reset()
    done = False
    trunc = False
    while not done and not trunc:
        a = model.get_action(s)
        # Keep the new observation: without it every action would be chosen
        # from the initial state.
        s, _, done, trunc, _ = env.step(a)
        env.render()

In [None]:
# Ten training rounds with a step size that starts at 0.6 and shrinks by
# 10% after each round, never dropping below 0.4.
Q.alpha = 0.6
for i in range(10):
    # 150 episodes whose statistics are discarded, then 50 more (still
    # learning) whose rewards are averaged for the progress report.
    Q, _, _ = train_the_model(Q, env, 150)
    Q, rewards, _ = train_the_model(Q, env, 50)
    print(f"{i+1}. mean reward : {np.mean(rewards)}, alpha:{Q.alpha}")
    Q.alpha = np.maximum(0.4, Q.alpha * 0.9)

In [None]:
# Watch the trained model play one episode in a window. The environment is
# closed afterwards so the render window and its resources are released
# even if the simulation raises.
render_env = gym.make("CartPole-v1", render_mode="human")
try:
    run_simulation(Q, render_env)
finally:
    render_env.close()