In [1]:
import gym
import numpy as np
from numba import jit
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
from contextlib import closing
from io import StringIO
from os import path
from typing import Optional

import numpy as np

from gym import Env, logger, spaces
from gym.envs.toy_text.utils import categorical_sample
from gym.error import DependencyNotInstalled

# Action indices. Each constant is the key under which CliffWalkingEnv stores
# the transitions for that move in its ``P[state]`` table.
UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3

class CliffWalkingEnv(Env):
    """Small 3x3 cliff-walking grid world with one stochastic cell.

    Layout in (row, col) coordinates, as configured in ``__init__``:
      * the start cell is (2, 0);
      * (2, 1) is the cliff: entering it sends the agent back to the start;
      * (1, 1) is the "above cliff" cell: entering it succeeds with probability
        ``1 - self.prob`` and otherwise sends the agent back to the start;
      * (2, 2), the last cell, is the terminal cell.

    The per-step "reward" values (10 / 15 / 30) are accumulated as a cost by
    the training loops in this notebook.

    NOTE(review): no ``render`` method is defined in this class body, so the
    ``render_mode == "human"`` branches rely on whatever the base class provides.
    """

    metadata = {
        "render_modes": ["human", "rgb_array", "ansi"],
        "render_fps": 4,
    }

    def __init__(self, render_mode: Optional[str] = None, init_mode = 'original'):
        """Build the transition table and the initial-state distributions.

        Args:
            render_mode: Stored on the instance; checked in ``step`` / ``reset``.
            init_mode: ``'original'`` always starts in the start cell;
                ``'uniform'`` spreads the start uniformly over every state
                except the last one.

        Raises:
            ValueError: If ``init_mode`` is neither ``'original'`` nor ``'uniform'``.
        """
        self.shape = (3, 3)
        self.start_state_index = np.ravel_multi_index((2, 0), self.shape)
        self.init_mode = init_mode
        self.nS = np.prod(self.shape)
        self.nA = 4
        
        # Define Cliff Location
        self._cliff = np.zeros(self.shape, dtype=bool)
        self._cliff[2, 1] = True
        
        
        # Define Locations above cliff
        self._abovecliff = np.zeros(self.shape, dtype=bool)
        self._abovecliff[1, 1] = True
        # Probability of being sent back to the start when entering an above-cliff cell
        self.prob = 0.2

        # Calculate transition probabilities and rewards.
        # P[s][a] is a list of (probability, next_state, reward, terminated) tuples.
        self.P = {}
        for s in range(self.nS):
            position = np.unravel_index(s, self.shape)
            self.P[s] = {a: [] for a in range(self.nA)}
            self.P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
            self.P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
            self.P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
            self.P[s][LEFT] = self._calculate_transition_prob(position, [0, -1])

        # Calculate initial state distribution
        # In 'original' mode we always start in cell (2, 0)
        if self.init_mode == 'original':
            self.initial_state_distrib = np.zeros(self.nS)
            self.initial_state_distrib[self.start_state_index] = 1.0
        
        elif self.init_mode == 'uniform':
            # Equal mass on every state except the last index, which gets zero
            self.initial_state_distrib = np.ones(self.nS) / (self.nS-1)
            self.initial_state_distrib[-1] = 0
        
        else:
            raise ValueError('Invalid Initial Mode')

        ## For testing purpose: reset_test() always starts from the start cell
        self.initial_state_distrib_test = np.zeros(self.nS)
        self.initial_state_distrib_test[self.start_state_index] = 1.0
        self.observation_space = spaces.Discrete(self.nS)
        self.action_space = spaces.Discrete(self.nA)
        self.render_mode = render_mode

        # pygame utils (placeholders; no method in this class body reads them)
        self.cell_size = (60, 60)
        self.window_size = (
            self.shape[1] * self.cell_size[1],
            self.shape[0] * self.cell_size[0],
        )
        self.window_surface = None
        self.clock = None
        self.elf_images = None
        self.start_img = None
        self.goal_img = None
        self.cliff_img = None
        self.mountain_bg_img = None
        self.near_cliff_img = None
        self.tree_img = None

    def _limit_coordinates(self, coord: np.ndarray) -> np.ndarray:
        """Prevent the agent from falling out of the grid world.

        Clamps ``coord`` to the grid bounds in place and returns the same array.
        """
        coord[0] = min(coord[0], self.shape[0] - 1)
        coord[0] = max(coord[0], 0)
        coord[1] = min(coord[1], self.shape[1] - 1)
        coord[1] = max(coord[1], 0)
        return coord

    def _calculate_transition_prob(self, current, delta):
        """Determine the possible outcomes of one move.

        Args:
            current: Current position on the grid as (row, col)
            delta: Change in position for transition

        Returns:
            A list of ``(probability, new_state, reward, terminated)`` tuples.
            The list has one entry with probability 1.0, except when the move
            lands on an above-cliff cell, where it has two entries
            (``1 - self.prob`` to stay there, ``self.prob`` to return to the start).
        """
        new_position = np.array(current) + np.array(delta)
        new_position = self._limit_coordinates(new_position).astype(int)
        new_state = np.ravel_multi_index(tuple(new_position), self.shape)
        
        # if fall off the cliff, go back to the start_state_index
        if self._cliff[tuple(new_position)]:
            return [(1.0, self.start_state_index, 30.0, False)]
        
        # if above the cliff, with some probability, it falls off or safe
        elif self._abovecliff[tuple(new_position)]:
            return [(1 - self.prob, new_state, 10.0, False), (self.prob, self.start_state_index, 30.0, False)]

        # hit the wall: the clamped position equals the current one.
        # This branch never reports termination, even on the terminal cell.
        elif tuple(new_position) == current:
            return [(1.0, new_state, 15.0, False)]
        

        # Ordinary move: terminated only when it lands on the bottom-right cell
        terminal_state = (self.shape[0] - 1, self.shape[1] - 1)
        is_terminated = tuple(new_position) == terminal_state
        

        return [(1.0, new_state, 10.0, is_terminated)]

    def step(self, a):
        """Sample one transition for action ``a`` from ``self.P`` and apply it.

        Returns:
            ``(next_state, reward, terminated, False, {"prob": p})`` where ``p``
            is the probability of the sampled transition. The truncation flag
            is always ``False``.
        """
        transitions = self.P[self.s][a]
        # The outcome index is drawn with np.random (NumPy's global RNG), not self.np_random
        i = np.random.choice(range(len(transitions)), 1, replace=True, p=[t[0] for t in transitions])[0]
        p, s, r, t = transitions[i]
        self.s = s
        self.lastaction = a

        if self.render_mode == "human":
            self.render()
        
        return (int(s), r, t, False, {"prob": p})

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        """Start a new episode from ``self.initial_state_distrib``.

        ``seed`` is forwarded to the base class, but the start state itself is
        drawn with ``np.random.choice`` (NumPy's global RNG). ``options`` is unused.

        Returns:
            ``(state, {"prob": 1})``.
        """
        super().reset(seed=seed)
        # original
        # self.s = categorical_sample(self.initial_state_distrib, self.np_random)

        self.s = np.random.choice(range(self.nS), 1, replace=True, p=self.initial_state_distrib)[0]
        self.lastaction = None

        if self.render_mode == "human":
            self.render()
        return int(self.s), {"prob": 1}

    def reset_test(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        """Start a new episode from ``self.initial_state_distrib_test``.

        That distribution puts all mass on the start cell regardless of
        ``init_mode``. Unlike ``reset``, the draw uses ``categorical_sample``
        with ``self.np_random``. ``options`` is unused.

        Returns:
            ``(state, {"prob": 1})``.
        """
        super().reset(seed=seed)
        self.s = categorical_sample(self.initial_state_distrib_test, self.np_random)
        self.lastaction = None

        if self.render_mode == "human":
            self.render()
        return int(self.s), {"prob": 1}

In [3]:
@jit(nopython=True)
def softmax_policy(s, theta):
    """Given a state, compute the probability of each action.

    Arguments
        s[integer]: Index of state
        theta[np.ndarray]: Policy parameter of shape (n_actions, n_states)

    Returns:
        The probability of each action at state s -> (n_action, )
    """
    s = int(s)
    logits = theta[:, s]
    # Subtracting the column maximum leaves the softmax mathematically unchanged
    # but keeps np.exp from overflowing to inf (which would turn the ratio into nan).
    shifted_exp = np.exp(logits - np.max(logits))
    return shifted_exp / np.sum(shifted_exp)

In [4]:
@jit(nopython=True)
def log_softmax_policy_grad(s, a, theta):
    """
    Gradient of log pi(a | s) with respect to every entry of theta.

    The result has the same shape as theta. Only column ``s`` is non-zero:
    entry (i, s) equals 1{i == a} - pi(i | s).
    """
    col = int(s)
    grad = np.zeros_like(theta)

    # Whole s-th column starts as -pi(. | s) ...
    grad[:, col] = -softmax_policy(col, theta)

    # ... and the row of the chosen action receives the extra +1.
    grad[a, col] += 1

    return grad

In [5]:
@jit(nopython=True)
def update_state_action_distribution(state_action_dist,  # previous distribution
                                     state, action, reward, next_state, next_action,  # Give a SARSA pair
                                     supports, gamma):
    """Given a SARSA pair, update the state-action distribution under the Bellman equation.

    The distribution stored at (next_state, next_action) is shifted by
    ``reward + gamma * z`` for every atom ``z``, projected back onto the fixed
    grid ``supports``, normalised, and blended into the entry at
    (state, action) with weight 0.1 (the old entry keeps weight 0.9).

    Arguments
        state_action_dist[np.ndarray]: indexed as [state, action, support]; modified in place
        state, action, reward, next_state, next_action: one observed transition
        supports[np.ndarray]: atom locations, assumed evenly spaced and increasing
        gamma[float]: discount factor

    Returns:
        The same ``state_action_dist`` array after the update.
    """
    vmin, vmax = supports[0], supports[-1]
    n_support = len(supports)
    last = n_support - 1
    dz = (vmax - vmin) / (n_support - 1)

    # Tiny positive floor so the normalisation below never divides by zero
    probs = np.zeros(n_support) + 1e-16

    next_dist = state_action_dist[next_state, next_action]
    shifted_supports = reward + gamma * supports

    for j in range(n_support):
        mass = next_dist[j]

        if shifted_supports[j] < vmin:
            probs[0] += mass

        elif shifted_supports[j] > vmax:
            probs[last] += mass

        else:
            b = (shifted_supports[j] - vmin) / dz
            l = int(np.floor(b))
            u = int(np.ceil(b))

            if u > last:
                # Round-off can push b marginally above the top index; without
                # this guard probs[u] would be an out-of-bounds write, which
                # nopython mode does not check by default.
                probs[last] += mass
            elif l == u:
                # b falls exactly on an atom: all of the mass goes there
                probs[l] += mass
            else:
                # Split the mass linearly between the two neighbouring atoms
                probs[l] += mass * (u - b)
                probs[u] += mass * (b - l)

    probs = probs / np.sum(probs)
    state_action_dist[state, action] = 0.9 * state_action_dist[state, action] + 0.1 * probs

    return state_action_dist

In [6]:
def policy_evaluation(vmin, vmax, n_support, gamma,
                      policy, theta, shape,
                      episode, steps):
    """Estimate the state-action return distribution of ``policy`` from rollouts.

    Runs ``episode`` episodes of at most ``steps`` steps in a
    ``CliffWalkingEnv(init_mode='original')`` and applies
    ``update_state_action_distribution`` after every transition.

    Arguments
        vmin, vmax, n_support: define the atom grid ``np.linspace(vmin, vmax, n_support)``
        gamma[float]: discount factor
        policy: callable ``policy(state, theta)`` returning action probabilities
        theta[np.ndarray]: policy parameters
        shape: grid shape; only used for the number of states ``shape[0] * shape[1]``
            (it must match the environment, which defines its own grid)
        episode[int]: number of episodes to roll out
        steps[int]: maximum number of steps per episode

    Returns:
        ``(state_action_dist, trajectorys)``: the array indexed
        [state, action, support] and a list with exactly one flat array
        ``(s_0, a_0, r_0, s_1, ..., s_T)`` per episode.
    """
    supports = np.linspace(vmin, vmax, n_support)
    n_state = shape[0] * shape[1]
    n_action = 4

    # Uniform initial guess everywhere, except the last state whose distribution
    # is a point mass on the first atom for every action.
    state_action_dist = np.ones((n_state, n_action, n_support)) / n_support
    state_action_dist[-1, :, 0] = 1.0      # last state, all actions, first cost
    state_action_dist[-1, :, 1:] = 0.0     # other points are zeros

    trajectorys = []

    # One environment is enough: reset() starts a fresh episode each time.
    train_env = CliffWalkingEnv(init_mode='original')

    # Start Several Games for Policy Evaluation
    for _ in range(episode):
        state, _ = train_env.reset()
        trajectory = [state]

        for _step in range(steps):
            action = np.random.choice(range(n_action), 1, replace=True, p=policy(state, theta))[0]
            next_state, reward, done, _, _ = train_env.step(action)

            trajectory.append(action)
            trajectory.append(reward)
            trajectory.append(next_state)

            # SARSA-style target: sample the follow-up action from the same policy
            next_action = np.random.choice(range(n_action), 1, replace=True, p=policy(next_state, theta))[0]

            state_action_dist = update_state_action_distribution(state_action_dist,
                                                                 state, action, reward, next_state, next_action,
                                                                 supports, gamma)
            state = next_state

            if done:
                break

        # Record each episode exactly once, whether it terminated or ran out of
        # steps (terminated episodes used to be appended twice).
        trajectorys.append(np.array(trajectory))

    return state_action_dist, trajectorys

In [7]:
@jit(nopython=True)
def state_value_distribution(state_action_dist, s, policy, theta):
    """Mix the per-action return distributions at state ``s`` into one state-value distribution.

    ``state_action_dist`` is indexed [state, action, support]. The result is
    the policy-weighted average over actions, renormalised to sum to one,
    with one entry per support atom.
    """
    idx = int(s)

    # (action,) -> (action, 1) so it broadcasts against (action, support)
    action_probs = policy(idx, theta).reshape(-1, 1)

    # Weight every action's distribution by its probability, then sum over actions
    weighted = state_action_dist[idx] * action_probs
    mixture = np.sum(weighted, axis=0)

    return mixture / np.sum(mixture)

In [8]:
def sample_trajectory(env, policy, theta, path_length):
    """Roll out ``policy`` in ``env`` and return the flat trajectory.

    The returned array is laid out as (s_0, a_0, r_0, s_1, a_1, r_1, ..., s_T).
    Sampling stops when the environment reports termination or once the list
    holds more than ``3 * path_length`` entries.
    """
    state, _ = env.reset()
    rollout = [state]

    while len(rollout) <= 3 * path_length:
        action_probs = policy(state, theta)
        action = np.random.choice(range(env.action_space.n), 1, replace=True, p=action_probs)[0]
        state, reward, done, truncated, info = env.step(action)
        rollout.extend((action, reward, state))

        if done:
            break

    return np.array(rollout)

In [9]:
@jit(nopython=True)
def pushforward_one_measure_one_pair(measure, r, gamma, supports):
    """Push one discrete measure through ``z -> r + gamma * z`` and project it back.

    ``measure[j]`` is the weight on atom ``supports[j]``. Each atom is moved to
    ``r + gamma * supports[j]``; weight landing outside [supports[0], supports[-1]]
    is placed on the nearest end atom, and weight landing between two atoms is
    split linearly between them. Weights may be signed and are not renormalised.

    Arguments
        measure[np.ndarray]: weights, one per atom
        r[float]: reward received
        gamma[float]: discount factor
        supports[np.ndarray]: atom locations, assumed evenly spaced and increasing

    Returns:
        A new array of projected weights with one entry per atom.
    """
    vmin, vmax = supports[0], supports[-1]
    n_support = len(supports)
    last = n_support - 1
    dz = (vmax - vmin) / (n_support - 1)

    weights = np.zeros(n_support)  # Initialization
    shifted_supports = r + gamma * supports

    for j in range(n_support):
        mass = measure[j]

        if shifted_supports[j] < vmin:
            weights[0] += mass

        elif shifted_supports[j] > vmax:
            weights[last] += mass

        else:
            b = (shifted_supports[j] - vmin) / dz
            l = int(np.floor(b))
            u = int(np.ceil(b))

            if u > last:
                # Round-off can push b marginally above the top index; without
                # this guard weights[u] would be an out-of-bounds write, which
                # nopython mode does not check by default.
                weights[last] += mass
            elif l == u:
                # b is exactly an atom index: the whole weight goes there
                weights[l] += mass
            else:
                weights[l] += mass * (u - b)
                weights[u] += mass * (b - l)

    return weights

In [10]:
@jit(nopython=True)
def pushforward_all_measure_one_pair(measures, r, gamma, supports):
    """Push forward every measure stored in a 3-D array by one reward.

    ``measures`` has shape (d0, d1, n_support); each ``measures[i, j]`` is one
    discrete measure over ``supports`` and is passed through
    ``pushforward_one_measure_one_pair`` with the same ``r`` and ``gamma``.

    The input array is left untouched and a new array of the same shape is
    returned, so callers can safely reuse a cached ``measures`` afterwards.
    """
    dim0 = measures.shape[0]
    dim1 = measures.shape[1]
    pushed = np.empty_like(measures)
    for i in range(dim0):
        for j in range(dim1):
            pushed[i, j] = pushforward_one_measure_one_pair(measures[i, j], r, gamma, supports)
    return pushed

In [11]:
@jit(nopython=True)
def pushforward_one_trajectory(measures, trajectory, gamma, supports):
    """Push a whole measure array backwards along the rewards of a trajectory.

    ``trajectory`` is the flat prefix (s_0, a_0, r_0, ..., s_{k-1}, a_{k-1}, r_{k-1})
    without the final state. The rewards are applied last-to-first, so an atom
    ``z`` ends up at r_0 + gamma * r_1 + ... + gamma^k * z. An empty trajectory
    returns ``measures`` as given.

    Whether the input array itself is modified depends on
    ``pushforward_all_measure_one_pair``; callers that reuse ``measures``
    should pass a copy.
    """
    # Rewards occupy the last slot of every (s, a, r) triple, so walking the
    # flat array backwards in strides of 3 visits r_{k-1}, ..., r_0.
    for idx in range(len(trajectory) - 1, -1, -3):
        measures = pushforward_all_measure_one_pair(measures, trajectory[idx], gamma, supports)

    return measures

In [12]:
@jit(nopython=True)
def state_value_dist_grad(state_action_dist, trajectory, policy, log_policy_grad, theta, gamma, supports):
    """Given a trajectory, estimate the gradient of the state value distribution.

    Arguments
        state_action_dist[np.ndarray]: indexed [state, action, support]
        trajectory[np.ndarray]: flat (s_0, a_0, r_0, s_1, ..., s_T)
        policy: callable ``policy(s, theta)`` -> action probabilities
        log_policy_grad: callable ``log_policy_grad(s, a, theta)`` -> array shaped like theta
        theta[np.ndarray]: policy parameters of shape (n_actions, n_states)
        gamma[float]: discount factor
        supports[np.ndarray]: atom locations

    Returns:
        Array of shape (n_actions, n_states, n_support): for every entry of
        theta, the estimated derivative of the probability on each atom.
    """
    n_action, n_state = theta.shape
    n_support = len(supports)

    # Step 1: for every state, the log-policy-gradient weighted state value
    # distribution: sum_a pi(a|s) * grad log pi(a|s) (x) Z(s, a)
    log_grad_weight_state_dist = []
    for s in range(n_state):
        probs = policy(s, theta)
        res = np.zeros((n_action, n_state, n_support))

        for a in range(n_action):
            res = res + probs[a] * log_policy_grad(s, a, theta)[:, :, np.newaxis] * state_action_dist[s, a].reshape(1, 1, -1) # broadcast
        log_grad_weight_state_dist.append(res)

    # Step 2: Given a trajectory, accumulate the contribution of every visited state
    total = np.zeros((n_action, n_state, n_support), dtype=np.float64)

    # For each time step, the prefix looks like: (s_0), (s_0, a_0, r_0, s_1), (s_0, a_0, r_0, s_1, a_1, r_1, s_2) ....
    for i in range((len(trajectory)-1) // 3 + 1):
        sub_trajectory = trajectory[:3*i+1]
        s = int(sub_trajectory[-1]) # The state whose measure is pushed forward

        # Work on a copy: the pushforward may overwrite its input, and the same
        # state can be visited several times along one trajectory. Without the
        # copy a revisit would start from an already pushed-forward measure.
        start_measure = log_grad_weight_state_dist[s].copy()

        # The pushforward input must not include the final state, i.e.
        # (), (s_0, a_0, r_0), (s_0, a_0, r_0, s_1, a_1, r_1), ...
        total = total + pushforward_one_trajectory(start_measure, sub_trajectory[:-1], gamma, supports)

    return total

In [13]:
def CVaR_policy_gradient(state_action_dist, trajectory,
                         policy, log_policy_grad, theta, gamma, 
                         supports, alpha):
    """Estimate the CVaR policy gradient from one trajectory.

    Arguments
        state_action_dist[np.ndarray]: indexed [state, action, support]
        trajectory[np.ndarray]: flat (s_0, a_0, r_0, s_1, ..., s_T)
        policy, log_policy_grad: callables forwarded to the helper functions
        theta[np.ndarray]: policy parameters of shape (n_actions, n_states)
        gamma[float]: discount factor
        supports[np.ndarray]: atom locations
        alpha[float]: CVaR level; the tail is where the CDF at s_0 exceeds alpha

    Returns:
        Array shaped like theta:
        ``sum_{i in tail} dP_i/dtheta * (z_i - q_alpha) / (1 - alpha)``.
    """
    # Step 1: Compute the distribution at initial state
    init_state = int(trajectory[0])
    state_value_dist = state_value_distribution(state_action_dist, init_state, policy, theta)

    # Step 2: Compute the alpha-quantile and filter out tail supports
    if_tail = np.cumsum(state_value_dist) > alpha
    tail_supports = supports[if_tail]
    q_alpha = tail_supports[0]

    # Step 3: Compute the gradient: n_actions x n_states x n_support
    sa_grad = state_value_dist_grad(state_action_dist, trajectory, policy, log_policy_grad, theta, gamma, supports)

    # Step 4: Compute CVaR policy gradient.
    # The gradient slices must be taken at the tail atoms' own indices; indexing
    # with 0..len(tail)-1 would weight the lowest atoms instead.
    cvar_grad = sa_grad[:, :, if_tail] @ (tail_supports - q_alpha)

    return cvar_grad / (1 - alpha)

In [14]:
def Expectation_policy_gradient(state_action_dist, trajectory, 
                                policy, log_policy_grad, theta, gamma, 
                                supports):
    """Estimate the policy gradient of the expected return from one trajectory.

    Arguments
        state_action_dist[np.ndarray]: indexed [state, action, support]
        trajectory[np.ndarray]: flat (s_0, a_0, r_0, s_1, ..., s_T)
        policy, log_policy_grad: callables forwarded to ``state_value_dist_grad``
        theta[np.ndarray]: policy parameters of shape (n_actions, n_states)
        gamma[float]: discount factor
        supports[np.ndarray]: atom locations

    Returns:
        Array shaped like theta: ``sum_i dP_i/dtheta * z_i`` over all atoms.
    """
    # Gradient of every atom's probability: n_actions x n_states x n_support
    sa_grad = state_value_dist_grad(state_action_dist, trajectory, policy, log_policy_grad, theta, gamma, supports)

    # Contract the support axis with the atom values
    return sa_grad @ supports

In [15]:
def sample_based_CVaR_gradient(trajectorys, policy, log_policy_grad, theta, gamma, alpha):
    """
    Sample-based (likelihood-ratio) estimate of the CVaR policy gradient.

    trajectorys: a list of trajectorys -> must be sampled at the same time
        (each one is a flat array (s_0, a_0, r_0, s_1, ..., s_T))
    policy: unused here; kept for a signature parallel to the other estimators
    log_policy_grad: callable ``log_policy_grad(s, a, theta)`` -> array shaped like theta
    theta: policy parameters
    gamma: discount factor
    alpha: quantile level; only trajectories whose discounted return exceeds
        the empirical alpha-quantile contribute

    Returns an array shaped like theta. It is all zeros when no trajectory
    exceeds the quantile or when ``trajectorys`` is empty.

    NOTE(review): the sum is divided by the number of trajectories only, not
    by ``(1 - alpha)`` as in ``CVaR_policy_gradient``; the training loop
    normalises the gradient, so only the direction matters there.
    """
    grad = np.zeros(np.shape(theta), dtype=float)
    n_traj = len(trajectorys)
    if n_traj == 0:
        return grad

    ## Discounted return of every trajectory (rewards sit at indices 2, 5, 8, ...)
    z_ls = []
    for trajectory in trajectorys:
        rewards = np.asarray(trajectory[2::3], dtype=float)
        discount = gamma ** np.arange(len(rewards))
        z_ls.append(np.sum(discount * rewards))

    ## alpha-quantile
    q_alpha = np.quantile(z_ls, alpha, method='inverted_cdf')

    ## Gradient of the trajectory log-probability: sum of grad log pi(a_t | s_t)
    def score_sum(trajectory):
        sub_trajectory = trajectory[:-1]  # drop the final state
        total = np.zeros(np.shape(theta), dtype=float)
        for s, a in zip(sub_trajectory[::3], sub_trajectory[1::3]):
            total += log_policy_grad(s, int(a), theta)
        return total

    # Compute the gradient from the tail trajectories only
    for z, trajectory in zip(z_ls, trajectorys):
        if z > q_alpha:
            grad += score_sum(trajectory) * (z - q_alpha) / n_traj

    return grad

In [79]:
def train(theta, lr, alpha=0.9, episode=5000, 
          vmin=0, vmax=2000, n_support=51, 
          gamma=0.95, policy=softmax_policy, log_policy_grad=log_softmax_policy_grad, 
          steps = 100, path_length=50, 
          num_train=50, shape=(3, 3)):
    """Train the policy with the distribution-based CVaR policy gradient.

    Each of the ``num_train`` iterations (1) runs a greedy test rollout of at
    most 10 steps and prints a report on every second iteration, (2) estimates
    the state-action return distribution with ``policy_evaluation``, and
    (3) takes one normalised gradient step of size ``lr`` on ``theta``.

    Arguments
        theta[np.ndarray]: policy parameters; updated in place
        lr[float]: step length (the gradient is scaled to unit norm first)
        alpha[float]: CVaR level
        episode[int]: episodes per policy evaluation
        vmin, vmax, n_support: atom grid of the return distribution
        gamma[float]: discount factor
        policy, log_policy_grad: policy and its log-gradient callables
        steps[int]: maximum steps per evaluation episode
        path_length: unused in this function
        num_train[int]: number of training iterations
        shape: grid shape forwarded to ``policy_evaluation``

    Returns:
        ``(theta, theta_ls, grad_ls)``: the final parameters, a copy of the
        parameters before training and after every iteration, and the averaged
        gradient of every iteration.
    """
    supports = np.linspace(vmin, vmax, n_support)
    theta_ls = [np.copy(theta)]
    grad_ls = []
    
    for n_train in tqdm(range(num_train)):
        ###################### Test Starts ######################
        total_cost = 0
        path = []
        env = CliffWalkingEnv(init_mode='original')
        # reset() is overridden by reset_test() right after; it is kept so the
        # global NumPy RNG is consumed exactly as before.
        env.reset()
        state, _ = env.reset_test()
        for n_step in range(10):
            path.append(state)
            action = np.argmax(policy(state, theta))  # greedy action
            state, reward, done, truncated, info = env.step(action)
            total_cost += reward
            if done:
                path.append(state)
                break
        
        if (n_train+1) % 2 == 0:
            print('Verbose One Episode Policy: \n')
            print(f'State 6: {policy(6, theta)}')
            print(f'State 3: {policy(3, theta)}')
            print(f'State 0: {policy(0, theta)}')
            print(f'State 1: {policy(1, theta)}')
            print(f'State 2: {policy(2, theta)}')
            print(f'State 4: {policy(4, theta)}')
            print(f'State 5: {policy(5, theta)}')
        
            if done:
                print(f'Path is {path}, Total Cost is: {total_cost}, The Goal is Reached \n')
            
            else:
                print(f'Path is {path}, Total Cost is: {total_cost}, The Goal is NOT Reached \n')

        env.close()
        ###################### Test Ends ######################

        ###################### Policy Evaluation Starts ######################
        state_action_dist, trajectorys = policy_evaluation(vmin, vmax, n_support, gamma, 
                                              policy, theta, shape, 
                                              episode, steps)
        ###################### Policy Evaluation Ends ######################

        ###################### Policy Improvement Starts ######################
        grad = 0
        for trajectory in trajectorys:
            grad += CVaR_policy_gradient(state_action_dist, trajectory, 
                                        policy, log_policy_grad, theta, gamma, 
                                        supports, alpha)
        grad /= len(trajectorys)

        # Normalise the step for a fair comparison between methods. A zero
        # gradient is skipped: dividing by a zero norm would fill theta with NaN.
        grad_norm = np.linalg.norm(grad)
        if grad_norm > 0:
            theta -= lr * (grad / grad_norm)
        ###################### Policy Improvement Ends ######################
        
        # Storage
        theta_ls.append(np.copy(theta))
        grad_ls.append(grad)
        print(grad_norm)
    
    return theta, theta_ls, grad_ls

In [95]:
def sampled_based_train(theta, lr, alpha=0.9, episode=5000, 
                        gamma=0.95, policy=softmax_policy, log_policy_grad=log_softmax_policy_grad, 
                        path_length=50, num_train=50, shape=(3, 3)):
    """Train the policy with the sample-based CVaR gradient estimator.

    Every iteration runs a greedy test rollout of at most 10 steps (reported on
    every second iteration), samples ``episode`` trajectories, and takes one
    normalised step of size ``lr`` on ``theta`` (updated in place). ``shape``
    is not used.

    Returns ``(theta, theta_ls, grad_ls)``: the final parameters, a copy of the
    parameters before training and after every iteration, and the gradient
    estimate of every iteration.
    """
    theta_history = [np.copy(theta)]
    grad_history = []

    for iteration in tqdm(range(num_train)):

        ################################# Test Cost #################################
        test_env = CliffWalkingEnv(init_mode='original')
        test_env.reset()
        state, _ = test_env.reset_test()

        visited = []
        total_cost = 0
        for _ in range(10):
            visited.append(state)
            greedy_action = np.argmax(policy(state, theta))
            state, reward, reached_goal, truncated, info = test_env.step(greedy_action)
            total_cost += reward
            if reached_goal:
                visited.append(state)
                break

        report_now = (iteration + 1) % 2 == 0
        if report_now:
            print('Verbose One Episode Policy: \n')
            print(f'State 6: {policy(6, theta)}')
            print(f'State 3: {policy(3, theta)}')
            print(f'State 0: {policy(0, theta)}')
            print(f'State 1: {policy(1, theta)}')
            print(f'State 2: {policy(2, theta)}')
            print(f'State 4: {policy(4, theta)}')
            print(f'State 5: {policy(5, theta)}')

            if not reached_goal:
                print(f'Path is {visited}, Total Cost is: {total_cost}, The Goal is NOT Reached \n')
            else:
                print(f'Path is {visited}, Total Cost is: {total_cost}, The Goal is Reached \n')

        test_env.close()

        ################################# Policy Evaluation #################################
        sampled = [sample_trajectory(CliffWalkingEnv(init_mode='original'), policy, theta, path_length) for _ in range(episode)]
        grad = sample_based_CVaR_gradient(sampled, policy, log_policy_grad, theta, gamma, alpha)

        # Unit-norm step; a zero gradient is applied as is (no change to theta)
        grad_norm = np.linalg.norm(grad)
        direction = grad / grad_norm if grad_norm > 0 else grad
        theta -= lr * direction

        print(grad_norm)
        theta_history.append(np.copy(theta))
        grad_history.append(grad)

    return theta, theta_history, grad_history

In [197]:
# Seed NumPy's global RNG so the initial parameters drawn below are reproducible.
np.random.seed(49)
# Shared initial policy parameters of shape (4, 9): uniform draws on [-5, 5)
# shifted by +1. The experiment cells below each train on a copy of it.
THETA = np.random.uniform(-5, 5, (4, 9)) + 1.0

# CDPG

## 100 trajectories

In [None]:
# 5 repeated runs of `train` with episode=100 and num_train=100, each starting
# from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
our_100_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=100, num_train=100)
    our_100_theta_formal.append(theta_ls)

## 150 trajectories

In [None]:
# 5 repeated runs of `train` with episode=150 and num_train=100, each starting
# from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
our_150_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=150, num_train=100)
    our_150_theta_formal.append(theta_ls)

## 200 trajectories

In [None]:
# 5 repeated runs of `train` with episode=200 and num_train=100, each starting
# from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
our_200_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=200, num_train=100)
    our_200_theta_formal.append(theta_ls)

## 500 trajectories

In [None]:
# 5 repeated runs of `train` with episode=500 and num_train=100, each starting
# from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
our_500_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=500, num_train=100)
    our_500_theta_formal.append(theta_ls)

## 1000 trajectories

In [None]:
# 5 repeated runs of `train` with episode=1000 and num_train=100, each starting
# from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
our_1000_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=1000, num_train=100)
    our_1000_theta_formal.append(theta_ls)

## 2000 trajectories

In [None]:
# 5 repeated runs of `train` with episode=2000 and num_train=100, each starting
# from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
our_2000_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=2000, num_train=100)
    our_2000_theta_formal.append(theta_ls)

## 3000 trajectories

In [None]:
# 5 repeated runs of `train` with episode=3000 and num_train=100, each starting
# from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
our_3000_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=3000, num_train=100)
    our_3000_theta_formal.append(theta_ls)

## 5000 trajectories

In [None]:
# 5 repeated runs of `train` with episode=5000 and num_train=100, each starting
# from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
our_5000_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=5000, num_train=100)
    our_5000_theta_formal.append(theta_ls)

# SPG

## 100 trajectories

In [None]:
# 5 repeated runs of `sampled_based_train` with episode=100 and num_train=100, each
# starting from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
other_100_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = sampled_based_train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=100, num_train=100)
    other_100_theta_formal.append(theta_ls)

## 150 trajectories

In [None]:
# 5 repeated runs of `sampled_based_train` with episode=150 and num_train=100, each
# starting from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
other_150_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = sampled_based_train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=150, num_train=100)
    other_150_theta_formal.append(theta_ls)

## 200 trajectories

In [None]:
# 5 repeated runs of `sampled_based_train` with episode=200 and num_train=100, each
# starting from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
other_200_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = sampled_based_train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=200, num_train=100)
    other_200_theta_formal.append(theta_ls)

## 500 trajectories

In [None]:
# 5 repeated runs of `sampled_based_train` with episode=500 and num_train=100, each
# starting from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
other_500_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = sampled_based_train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=500, num_train=100)
    other_500_theta_formal.append(theta_ls)

## 1000 trajectories

In [None]:
# 5 repeated runs of `sampled_based_train` with episode=1000 and num_train=100, each
# starting from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
other_1000_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = sampled_based_train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=1000, num_train=100)
    other_1000_theta_formal.append(theta_ls)

## 2000 trajectories

In [None]:
# 5 repeated runs of `sampled_based_train` with episode=2000 and num_train=100, each
# starting from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
other_2000_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = sampled_based_train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=2000, num_train=100)
    other_2000_theta_formal.append(theta_ls)

## 3000 trajectories

In [None]:
# 5 repeated runs of `sampled_based_train` with episode=3000 and num_train=100, each
# starting from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
other_3000_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = sampled_based_train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=3000, num_train=100)
    other_3000_theta_formal.append(theta_ls)

## 5000 trajectories

In [None]:
# 5 repeated runs of `sampled_based_train` with episode=5000 and num_train=100, each
# starting from a fresh copy of THETA. The parameter history (theta_ls) of every run is kept.
other_5000_theta_formal = []

for num in range(5):
    print(f'Number Of Iteration:: {num}')
    theta, theta_ls, grad_ls = sampled_based_train(theta=np.copy(THETA), lr=0.2, alpha=0.9, episode=5000, num_train=100)
    other_5000_theta_formal.append(theta_ls)