On-policy MC control
----------------------
---------------------

In [1]:
def onpolicy_control(env, nS, nA, num_episodes, gamma, epsilon):
    """Run on-policy first-visit Monte Carlo control with an epsilon-greedy policy.

    Every iteration samples one complete episode with a policy that is
    epsilon-greedy with respect to the current ``Q``, moves ``Q`` towards the
    average first-visit return of each state-action pair seen in that episode,
    re-derives the epsilon-greedy policy and records its average evaluation
    reward.

    The helpers ``random_policy``, ``epsilon_greedy_policy`` and
    ``evaluate_average_reward`` are defined elsewhere in the file; policies are
    assumed to be tables indexable as ``policy[state][action]`` that hold
    action probabilities (that is how they are used here).

    Args:
        env: Environment whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``. ``reset()``
            is assumed to return ``(state, info)`` (Gymnasium-style API,
            matching the five-value ``step``). States index rows of ``Q``.
        nS: Number of states (rows of ``Q``).
        nA: Number of actions (columns of ``Q``).
        num_episodes: Number of training episodes, one policy iteration each.
        gamma: Discount factor applied to future rewards.
        epsilon: Exploration rate handed to ``epsilon_greedy_policy``.

    Returns:
        ``(Q, policy, num_policy_iter, average_rewards_lists)`` where ``Q`` is
        the ``(nS, nA)`` action-value table, ``policy`` the last policy that
        was evaluated, ``num_policy_iter`` the list ``[0, 1, ..., num_episodes]``
        and ``average_rewards_lists`` the evaluation reward recorded at each of
        those iterations.
    """
    # initialize action-value function
    Q = np.zeros([nS, nA])

    # list of possible actions to be taken by the agent
    A = list(range(nA))

    # number of first-visit returns averaged into each Q entry; lets the mean
    # be updated incrementally instead of storing every return
    visit_counts = np.zeros([nS, nA])

    # the starting point of the learning curve is a random policy
    policy = random_policy(nS, nA)
    average_reward = evaluate_average_reward(env, nA, policy, num_eval_episodes=100)

    # store number of policy iterations and average episode reward during training
    num_policy_iter = [0]
    average_rewards_lists = [average_reward]

    # loop over number of episodes of experience sampled by agent
    for n in range(1, num_episodes + 1):
        # Q is not modified while an episode is being generated, so the
        # behaviour policy only has to be derived once per episode; it must use
        # the caller's epsilon, not a fixed constant
        policy = epsilon_greedy_policy(Q, nS, nA, epsilon)

        # start from a freshly reset environment instead of assuming that the
        # evaluation helper left it in state 0
        state, _ = env.reset()

        # states[t], actions[t], rewards[t] describe time step t of the episode
        states = []
        actions = []
        rewards = []

        # loop until episode terminates or truncates
        while True:
            # pick action according to the current policy
            action = random.choices(A, weights=[policy[state][i] for i in range(nA)], k=1)[0]

            # take action
            next_state, reward, terminated, truncated, _ = env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)

            if terminated or truncated:
                break

            state = next_state

        # kept from the original flow: the evaluation below is handed a reset
        # environment
        env.reset()

        # time step at which each state-action pair first occurs in the episode
        first_visit = {}
        for t, pair in enumerate(zip(states, actions)):
            first_visit.setdefault(pair, t)

        # return: total discounted reward from a time step to the episode end;
        # it depends on all future rewards, so accumulate it backwards in time
        G = 0
        for t in reversed(range(len(states))):
            G = gamma * G + rewards[t]

            s, a = states[t], actions[t]

            # first-visit MC: only the return following the first occurrence of
            # a state-action pair in the episode contributes to its average
            if first_visit[(s, a)] == t:
                visit_counts[s, a] += 1
                # incremental form of "mean of all first-visit returns so far"
                Q[s, a] += (G - Q[s, a]) / visit_counts[s, a]

        # policy improvement: epsilon-greedy with respect to the updated Q
        policy = epsilon_greedy_policy(Q, nS, nA, epsilon)

        # save number of policy iterations and the policy's average episode reward
        num_policy_iter.append(n)
        average_reward = evaluate_average_reward(env, nA, policy, num_eval_episodes=100)
        average_rewards_lists.append(average_reward)

    return Q, policy, num_policy_iter, average_rewards_lists

Off-policy MC control
----------------------
---------------------

In [3]:
def offpolicy_control(env, nS, nA, num_episodes, gamma, epsilon):
    """Run off-policy Monte Carlo control with weighted importance sampling.

    Episodes are generated by an epsilon-greedy behaviour policy. ``Q`` is
    updated backwards through each episode with importance-sampling weights so
    that it estimates the value of the greedy target policy. After every
    episode both policies are re-derived from ``Q`` and evaluated.

    The helpers ``epsilon_greedy_policy``, ``greedy_policy`` and
    ``evaluate_average_reward`` are defined elsewhere in the file; policies are
    assumed to be tables indexable as ``policy[state][action]`` that hold
    action probabilities (that is how they are used here).

    Args:
        env: Environment whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``. ``reset()``
            is assumed to return ``(state, info)`` (Gymnasium-style API,
            matching the five-value ``step``). States index rows of ``Q``.
        nS: Number of states (rows of ``Q``).
        nA: Number of actions (columns of ``Q``).
        num_episodes: Number of training episodes, one policy iteration each.
        gamma: Discount factor applied to future rewards.
        epsilon: Exploration rate of the behaviour policy.

    Returns:
        ``(Q, t_policy, b_policy, num_policy_iter, average_rewards_list_t,
        average_rewards_list_b)``: the action-value table, the greedy target
        policy and epsilon-greedy behaviour policy derived from it, the list
        ``[0, 1, ..., num_episodes]`` and the evaluation rewards of the target
        and behaviour policy at each of those iterations.
    """
    # initialize action-value function
    Q = np.zeros([nS, nA])

    # list of possible actions to be taken by the agent
    A = list(range(nA))

    # cumulative sum of importance weights W for each state-action pair
    C = np.zeros([nS, nA])

    # both policies are derived from Q up front so that they are always bound,
    # even when num_episodes == 0
    b_policy = epsilon_greedy_policy(Q, nS, nA, epsilon)
    t_policy = greedy_policy(Q, nS, nA)

    # store number of policy iterations and average episode reward during
    # training; each curve starts with the reward of its own initial policy
    num_policy_iter = [0]
    average_rewards_list_t = [evaluate_average_reward(env, nA, t_policy, num_eval_episodes=100)]
    average_rewards_list_b = [evaluate_average_reward(env, nA, b_policy, num_eval_episodes=100)]

    for n in range(1, num_episodes + 1):
        # start from a freshly reset environment instead of assuming that the
        # evaluation helper left it in state 0
        state, _ = env.reset()

        # states[t], actions[t], rewards[t] describe time step t of the episode
        states = []
        actions = []
        rewards = []

        # loop until episode terminates or truncates; Q (and therefore the
        # behaviour policy) stays fixed while the episode is generated
        while True:
            # pick action according to the behaviour policy
            action = random.choices(A, weights=[b_policy[state][i] for i in range(nA)], k=1)[0]

            # take action
            next_state, reward, terminated, truncated, _ = env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)

            if terminated or truncated:
                break

            state = next_state

        # kept from the original flow: the evaluations below are handed a
        # reset environment
        env.reset()

        # discounted return and importance-sampling ratio, built up backwards
        G = 0
        W = 1

        for t in reversed(range(len(states))):
            s, a = states[t], actions[t]

            # return = immediate reward + discounted future return
            G = gamma * G + rewards[t]

            # cumulative sum of weights for this state-action pair
            C[s, a] += W

            # weighted-importance-sampling incremental update of Q
            Q[s, a] += (W / C[s, a]) * (G - Q[s, a])

            t_policy = greedy_policy(Q, nS, nA)

            # the target policy is deterministic: if the behaviour action is
            # not the greedy one, the target probability is 0, the ratio for
            # all earlier steps is 0 and nothing further can be learned
            if a != np.argmax(t_policy[s]):
                break

            # greedy action has target probability 1, so the ratio grows by
            # 1 / b(a|s); b_policy is still the policy that generated the episode
            W = W / b_policy[s][a]

        # refresh the behaviour policy so that the evaluated and returned
        # b_policy matches the updated Q (t_policy was refreshed in the loop)
        b_policy = epsilon_greedy_policy(Q, nS, nA, epsilon)

        average_reward_t = evaluate_average_reward(env, nA, t_policy, num_eval_episodes=100)
        average_reward_b = evaluate_average_reward(env, nA, b_policy, num_eval_episodes=100)

        average_rewards_list_t.append(average_reward_t)
        average_rewards_list_b.append(average_reward_b)

        num_policy_iter.append(n)

    return Q, t_policy, b_policy, num_policy_iter, average_rewards_list_t, average_rewards_list_b

SARSA
-----
------

In [11]:
def SARSA(env, nS, nA, num_episodes, alpha, gamma, epsilon):
    """Run tabular SARSA (on-policy TD control) with an epsilon-greedy policy.

    After every environment step ``Q[state][action]`` is moved towards
    ``reward + gamma * Q[next_state][next_action]`` where ``next_action`` is
    sampled from the current epsilon-greedy policy, and the policy is then
    re-derived from ``Q``. At the end of every episode the policy's average
    evaluation reward is recorded.

    The helpers ``epsilon_greedy_policy`` and ``evaluate_average_reward`` are
    defined elsewhere in the file; the policy is assumed to be a table
    indexable as ``policy[state][action]`` that holds action probabilities
    (that is how it is used here).

    Args:
        env: Environment whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)`` and which has
            an ``action_space.sample()``. ``reset()`` is assumed to return
            ``(state, info)`` (Gymnasium-style API, matching the five-value
            ``step``). States index rows of ``Q``.
        nS: Number of states (rows of ``Q``).
        nA: Number of actions (columns of ``Q``).
        num_episodes: Number of training episodes.
        alpha: Step size of the TD update.
        gamma: Discount factor applied to future rewards.
        epsilon: Exploration rate handed to ``epsilon_greedy_policy``.

    Returns:
        ``(Q, policy, num_policy_iter, average_rewards_list)``: the
        action-value table, the epsilon-greedy policy derived from it, the list
        ``[0, 1, ..., num_episodes]`` and the evaluation reward recorded at
        each of those iterations.
    """
    # initialize action-value function
    Q = np.zeros([nS, nA])

    # list of possible actions to be taken by the agent
    A = list(range(nA))

    # initialise epsilon-greedy policy
    policy = epsilon_greedy_policy(Q, nS, nA, epsilon)

    average_reward = evaluate_average_reward(env, nA, policy, num_eval_episodes=100)

    # store number of policy iterations and average episode reward during training
    num_policy_iter = [0]
    average_rewards_list = [average_reward]

    # loop over number of episodes of experience sampled by agent
    for n in range(1, num_episodes + 1):
        # start from a freshly reset environment instead of assuming that the
        # evaluation helper left it in state 0
        state, _ = env.reset()

        # the first action of an episode is drawn uniformly from the action space
        action = env.action_space.sample()

        # loop until episode terminates or truncates
        while True:
            # take chosen action and observe next state and reward
            next_state, reward, terminated, truncated, _ = env.step(action)

            # select the follow-up action with the current epsilon-greedy policy
            next_action = random.choices(
                A, weights=[policy[next_state][i] for i in range(nA)], k=1
            )[0]

            # SARSA update of the action-value function
            td_target = reward + gamma * Q[next_state][next_action]
            Q[state][action] = Q[state][action] + alpha * (td_target - Q[state][action])

            # keep the policy in sync with Q after every update, including the
            # final one of the episode, so that the policy evaluated and
            # returned below reflects the latest Q
            policy = epsilon_greedy_policy(Q, nS, nA, epsilon)

            if terminated or truncated:
                break

            # move on to the next state-action pair
            state = next_state
            action = next_action

        # save number of policy iterations
        num_policy_iter.append(n)
        # evaluate average episode reward
        average_reward = evaluate_average_reward(env, nA, policy, num_eval_episodes=100)
        average_rewards_list.append(average_reward)

        # kept from the original flow: leave the environment reset after the
        # episode (and on return from this function)
        env.reset()

    return Q, policy, num_policy_iter, average_rewards_list

Q-Learning
-----------
-----------

In [29]:
def Q_learning(env, nS, nA, num_episodes, alpha, gamma, epsilon):
    """Run tabular Q-learning (off-policy TD control).

    Actions are chosen with an epsilon-greedy behaviour policy while
    ``Q[state][action]`` is moved towards
    ``reward + gamma * max_a Q[next_state][a]``, i.e. towards the value of the
    greedy target policy. At the end of every episode both policies are
    derived from the current ``Q`` and evaluated.

    The helpers ``epsilon_greedy_policy``, ``greedy_policy`` and
    ``evaluate_average_reward`` are defined elsewhere in the file; policies are
    assumed to be tables indexable as ``policy[state][action]`` that hold
    action probabilities (that is how they are used here).

    Args:
        env: Environment whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``. ``reset()``
            is assumed to return ``(state, info)`` (Gymnasium-style API,
            matching the five-value ``step``). States index rows of ``Q``.
        nS: Number of states (rows of ``Q``).
        nA: Number of actions (columns of ``Q``).
        num_episodes: Number of training episodes.
        alpha: Step size of the TD update.
        gamma: Discount factor applied to future rewards.
        epsilon: Exploration rate of the behaviour policy.

    Returns:
        ``(Q, t_policy, b_policy, num_policy_iter, average_rewards_list_b,
        average_rewards_list_t)``: the action-value table, the greedy target
        policy and epsilon-greedy behaviour policy derived from it, the list
        ``[0, 1, ..., num_episodes]`` and the evaluation rewards of the
        behaviour and target policy at each of those iterations (note that the
        behaviour list comes first).
    """
    # initialize action-value function
    Q = np.zeros([nS, nA])

    # list of possible actions to be taken by the agent
    A = list(range(nA))

    # behaviour policy is epsilon-greedy, target policy is greedy; both are
    # derived up front so that they are always bound, even if no episode (or
    # only a one-step episode) is run
    b_policy = epsilon_greedy_policy(Q, nS, nA, epsilon)
    t_policy = greedy_policy(Q, nS, nA)

    # store number of policy iterations and average episode reward during
    # training; each curve starts with the reward of its own initial policy
    num_policy_iter = [0]
    average_rewards_list_b = [evaluate_average_reward(env, nA, b_policy, num_eval_episodes=100)]
    average_rewards_list_t = [evaluate_average_reward(env, nA, t_policy, num_eval_episodes=100)]

    # loop over number of episodes of experience sampled by agent
    for n in range(1, num_episodes + 1):
        # start from a freshly reset environment instead of assuming that the
        # evaluation helper left it in state 0
        state, _ = env.reset()

        # loop until episode terminates or truncates
        while True:
            # pick action according to the current behaviour policy
            action = random.choices(A, weights=[b_policy[state][i] for i in range(nA)], k=1)[0]

            # take action
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Q-learning update: bootstrap from the best action in next_state
            td_target = reward + gamma * np.max(Q[next_state])
            Q[state][action] = Q[state][action] + alpha * (td_target - Q[state][action])

            # keep the behaviour policy in sync with Q after every update,
            # including the final one of the episode
            b_policy = epsilon_greedy_policy(Q, nS, nA, epsilon)

            if terminated or truncated:
                break

            # update current state
            state = next_state

        # the target policy is only needed for evaluation and as return value,
        # so it is derived once per episode from the final Q
        t_policy = greedy_policy(Q, nS, nA)

        # save number of policy iterations
        num_policy_iter.append(n)
        # evaluate average episode reward
        average_reward_b = evaluate_average_reward(env, nA, b_policy, num_eval_episodes=100)
        average_reward_t = evaluate_average_reward(env, nA, t_policy, num_eval_episodes=100)

        average_rewards_list_b.append(average_reward_b)
        average_rewards_list_t.append(average_reward_t)

        # kept from the original flow: leave the environment reset after the
        # episode (and on return from this function)
        env.reset()

    return Q, t_policy, b_policy, num_policy_iter, average_rewards_list_b, average_rewards_list_t