In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Install and load import-ipynb.
# NOTE(review): presumably needed so that the Environment / Modified_Environment
# notebooks can be imported as modules in a later cell — confirm.
!pip install import-ipynb
import import_ipynb

In [None]:
# Walk into drive/My Drive/RL/Homework/Homework 4, one directory per %cd.
# Every path is relative, so this assumes the working directory is the parent
# of the mounted `drive` folder when the cell runs — TODO confirm; running the
# cell again from inside the homework folder will likely fail.
%cd drive
%cd 'My Drive'
%cd 'RL'
%cd 'Homework'
%cd 'Homework 4'

In [None]:
'''Needed Libraries and Environment'''
import Environment
import Modified_Environment
import matplotlib.pyplot as plt
import numpy as np
import math
import random
from collections import defaultdict
import sys
import itertools
import matplotlib
import pandas as pd
import pickle

'''Constants Defined'''
# Index positions inside the per-step "information" lists that the agent
# builds as [probs, state, action, reward] and reads back in TB_returns.
PROBABILITY = 0
STATE = 1
ACTION = 2
REWARD = 3
# Placeholder for the unused probability slot; only referenced by the
# commented-out n-step implementation further down.
DUMMY = None
# Cap on the step counter of a single episode's loop.
MAX_T = 10000
# Number of training episodes (also used as the x-axis length when plotting).
TOTAL_EPISODES = 2000
# Discount passed to the agent; multiplies every bootstrapped value.
gamma = 0.9
# Step size of the Q-value update.
ALPHA = 0.2
# Lower bound for the decaying epsilon.
MIN = 0.1
# Multiplicative decay applied to epsilon.
DECAY_FACTOR = 0.99
# Number of steps for the n-step agent; only used by the commented-out
# second version.
n = 3

In [9]:
class TreeBackupAgent():
    """Tabular 3-step Tree Backup agent.

    A single epsilon-greedy policy derived from the current ``Q`` is used both
    to select actions and as the policy whose expectation is backed up.

    Attributes:
        env: environment exposing ``reset()``, ``step(action)`` returning the
            4-tuple ``(state, reward, done, info)``, and ``action_space.n``.
        Q: ``defaultdict`` mapping a state to an array holding one value per
            action; entries are zero-initialised on first access.
        actions: number of discrete actions.
        gamma: discount applied to bootstrapped values.
        TB_episode_scores: summed reward of every episode run so far.
    """

    # Number of real transitions a backup looks ahead; TB_returns is written
    # for exactly this depth.
    _LOOKAHEAD = 3

    def __init__(self, env, gamma):
        """Store the environment and discount, and create an empty Q-table."""
        self.env = env
        self.Q = defaultdict(lambda: np.zeros(env.action_space.n))
        self.actions = env.action_space.n
        self.gamma = gamma
        self.TB_episode_scores = []

    def epsilon_greedy_policy(self, state, epsilon):
        """Return the epsilon-greedy action distribution for ``state``.

        Every action receives ``epsilon / n_actions``; the action with the
        largest current Q-value additionally receives ``1 - epsilon``.

        Returns:
            1-D float array of length ``self.actions`` that sums to 1.
        """
        epsilon_policy = np.ones(self.actions, dtype=float) * epsilon / self.actions
        best_action = np.argmax(self.Q[state])
        epsilon_policy[best_action] += (1.0 - epsilon)
        return epsilon_policy

    def reach_island(self):
        """Train for ``TOTAL_EPISODES`` episodes.

        Epsilon starts at 1 and, on every even-numbered episode, is multiplied
        by ``DECAY_FACTOR`` without going below ``MIN``. One score per episode
        is appended to ``self.TB_episode_scores``.
        """
        epsilon = 1 #<-- keep this line if you want to use decaying epsilon
        #epsilon = 0.3 #<-- uncomment if you want to use constant epsilon
        for episode in range(1, TOTAL_EPISODES + 1):
            if episode % 2 == 0:
                print("\rEpisode {}/{}.".format(episode, TOTAL_EPISODES), end="")
                sys.stdout.flush()
                epsilon = max(epsilon * DECAY_FACTOR, MIN)#<-- comment out if you want to use constant epsilon
            self.TB_episode_scores.append(self._run_episode(epsilon))

    def _run_episode(self, epsilon):
        """Play one episode with sliding-window 3-step backups; return its score.

        Every environment transition is taken exactly once and kept in
        ``pending`` until it has been backed up, so the state being updated
        always belongs to the trajectory the environment actually followed.
        No step is taken after the environment reports ``done``; the
        transitions still pending at that point are backed up with the
        shorter look-ahead that is available.
        """
        frontier = self.env.reset()   # state the environment is currently in
        pending = []                  # (state, action, reward), oldest first
        done = False
        episode_score = 0
        for t in itertools.count():
            # Extend the look-ahead to three real transitions unless the
            # episode has already ended.
            while len(pending) < self._LOOKAHEAD and not done:
                probs = self.epsilon_greedy_policy(frontier, epsilon)
                action = np.random.choice(np.arange(len(probs)), p = probs)
                next_state, reward, done, _ = self.env.step(action)
                pending.append((frontier, action, reward))
                episode_score += reward
                frontier = next_state
            if not pending:
                break  # episode ended and every transition has been backed up
            self._backup_oldest(pending, frontier, done, epsilon)
            pending.pop(0)
            if t > MAX_T:
                break
        return episode_score

    def _backup_oldest(self, pending, frontier, done, epsilon):
        """Update Q for the oldest pending transition via ``TB_returns``.

        Policy probabilities are recomputed from the current Q-table. When the
        episode has ended, the state after the last real transition is given
        an all-zero probability vector: that makes its expected value 0 and
        gives zero weight to everything behind it, so missing look-ahead steps
        can be filled with zero-reward placeholders without affecting the
        target.
        """
        infos = [[self.epsilon_greedy_policy(s, epsilon), s, a, r] for s, a, r in pending]
        if done:
            tail_probs = np.zeros(self.actions)
        else:
            tail_probs = self.epsilon_greedy_policy(frontier, epsilon)
        while len(infos) < self._LOOKAHEAD:
            # Only reached when done is True (otherwise the window is full).
            infos.append([tail_probs, frontier, 0, 0.0])
        self.TB_returns(infos[0], infos[1], infos[2], [tail_probs, frontier])

    def TB_returns(self, info, prime_info, double_prime_info, triple_prime_info):
        """Apply one 3-step Tree Backup update to ``Q[info state][info action]``.

        Args:
            info, prime_info, double_prime_info: ``[probs, state, action,
                reward]`` for three consecutive transitions, where ``probs``
                is the policy distribution at ``state``.
            triple_prime_info: ``[probs, state]`` for the state reached after
                the third transition.

        The target is the one-step expected return of the first transition
        plus the TD errors of the second and third transitions, each weighted
        by ``gamma`` times the policy probability of the action that was
        actually taken on the way to it.
        """
        V = np.sum(prime_info[PROBABILITY] *self.Q[prime_info[STATE]])
        first_step = info[REWARD] + self.gamma * V

        prime_V = np.sum(double_prime_info[PROBABILITY] *self.Q[double_prime_info[STATE]])
        first_step_error = prime_info[REWARD] + self.gamma * prime_V - self.Q[prime_info[STATE]][prime_info[ACTION]]
        # Probability of the action that was taken, not of the greedy action:
        # the two differ whenever an exploratory action was selected.
        prime_action_selection_prob = prime_info[PROBABILITY][prime_info[ACTION]]

        second_step = self.gamma * prime_action_selection_prob * first_step_error

        double_prime_V = np.sum(triple_prime_info[PROBABILITY] *self.Q[triple_prime_info[STATE]])
        second_step_error = double_prime_info[REWARD] + self.gamma * double_prime_V - self.Q[double_prime_info[STATE]][double_prime_info[ACTION]]
        double_prime_action_selection_prob = double_prime_info[PROBABILITY][double_prime_info[ACTION]]

        third_step = self.gamma * prime_action_selection_prob * self.gamma * double_prime_action_selection_prob * second_step_error

        target = first_step + second_step + third_step
        self.Q[info[STATE]][info[ACTION]] += ALPHA * (target - self.Q[info[STATE]][info[ACTION]])
        

In [10]:
# Exactly one of the two `env` lines must be active; with both commented out
# `env` is undefined on a fresh kernel and the cell raises NameError.
env = Environment.GridworldEnv()  # normal environment
# env = Modified_Environment.GridworldEnv()  # <-- use this line instead for the modified environment (bonus part)
agent = TreeBackupAgent(env, gamma)
agent.reach_island()

Episode 2000/2000.

In [None]:
# Raw per-episode score of the 3-step Tree Backup agent.
episode_scores = agent.TB_episode_scores
plt.figure(figsize=(12, 6))
# The x-axis is taken from the data itself so that x and y always have the
# same length, even if the number of recorded scores is not TOTAL_EPISODES.
plt.plot(range(len(episode_scores)), episode_scores, color = '#633974')
plt.xlabel('episodes ->')
plt.ylabel('episode score ->')
plt.title('Tree Backup(3-step)')
plt.show()

In [None]:
# Rolling mean of the episode scores; the first (window_size - 1) points are
# NaN because min_periods equals the window size, so the curve starts there.
plt.figure(figsize=(12, 6))
window_size = 50
smoothed_score = pd.Series(agent.TB_episode_scores).rolling(window_size , min_periods = window_size).mean()
plt.plot(smoothed_score, color = '#633974')
#plt.annotate(smoothed_score.iloc[-1], xy=(2000, smoothed_score.iloc[-1]), xytext=(2000, smoothed_score.iloc[-1]), color = '#633974',) # <--uncomment for modified environment (bonus part)
plt.xlabel("episode ->")
plt.ylabel("episode score (smoothed) -> ")
plt.title('Tree Backup(3-step)')
plt.show()

In [15]:
# I saved my reward data to a pickle file (the file name has no extension) so it can be reused later.

# with open('3_TB_bonus_scores', 'wb') as fp:
#     pickle.dump(agent.TB_episode_scores, fp)

Second version: a general n-step Tree Backup agent (every cell below is commented out and kept for reference)


In [None]:
# def create_behavior_policy(Q, nA, epsilon=0.3):
#     def policy_fn(observations):
#         A = np.ones(nA, dtype=np.float) * (epsilon/nA)
#         best_action = np.argmax(Q[observations])
#         A[best_action] += 1.0 - epsilon
#         return A
#     return policy_fn

# def create_target_policy(Q, nA, epsilon=0.1):
#     def policy_fn(observations):
#         A = np.ones(nA, dtype=np.float) * (epsilon/nA)
#         best_action = np.argmax(Q[observations])
#         A[best_action] += 1.0 - epsilon
#         return A
#     return policy_fn

In [None]:
# class nStepTreeBackupAgent():
#   def __init__(self, env, n, gamma):
#       self.env = env
#       self.n = n
#       self.Q = defaultdict(lambda: np.zeros(env.action_space.n))
#       self.actions = env.action_space.n
#       self.behavior_policy = create_behavior_policy(self.Q, self.actions)
#       self.target_policy = create_target_policy(self.Q, self.actions)
#       #self.epsilon = epsilon
#       self.gamma = gamma
#       self.nstep_TB_episode_scores = []


#   def reach_island(self):
#     epsilon = 1
#     for episode in range(1, TOTAL_EPISODES + 1):
#         episode_score = 0
#         if episode % 2 == 0:
#                 epsilon = max(epsilon * DECAY_FACTOR, MIN)
#                 print("\rEpisode {}/{}.".format(episode, TOTAL_EPISODES), end="")
#                 sys.stdout.flush()
                
#         stored_rewards, stored_states, stored_actions  = {}, {}, {}        
#         T, t, tau = sys.maxsize, -1, 0
      
#         state = env.reset()
#         probs = self.behavior_policy(state)
#         action = np.random.choice(np.arange(len(probs)), p = probs)

#         stored_states[0], stored_actions[0]  = state, action
           
#         while tau < (T-1):
#             t+=1
#             if t < T:
#                 state, reward, done, _ = env.step(action)
                
#                 stored_states[(t + 1) % (self.n + 1)], stored_rewards[(t + 1) % (self.n + 1)] = state, reward
   
#                 episode_score += reward  
                              
#                 if done or t > MAX_T: T = t+1
#                 else:
#                     action_probs = self.behavior_policy(state)
#                     action = np.random.choice(np.arange(self.actions), p=action_probs)
#                     stored_actions[(t + 1) % (self.n + 1)] = action
                    
#             tau = t - self.n + 1
#             information = [DUMMY, stored_states, stored_actions, stored_rewards]
#             self.nstep_TB_returns(information, t, tau, T)
        
#         self.nstep_TB_episode_scores.append(episode_score)


#   def nstep_TB_returns(self, information, t, tau, T):
#       if tau >= 0:
#           if (t + 1) >= T:
#               G = information[REWARD][T % (self.n+1)]
#           else:
#               s_t1 = information[STATE][(t + 1) % (self.n + 1)]
#               leaf_sum = np.sum([(self.target_policy(s_t1)[a])*self.Q[s_t1][a] for a in range(self.actions)])
#               G = information[REWARD][(t + 1) % (self.n + 1)] + self.gamma * leaf_sum
          
#           for k in range(min(t, T-1), tau, -1):
#               s_k, a_k = information[STATE][k % (self.n + 1)], information[ACTION][k % (self.n + 1)]
#               action_probs = np.sum([self.target_policy(s_k)[a]* self.Q[s_k][a] for a in range(self.actions) if a!= a_k])
#               G = information[REWARD][k % (self.n + 1)] + self.gamma * (action_probs + self.target_policy(s_k)[a_k]*G)
          
#           s_tau, a_tau =  information[STATE][tau % (self.n + 1)], information[ACTION][tau % (self.n + 1)]
#           self.Q[s_tau][a_tau] += ALPHA * (G - self.Q[s_tau][a_tau])


In [None]:

# agent = nStepTreeBackupAgent(env, n, gamma)
# agent.reach_island()

Episode 2000/2000.

In [None]:
# plt.figure(figsize=(12, 6))
# plt.plot(range(TOTAL_EPISODES), agent.nstep_TB_episode_scores, color = 'thistle', label = '$\epsilon$ = from $1$ to $0.1$')
# plt.xlabel('episodes ->')
# plt.ylabel('epsiode score ->')
# plt.title('Tree Backup(3-step)')
# plt.legend()
# plt.show()

In [None]:
# plt.figure(figsize=(12, 6))
# window_size = 50
# smoothed_score = pd.Series(agent.nstep_TB_episode_scores).rolling(window_size , min_periods = window_size).mean()
# plt.plot(smoothed_score, color = 'thistle',label = '$\epsilon$ = from $1$ to $0.1$')
# plt.xlabel("epsiode ->")
# plt.ylabel("epsiode score (smoothed) -> ")
# plt.title('Tree Backup(3-step)')
# plt.legend()
# plt.show()