In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle

from Environments.Stochastic_GridWorld8Actions import GridWorld

In [None]:
def smoothing_data(rewards_across_seeds, traps_across_seeds, variance_across_seeds, kernel_size=11000):
    """Smooth each seed's reward series with a moving average; pass trap data through.

    Parameters
    ----------
    rewards_across_seeds : iterable of 1-D sequences
        One reward series per seed.
    traps_across_seeds : iterable
        One trap series per seed. Collected into a list unchanged (no smoothing).
    variance_across_seeds : any
        Accepted for call compatibility only; it is not used.
    kernel_size : int, optional
        Width of the uniform averaging window. Defaults to 11000, the value
        that used to be hard-coded here.

    Returns
    -------
    tuple (all_rewards, all_traps)
        all_rewards : list of numpy arrays, one per reward series. Each is the
            full convolution with the uniform kernel, trimmed by `kernel_size`
            samples at both ends so that only windows lying completely inside
            the series remain. A series of length N yields N - kernel_size - 1
            values (an empty array when that is not positive).
        all_traps : list holding the items of `traps_across_seeds` as given.

    Raises
    ------
    ValueError
        If `kernel_size` is smaller than 1.
    """
    if kernel_size < 1:
        # A zero-width window would divide by zero when normalising the kernel.
        raise ValueError("kernel_size must be a positive integer")

    kernel = np.ones(kernel_size) / kernel_size

    all_rewards = []
    for data in rewards_across_seeds:
        rewards = np.convolve(data, kernel)
        # Drop the ramp-up / ramp-down edges where the window overhangs the data.
        all_rewards.append(rewards[kernel_size:-kernel_size])

    # Trap indicators are returned as-is; only the container is rebuilt.
    all_traps = list(traps_across_seeds)

    return all_rewards, all_traps


In [None]:
class Table_Q_Learning():
    """Tabular action-value learner with epsilon-greedy exploration.

    The update bootstraps from the expected next-state value under the current
    epsilon-greedy policy (an Expected-SARSA-style target), then squashes the
    TD error with tanh and scales it asymmetrically: by (1 - beta) when the
    error is non-negative and by (1 + beta) when it is negative.

    Attributes
    ----------
    rng : numpy.random.Generator
        Source of randomness for exploration.
    q_table : list of list of float
        q_table[state][action], initialised to 0.0.
    pi_table : list of list of float
        Initialised to a uniform distribution over actions. No method of this
        class reads it.
    epsilon : float
        Current exploration rate; multiplied by `annealing_coefficient` on
        every call to `update` while it is above 0.1.
    """

    def __init__(self, action_space = 8, state_space = 100, lr = 0.0001, gamma = 0.99, epsilon = 0.91, 
                 annealing_coefficient =  0.9999999, beta = 0, seed = 0):
        """Create zero-initialised tables and a seeded random generator.

        Parameters
        ----------
        action_space : int
            Number of discrete actions.
        state_space : int
            Number of discrete states.
        lr : float
            Learning rate applied to the transformed TD error.
        gamma : float
            Discount factor.
        epsilon : float
            Initial exploration probability.
        annealing_coefficient : float
            Multiplicative decay applied to epsilon on each update.
        beta : float
            Asymmetry coefficient of the TD-error scaling.
        seed : int
            Seed for `numpy.random.default_rng`.
        """
        self.rng = np.random.default_rng(seed)

        self.action_space = action_space
        self.state_space = state_space

        self.possible_actions = np.arange(action_space)

        self.lr = lr  # learning rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.annealing_coefficient = annealing_coefficient

        self.beta = beta

        # Independent row per state (no shared inner list between states).
        self.q_table = [
                        [0.0] * action_space
                        for _ in range(state_space)
                        ]

        self.pi_table = [
                        [1.0/action_space] * action_space
                        for _ in range(state_space)
                        ]

    def update(self, current_state, action, next_state, reward, done, env_obj):
        """Apply one learning step to q_table[current_state][action] and anneal epsilon.

        Parameters
        ----------
        current_state, action : int
            Indices of the entry being updated.
        next_state : int
            State reached after taking `action`.
        reward : float
            Reward received for the transition.
        done : int or bool
            Truthy when `next_state` is terminal; the bootstrap term is then dropped.
        env_obj : any
            Not used by this method; kept for call compatibility.
        """
        next_q_values = self.q_table[next_state]
        max_next_q = np.max(next_q_values)
        # Number of actions tied for the maximum; they share the greedy mass equally.
        n_max_next_q = sum(1 for q in next_q_values if q == max_next_q)

        non_greedy_actions_prob = self.epsilon / self.action_space
        greedy_action_prob = (1 - self.epsilon) / n_max_next_q + non_greedy_actions_prob

        # Expected next-state value under the epsilon-greedy distribution.
        expected_q = 0
        for i in range(self.action_space):
            next_q = next_q_values[i]
            if next_q == max_next_q:
                expected_q += greedy_action_prob * next_q
            else:
                expected_q += non_greedy_actions_prob * next_q

        td_error = reward + self.gamma * expected_q * (1 - done) - self.q_table[current_state][action]

        k = self.beta
        k_plus = (1 - k)   # scale for non-negative TD errors
        k_minus = (1 + k)  # scale for negative TD errors

        # tanh bounds the step size; the sign of the raw error selects the scale.
        if td_error >= 0:
            td_error = k_plus * np.tanh(1/4 * td_error)
        else:
            td_error = k_minus * np.tanh(1/4 * td_error)

        self.q_table[current_state][action] += self.lr * td_error

        # Anneal epsilon; once it is at or below 0.1 it is left unchanged.
        if self.epsilon > 0.1:
            self.epsilon *= self.annealing_coefficient

    def take_action(self, current_state, epsilon = None):
        """Choose an action epsilon-greedily for `current_state`.

        Parameters
        ----------
        current_state : int
            State index.
        epsilon : float or None
            Exploration probability for this call. When None, the agent's own
            `self.epsilon` is used. An explicit 0 is honoured (pure greedy).

        Returns
        -------
        tuple (action, q_value, q_row)
            action : the chosen action index (a numpy integer).
            q_value : the Q-value of the chosen action in `current_state`.
            q_row : the row of the Q-table for `current_state` (the table's
                own list, not a copy).

        Notes
        -----
        The greedy branch uses `np.argmax`, which returns the first maximal
        index, so ties are resolved towards the lowest action index here even
        though `update` splits the greedy probability evenly across ties.
        """
        # `is None` rather than `== None`: identity test, safe for array-like inputs.
        ep_policy = self.epsilon if epsilon is None else epsilon
        q_row = self.q_table[current_state]

        if self.rng.random() < ep_policy:
            random_possible_action = self.rng.choice(self.possible_actions)
            return random_possible_action, q_row[random_possible_action], q_row

        max_action = np.argmax(q_row)
        max_q_action = np.max(q_row)
        return max_action, max_q_action, q_row

    def set_RDG_seed(self, seed):
        """Replace the agent's random generator with a fresh one seeded by `seed`."""
        self.rng = np.random.default_rng(seed)
    


In [4]:
# Pickled transition tables to load. The file names suggest one file per
# variance setting; re-enable a commented entry to load it as well.
transition_files = [
    'test_transitions_for_variance_1.0.pickle',
    # 'test_transitions_for_variance_0.3.pickle',
    # 'test_transitions_for_variance_0.5.pickle',
    # 'test_transitions_for_variance_0.7.pickle',
]

transitions_variance = []

for transition_path in transition_files:
    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    # The context manager closes the file even if unpickling raises.
    with open(transition_path, 'rb') as file:
        transition = pickle.load(file)
    transitions_variance.append(transition)

In [8]:
# Experiment driver: for every beta value, train a fresh tabular agent on 4
# seeds in the same grid world, collect per-episode statistics, and pickle the
# trap indicators, state-visit heat maps and final Q-tables for that beta.
for beta_data in [0,0.2,0.3,0.4,0.6,0.8]:
    env_obj = GridWorld(x_dim = 7, y_dim = 7, deterministic_T_prob = 1.0, reward_location=42) 
    # Replace the environment's transition table with the first one loaded from pickle.
    env_obj.state_transitions = transitions_variance[0]

    # env_obj.set_penalty(2, 1)
    # env_obj.set_penalty(4, 3)
    # env_obj.set_penalty(3, 4)

    # NOTE(review): the meaning of set_penalty's two arguments is defined in
    # GridWorld and is not visible here — confirm before changing these.
    env_obj.set_penalty(2, 1)
    env_obj.set_penalty(4, 3)
    env_obj.set_penalty(3, 4)


    # Per-beta accumulators, one entry per seed.
    traps_across_seeds = []
    rewards_across_seeds = []
    std_across_seeds = []

    trajectory_heat_map_arr = []
    q_values_across_seeds = []

    for seed in range(4):

        # Number of training episodes per seed.
        EPISODES = 800000
        
        # traps[i] becomes 1 when episode i terminates in a state whose attribute equals 2.
        traps = np.zeros(EPISODES)
        rewards_arr_episodes = []
        # NOTE(review): never appended to — the std tracking below is commented out.
        std_per_episode = []

        # A new agent per seed, so Q-values and epsilon start from scratch each time.
        agent_obj = Table_Q_Learning(lr = 0.01, state_space=49, annealing_coefficient =  0.999999,
                                      beta = beta_data, seed=seed)
        # Visit counts per grid cell, accumulated over all episodes of this seed.
        trajectory_heat_map = np.zeros((7,7))
        # NOTE(review): stays empty — the append further down is commented out.
        q_values_arr = []
        
        for i in range(EPISODES): #episodes
        
            # reset() returns a sequence; its first element is used as the state index.
            state = env_obj.reset()[0]
            mean_episodic_reward = 0
            #variance_episodic_reward = 0
            #std_episodic_reward = 0
            
            # At most 5000 steps per episode; j starts at 1 so it can serve as
            # the sample count of the running mean below.
            for j in range(1,5001): #steps    

                # q and all_q are returned by the agent but not used in this loop.
                action, q, all_q = agent_obj.take_action( state ) 
                new_state, reward, done, _ = env_obj.step(state, action)
                # As with reset(), the state index is the first element.
                new_state = new_state[0]
                
                # Incremental mean of the step rewards: mean += (reward - mean) / j.
                tempx = (reward - mean_episodic_reward)
                mean_episodic_reward += tempx / j
                # This difference is with respect to the updated mean
                #squared_difference = (reward - mean_episodic_reward)**2
                #variance_episodic_reward = ((j-1)*variance_episodic_reward + squared_difference) / j
                #std_episodic_reward = np.sqrt(variance_episodic_reward)

                # Learning step; the agent also anneals its epsilon inside update().
                agent_obj.update(state, action, new_state, reward, done, env_obj)


               
                # Find the grid cell whose entry equals the current state id and count a visit.
                x, y = np.where(env_obj.grid == state)
                trajectory_heat_map[x[0]][y[0]] +=1
            
                
                if(done == 1):
                    
                    # The loop breaks here, so the terminal state never becomes
                    # `state`; count its visit explicitly.
                    x, y = np.where(env_obj.grid == new_state)
                    trajectory_heat_map[x[0]][y[0]] +=1

                     
                    # if i % 1000 == 0 and i > 140000:
                    #     print("Episode ",i, " Epsilon ",agent_obj.epsilon, " mean reward ",mean_episodic_reward)
                    # Presumably attribute value 2 marks a trap state — confirm in GridWorld.
                    if(env_obj.state_attributes[new_state] == 2):
                        traps[i] = 1
                    break

                state = new_state
        

            # Mean reward per step of the episode that just finished.
            rewards_arr_episodes.append(mean_episodic_reward)
            #q_values_arr.append(agent_obj.q_table)
            #std_per_episode.append(std_episodic_reward)

        traps_across_seeds.append(traps)
        rewards_across_seeds.append(rewards_arr_episodes)
        # NOTE(review): std_per_episode is always an empty list at this point.
        std_across_seeds.append(std_per_episode)
        trajectory_heat_map_arr.append(trajectory_heat_map)
        # Final Q-table of this seed's agent (the agent is re-created for the next seed).
        q_values_across_seeds.append(agent_obj.q_table)

    # NOTE(review): all_rewards is computed but not saved (the dump below is
    # commented out); all_traps comes back from smoothing_data unsmoothed.
    all_rewards, all_traps = smoothing_data(rewards_across_seeds, traps_across_seeds, std_across_seeds)
    # with open('Prospect_framework_reward_zeta_test_'+str(zeta_data)+'.pickle', 'wb') as file:
    #     pickle.dump(all_rewards, file)

    # NOTE(review): this file name says "zeta" while the other outputs say
    # "beta"; the value in the name is beta_data. Check downstream readers
    # before renaming.
    with open('Prospect_framework_traps_zeta_test_'+str(beta_data)+'.pickle', 'wb') as file:
        pickle.dump(all_traps, file)

    # # with open('Prospect_framework_variance_'+determinism+'.pickle', 'wb') as file:
    # #     pickle.dump(all_variance_rewards, file)

    # One 7x7 visit-count array per seed.
    with open('Prospect_framework_heatmap_beta_test_'+str(beta_data)+'.pickle', 'wb') as file:
        pickle.dump(trajectory_heat_map_arr, file)
    
    # One final Q-table (list of per-state rows) per seed.
    with open('Prospect_framework_q_value_beta_test_'+str(beta_data)+'.pickle', 'wb') as file:
        pickle.dump(q_values_across_seeds, file)
    
