# MULTI-TASK DEEP REINFORCEMENT LEARNING

This is the python notebook file which houses all code related to the Multi-Task Deep Reinforcement Learning project in the 2022 Tripods/StemForAll REU program run through the University of Rochester. We have built different grid games and different agents to play these grid games.

In [None]:
import numpy as np
from PIL import Image, ImageDraw
import time
from random import sample
import numpy.random as random
import matplotlib.pyplot as plt

# GRIDWORLD

This class allows us to define a grid game and its basic characteristics such as the size of the board, where the agent starts and currently is during an episode, and what its reward matrix and possible moves are

In [None]:
class GridWorld:
    """
    Base description of a rectangular grid game.

    Stores the board dimensions, the start and terminal squares, the agent's
    current position, the reward matrix and the moves available to the agent.
    Points are (row, column) tuples.
    """
    def __init__(self, board_dims, start_state, terminal_state, reward_matrix):
        """
        board_dims - (height, width) of the board
        start_state - the point the agent starts each episode in
        terminal_state - the point that ends an episode (may be None)
        reward_matrix - object indexed by a point to obtain the reward for that square
        """
        self.board_dims = board_dims
        self.height, self.width = board_dims
        self.start_state = start_state
        self.agent_pos = start_state
        self.terminal_state = terminal_state
        self.reward_matrix = reward_matrix
        self.moves = [(-1,0),(0,-1),(1,0),(0,1)] #up,left,down,right

    def get_moves(self, point):
        """
        point - the point our agent is in
        This function returns, for every move in self.moves, the point the agent would end up in
        when making that move from the point provided. The returned points are not checked
        against the board boundaries.
        """
        return [tuple(np.array(point)+np.array(x)) for x in self.moves]

    def point_to_index(self, point):
        """
        point - the point we want to convert to index (for list of lists e.g.)
        This function obtains the index when you iterate across each column over each row of our game board
        """
        return point[0]*self.width + point[1]

    def index_to_point(self, index):
        """
        index - the index we want to convert to game square point
        This function is the inverse of point_to_index: it returns the (row, column) point
        that corresponds to the index provided
        """
        #(divisions, remainder)
        return (index // self.width, index % self.width)

    def get_reward(self,point):
        """
        point - a point in our grid
        This function returns the reward for our agent going to the state represented by point
        """
        return self.reward_matrix[point]

    def illegal_move(self,point):
        """
        point - a point that may or may not be in our grid
        This function returns true if the move we make takes us off the board and false if the move is legal
        """
        # A range check (rather than equality with height/width) also rejects
        # points that lie more than one square beyond the bottom/right edge.
        row_on_board = 0 <= point[0] < self.height
        col_on_board = 0 <= point[1] < self.width
        return not (row_on_board and col_on_board)

    def print_board(self):
        """
        This function prints the reward matrix of the game
        """
        print(self.reward_matrix)

# Individual Game Implementations

Here we define each game and have it inherit attributes from GridWorld object

## Targeting Game

The targeting game task consists of the agent starting in a different square from the "target" square. The goal of the game is to have the agent reach the target square in as few moves as possible.

In [None]:
class Targeting_Game(GridWorld):
    """
    Grid game in which the agent has to walk from the start square to a single terminal (target) square.
    """
    def __init__(self, board_dims, start_state, terminal_state, reward_matrix):
        super().__init__(board_dims, start_state, terminal_state, reward_matrix)

    def draw(self):
        """
        This function draws the board: the start square in blue, the terminal square in red,
        every other square in white, and the agent as a black circle on its current square.
        """
        canvas = Image.new("RGB", (501, 501), "black")
        pen = ImageDraw.Draw(canvas)

        cell_w = 500/(self.width)
        cell_h = 500/(self.height)

        #Draw Grid and Start/Stop Squares
        for row in range(self.height):
            for col in range(self.width):
                if row == self.start_state[0] and col == self.start_state[1]:
                    fill = "blue"
                elif row == self.terminal_state[0] and col == self.terminal_state[1]:
                    fill = "red"
                else:
                    fill = "white"
                top_left = (0+col*cell_w, 0+row*cell_h)
                bottom_right = (cell_w+col*cell_w, cell_h+row*cell_h)
                pen.rectangle((top_left, bottom_right), outline = "black", fill = fill)

        #Draw Agent (circle covering the middle half of its square)
        agent_x = self.agent_pos[1]*cell_w
        agent_y = self.agent_pos[0]*cell_h
        pen.ellipse((agent_x + cell_w/4, agent_y + cell_h/4, 3*cell_w/4 + agent_x, 3*cell_h/4 + agent_y), fill="black")

        display(canvas)

    def update_state(self,new_pos, if_illegal):
        """
        new_pos - a point in the game grid that the agent tried to move to
        if_illegal - True when the attempted move is illegal
        This function moves the agent to new_pos unless the move was illegal, in which case
        the agent stays where it is.
        """
        if if_illegal:
            return
        self.agent_pos = new_pos

    def is_episode_terminal(self):
        """
        This function returns True when the agent stands on the terminal square, False otherwise.
        """
        return bool(self.agent_pos == self.terminal_state)

    def refresh_game(self):
        """
        This function puts the agent back on the start square so a new episode can begin.
        """
        self.agent_pos = self.start_state

## Collection Game

The collection game task consists of the agent starting in a different square from a set of prize squares. The goal of the game is to have the agent collect every prize in as few moves as possible (each prize is removed upon collection).

In [None]:
class Collection_Game(GridWorld):
    """
    Grid game in which the agent has to step on every prize square. A prize disappears
    once the agent reaches it and the episode ends when no prizes remain.
    """
    def __init__(self, board_dims, start_state, prize_states, reward_matrix, prize_value=0):
        super().__init__(board_dims, start_state, None, reward_matrix)
        self.prize_states = prize_states
        self.remaining_prize_states = list(prize_states)
        self.prize_value = prize_value
        # write the prize value into the reward matrix for every prize square
        for square in prize_states:
            self.reward_matrix[square] = prize_value

    def draw(self):
        """
        This function draws the board: the start square in blue, every other square in white,
        the agent as a black circle and each remaining prize as a yellow square.
        """
        canvas = Image.new("RGB", (501, 501), "black")
        pen = ImageDraw.Draw(canvas)

        cell_w = 500/(self.width)
        cell_h = 500/(self.height)

        #Draw Grid and Start Square
        for row in range(self.height):
            for col in range(self.width):
                is_start = row == self.start_state[0] and col == self.start_state[1]
                fill = "blue" if is_start else "white"
                pen.rectangle(((0+col*cell_w, 0+row*cell_h),(cell_w+col*cell_w, cell_h+row*cell_h)), outline = "black", fill = fill)

        #Draw Agent
        agent_x = self.agent_pos[1]*cell_w
        agent_y = self.agent_pos[0]*cell_h
        pen.ellipse((agent_x + cell_w/4, agent_y + cell_h/4, 3*cell_w/4 + agent_x, 3*cell_h/4 + agent_y), fill="black")

        #Draw Prizes (nothing is drawn when no prize remains)
        for prize in self.remaining_prize_states:
            prize_x = prize[1]*cell_w
            prize_y = prize[0]*cell_h
            pen.rectangle(((prize_x + cell_w/4, prize_y + cell_h/4), (3*cell_w/4+prize_x, 3*cell_h/4+prize_y)), outline = "black", fill = "yellow")

        display(canvas)

    def update_state(self,new_pos, if_illegal):
        """
        new_pos - a point in the game grid that the agent tried to move to
        if_illegal - True when the attempted move is illegal
        This function moves the agent to new_pos when the move is legal and then collects
        the prize on the agent's square, if there is one.
        """
        if not if_illegal:
            self.agent_pos = new_pos

        standing_on_prize = self.agent_pos in self.remaining_prize_states
        if standing_on_prize:
            self.remove_prize(self.agent_pos)

    def remove_prize(self, prize_point):
        """
        prize_point - a point in the game grid that contained a prize
        This function takes the prize at prize_point out of the remaining prizes, overwrites the
            reward of that square with the reward stored for the start square, and, when exactly one
            prize is left, records that prize as the terminal state.
        """
        self.remaining_prize_states.remove(prize_point)
        # the collected square is given the start square's reward for the rest of the episode
        self.reward_matrix[prize_point] = self.reward_matrix[self.start_state]
        if len(self.remaining_prize_states) == 1:
            (last_prize,) = self.remaining_prize_states
            self.terminal_state = last_prize

    def is_episode_terminal(self):
        """
        This function returns True once every prize has been collected, False otherwise.
        """
        return not self.remaining_prize_states

    def refresh_game(self):
        """
        This function resets the agent position, the remaining prizes, the terminal state and
            the prize entries of the reward matrix. This is used between each episode
        """
        self.agent_pos = self.start_state
        self.remaining_prize_states = list(self.prize_states)
        self.terminal_state = None

        for square in self.prize_states:
            self.reward_matrix[square] = self.prize_value

## FindMax_Game



In [None]:
from PIL import ImageFont

class FindMax_Game(GridWorld):
    """
    Grid game in which the terminal square is the square holding the maximum value of the
    reward matrix. The game keeps track of the number of steps taken and of the sum of the
    rewards of the squares the agent has been on.
    """

    def __init__(self, board_dims, start_state, reward_matrix):
        """
        board_dims - (height, width) of the board
        start_state - the point the agent starts each episode in
        reward_matrix - matrix of rewards; its argmax becomes the terminal state
        """
        # the terminal state is where the max value locates
        terminal_state = np.unravel_index(np.argmax(reward_matrix, axis=None), board_dims)
        GridWorld.__init__(self, board_dims, start_state, terminal_state, reward_matrix)
        # initial sum of rewards is the reward value at the start state
        self.sum = reward_matrix[start_state]
        # the number of steps
        self.n_steps = 0
        # the step limit is the Manhattan distance between the start state and the terminal state
        self.step_limit = sum(abs(np.array(self.terminal_state)-np.array(start_state)))

    @staticmethod
    def _load_font(size=40):
        """
        Load the font used to print the reward values, falling back to Pillow's built-in
        font when the Liberation font file cannot be read on this machine.
        """
        # use !fc-list or !fc-list | grep "" to get the path of the font-type on colab
        try:
            return ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf", size)
        except OSError:
            return ImageFont.load_default()

    @staticmethod
    def _text_size(font, text):
        """
        Return (width, height) of text as font.getsize used to report it. font.getsize was
        removed in Pillow 10; the right/bottom edges of getbbox are the same two numbers.
        """
        if hasattr(font, "getbbox"):
            _, _, right, bottom = font.getbbox(text)
            return right, bottom
        return font.getsize(text)

    def draw(self):
        """
        This function draws the board with the reward value printed in every square, the start
        square in blue, the terminal (max) square in red and the agent as a translucent circle.
        """
        image = Image.new("RGBA", (501, 501), (255, 255, 255, 255)) # white
        draw = ImageDraw.Draw(image)

        w = 500/(self.width)
        h = 500/(self.height)

        font = self._load_font()

        #Draw Grid
        for i in range(self.height):
            for j in range(self.width):
                color = (255, 255, 255, 0) # white, transparent
                if(i == self.start_state[0] and j == self.start_state[1]):
                    color = (0, 0, 255, 255)  # blue for the starting point
                if(i == self.terminal_state[0] and j == self.terminal_state[1]):
                    color = (255, 0, 0, 255)  # red for the max (wins over blue if both apply)
                draw.rectangle(((0+j*w, 0+i*h),(w+j*w, h+i*h)), outline = "black", fill = color)
                label = str(self.reward_matrix[(i,j)])
                text_w, text_h = self._text_size(font, label)
                draw.text((j*w + w/2 - text_w/2, i*h + h/2 - text_h/2),
                          label, font=font, fill=(0, 0, 0, 255))

        #Draw Agent on its own transparent layer so the value underneath stays visible
        agent_layer = Image.new('RGBA', (501, 501), (255, 255, 255, 0))
        draw2 = ImageDraw.Draw(agent_layer)
        draw2.ellipse((self.agent_pos[1]*w + w/4, self.agent_pos[0]*h + h/4, 3*w/4 + self.agent_pos[1]*w, 3*h/4 + self.agent_pos[0]*h),
                     fill=(255, 0, 255, 128))

        out = Image.alpha_composite(image, agent_layer)
        display(out)

    def update_state(self,new_pos, if_illegal):
        """
        new_pos - a point in the game grid that the agent tried to move to
        if_illegal - True when the attempted move is illegal (the agent then stays in place)
        a function to update the position of agent, number of steps, and sum of rewards,
        """
        if not if_illegal:
            self.agent_pos = new_pos
        self.n_steps += 1 # once the agent moves, the number of steps taken (var n_step) +1
        # use the game's own reward matrix, not a global variable that happens to share the name
        self.sum += self.reward_matrix[self.agent_pos]

    def is_episode_terminal(self):
        """
        This function returns True when the agent stands on the terminal (max) square.
        """
        return True if self.agent_pos == self.terminal_state else False

    def refresh_game(self):
        """
        This function resets the agent position, the sum of rewards and the step counter
        to their values at the start of an episode.
        """
        self.agent_pos = self.start_state
        self.sum = self.reward_matrix[self.start_state]
        self.n_steps = 0

## MaxPath_Game

In [None]:
from PIL import ImageFont

class MaxPath_Game(GridWorld):
    """
    Grid game without a terminal square: an episode ends once the step limit is reached.
    The game keeps track of the number of steps taken and of the sum of the rewards of
    the squares the agent has ended its steps on.
    """

    def __init__(self, board_dims, start_state, reward_matrix):
        """
        board_dims - (height, width) of the board
        start_state - the point the agent starts each episode in
        reward_matrix - matrix of rewards; its argmax is highlighted when drawing
        """
        self.max_state = np.unravel_index(np.argmax(reward_matrix, axis=None), board_dims)
        # no fixed terminal state, this game will terminate when the step limit is reached
        GridWorld.__init__(self, board_dims, start_state, None, reward_matrix)
        self.sum = 0 # initial sum of rewards is 0
        # the number of steps
        self.n_steps = 0
        # the step limit is (board_width - 1)+(board_height - 1)
        self.step_limit = (self.width - 1)+(self.height - 1)

    @staticmethod
    def _load_font(size=40):
        """
        Load the font used to print the reward values, falling back to Pillow's built-in
        font when the Liberation font file cannot be read on this machine.
        """
        # use !fc-list or !fc-list | grep "" to get the path of the font-type on colab
        try:
            return ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf", size)
        except OSError:
            return ImageFont.load_default()

    @staticmethod
    def _text_size(font, text):
        """
        Return (width, height) of text as font.getsize used to report it. font.getsize was
        removed in Pillow 10; the right/bottom edges of getbbox are the same two numbers.
        """
        if hasattr(font, "getbbox"):
            _, _, right, bottom = font.getbbox(text)
            return right, bottom
        return font.getsize(text)

    def draw(self):
        """
        This function draws the board with the reward value printed in every square, the start
        square in blue, the square with the max value in red and the agent as a translucent circle.
        """
        image = Image.new("RGBA", (501, 501), (255, 255, 255, 255)) # white
        draw = ImageDraw.Draw(image)

        w = 500/(self.width)
        h = 500/(self.height)

        font = self._load_font()

        #Draw Grid
        for i in range(self.height):
            for j in range(self.width):
                color = (255, 255, 255, 0) # white, transparent
                if(i == self.start_state[0] and j == self.start_state[1]):
                    color = (0, 0, 255, 255)  # blue for the starting point
                if(i == self.max_state[0] and j == self.max_state[1]):
                    color = (255, 0, 0, 255)  # red for the max (wins over blue if both apply)
                draw.rectangle(((0+j*w, 0+i*h),(w+j*w, h+i*h)), outline = "black", fill = color)
                label = str(self.reward_matrix[(i,j)])
                text_w, text_h = self._text_size(font, label)
                draw.text((j*w + w/2 - text_w/2, i*h + h/2 - text_h/2),
                          label, font=font, fill=(0, 0, 0, 255))

        #Draw Agent on its own transparent layer so the value underneath stays visible
        agent_layer = Image.new('RGBA', (501, 501), (255, 255, 255, 0))
        draw2 = ImageDraw.Draw(agent_layer)
        draw2.ellipse((self.agent_pos[1]*w + w/4, self.agent_pos[0]*h + h/4, 3*w/4 + self.agent_pos[1]*w, 3*h/4 + self.agent_pos[0]*h),
                     fill=(255, 0, 255, 128))

        out = Image.alpha_composite(image, agent_layer)
        display(out)

    def update_state(self, new_pos, if_illegal):
        """
        new_pos - a point in the game grid that the agent tried to move to
        if_illegal - True when the attempted move is illegal (the agent then stays in place)
        a function to update the position of agent, number of steps, and sum of rewards,
        """
        if not if_illegal:
            self.agent_pos = new_pos

        self.n_steps += 1 # once the agent moves, the number of steps taken (var n_step) +1
        # use the game's own reward matrix, not a global variable that happens to share the name
        self.sum += self.reward_matrix[self.agent_pos]

    def is_episode_terminal(self):
        """
        This function returns True once the number of steps taken has reached the step limit.
        """
        return True if self.n_steps >= self.step_limit else False

    def refresh_game(self):
        """
        This function resets the agent position, the sum of rewards and the step counter
        to the values __init__ gives them, so that every episode is scored the same way.
        """
        self.agent_pos = self.start_state
        self.sum = 0 # same initial sum of rewards as in __init__
        self.n_steps = 0

# Players and Agents

Here we define different agents and allow them to play our games that we define above in the previous section

For tabular solving agents we've made Monte Carlo agents, Temporal Difference agents, and Planning agents. Monte Carlo and Temporal Difference agents learn strictly from real experience (playing the game), while the Planning agent learns from both real experience and simulated experience (it builds a model of the game's reward matrix).

In [None]:
class QTable:
    """
    Base q-table shared by the tabular agents' models.
      It holds one row per board square and one column per move, together with the
      learning rate (alpha), exploration rate (epsilon), discount and their decay rates.
    """
    def __init__(self, Game, alpha, epsilon, discount, alpha_decay_rate = 0.6, epsilon_decay_rate=1.0):
        self.Game = Game
        n_states = Game.height*Game.width
        n_actions = len(self.Game.moves)
        self.qtable = np.zeros([n_states, n_actions])
        self.alpha, self.epsilon, self.discount = alpha, epsilon, discount
        self.alpha_decay_rate = alpha_decay_rate
        self.epsilon_decay_rate = epsilon_decay_rate

    def alpha_decay(self):
        """
        This function multiplies alpha by its decay rate
        """
        self.alpha *= self.alpha_decay_rate

    def epsilon_decay(self):
        """
        This function multiplies epsilon by its decay rate
        """
        self.epsilon *= self.epsilon_decay_rate

    def get_return(self, state, action):
        """
        state - integer row index of the qtable (the index of a square in the grid game)
        action - integer column index of the qtable (the index of a move in the grid game)
        This function looks up the qtable entry stored for taking the action in the state
        """
        expected_return = self.qtable[state, action]
        return expected_return
      

## Monte Carlo

Monte Carlo agents accumulate (state, action, reward) tuples for an entire episode before updating their action-value functions. We consider two cases of Monte Carlo RL agents: on-policy and off-policy. On-policy agents have one policy that determines the agent's behavior in the environment and converges to an optimal policy over time. Off-policy agents have two policies: one that determines the agent's behavior in the environment (the behavior policy) and one that corresponds to the agent's optimal policy after sufficient experience (the target policy).

### MCAgent

In [None]:
class MCAgent:
    """
    Monte Carlo agent: plays whole episodes of a grid game and hands the recorded
    episode to its model once the episode is over.
    """
    def __init__(self, Game, model):
        self.Game = Game
        self.model = model
        self.target_policy = None

    def get_target_policy(self):
        """
        This function copies the model's target policy onto the agent
        """
        self.target_policy = self.model.target_policy

    def print_models(self):
        """
        This function asks the model to print itself
        """
        self.model.print_models()

    def play_episode(self):
        """
        This function plays one episode of the grid game, recording a
        (state index, action index, reward) tuple for every time step.
        When the episode is over the model is updated with the recorded story,
        alpha and epsilon are decayed, and the summed reward of the episode is returned.
        """
        game, model = self.Game, self.model
        total_reward = 0
        story = []

        #keep stepping until the game reports a terminal state
        while not game.is_episode_terminal():
            origin = game.agent_pos
            move = model.get_action(game.agent_pos)
            destination = tuple(np.array(game.agent_pos) + np.array(move))

            if game.illegal_move(destination):
                #going off the board is punished with a very bad reward
                reward, off_board = -100, True
            else:
                #legal move: reward r for agent going to state s'
                reward, off_board = game.get_reward(destination), False
            game.update_state(tuple(destination), off_board)

            #store s_t, a_t, r_t+1 as qtable indices
            story.append((game.point_to_index(origin), game.moves.index(move), reward))
            total_reward += reward

        model.update_model(story)
        model.alpha_decay()
        model.epsilon_decay()
        return total_reward

    def play_game(self, episodes, output=False):
        """
        episodes - an integer that corresponds to the number of times your agent plays the game
        output - not used by this function
        This function draws the game, plays the requested number of episodes (printing the
        reward of each one and refreshing the game in between), prints all scores and draws
        the refreshed game again.
        """
        scores = []

        self.Game.draw()

        for episode in range(episodes):
            score = self.play_episode()
            scores.append(score)
            self.Game.refresh_game()

            print("Reward for Episode: ",episode," -> ",score)

        print("Player scores for every episode: ",scores)
        self.Game.refresh_game()
        self.Game.draw()

### On-Policy First Visit QTable Model

In [None]:
class MC_OnPolicy_FirstVisit_QTable(QTable):
    """
    This class represents the qtable with on-policy first visit Monte Carlo update rules.
    The policy (target_policy) starts uniform over the moves and is made epsilon-greedy
    w.r.t. the qtable for every state that gets updated.
    """
    def __init__(self, Game, alpha=0.1, epsilon=0.5, discount=0.5, alpha_decay_rate = 0.9, epsilon_decay_rate=0.9):
        QTable.__init__(self, Game, alpha, epsilon, discount, alpha_decay_rate, epsilon_decay_rate)
        self.target_policy = np.ones_like(self.qtable) / len(self.Game.moves)

    def print_models(self):
        """
        This function prints the qtable and the policy (without scientific notation)
        """
        np.set_printoptions(suppress=True)
        print(self.qtable)
        print(self.target_policy)

    def get_action(self, state):
        """
        state - a tuple point that corresponds to a square in a grid game
        This function samples an action from the game's possible moves according to the
        probabilities the policy stores for the state
        """
        probs = self.target_policy[self.Game.point_to_index(state),:]
        # roulette-wheel sampling: walk through the probabilities until r is used up
        r = random.random()
        for move_idx, prob in enumerate(probs):
            if r > prob:
                r -= prob
            else:
                return self.Game.moves[move_idx]

        # only reached when rounding leaves r slightly above the summed probabilities;
        # `random` is numpy.random in this notebook, so the upper bound is exclusive
        return self.Game.moves[random.randint(0,len(self.Game.moves))]

    def update_model(self,episode_story):
        """
        episode_story - a list of (state, action, reward) tuples, in the order they
          occurred, where state and action are qtable indices
        This function updates the qtable and the policy using on-policy first visit MC methods:
          walking backwards through the episode it accumulates the discounted return G and
          updates a state-action pair only at the time step of its first occurrence.
        """
        n_moves = len(self.Game.moves)

        # Time step at which every (state, action) pair occurs for the first time.
        # The reward is deliberately left out: a visit is identified by the pair alone.
        first_visit = {}
        for step, (state, action, _) in enumerate(episode_story):
            first_visit.setdefault((state, action), step)

        G = 0
        #for each state action reward starting from terminal state time step (T) - 1
        for step in range(len(episode_story) - 1, -1, -1):
            state, action, reward = episode_story[step]
            G = self.discount*G + reward

            if first_visit[(state, action)] != step:
                continue

            self.qtable[state,action] += self.alpha * (G - self.qtable[state,action])

            max_action_idx = np.argmax(self.qtable[state])
            #update e-greedy policy: every action gets the explore probability ...
            self.target_policy[state] = self.epsilon/n_moves
            #... and the optimal action gets exploit probability + explore probability
            self.target_policy[state][max_action_idx] = 1 - self.epsilon + self.epsilon/n_moves
        

### Off-Policy QTable Model

Off-policy methods rely on the idea of importance sampling. When experience and rewards are generated from the environment by some behavior policy b(a|s), the expected return differs between the behavior policy and the target policy. To correct for this, we multiply our return by the ratio of the probabilities of the episode's state-action trajectory occurring under the target policy and under the behavior policy.

In [None]:
class MC_OffPolicy_QTable(QTable):
    """
    This class represents the qtable with off-policy Monte Carlo update rules
    (weighted importance sampling). The target policy is greedy w.r.t. the qtable,
    the behavior policy is epsilon-greedy w.r.t. the qtable, and cumul_weights holds
    the summed importance weights per state-action pair.
    """
    def __init__(self, Game, alpha=0.1, epsilon=0.5, discount=0.5, alpha_decay_rate = 0.9, epsilon_decay_rate=0.9):
        QTable.__init__(self, Game, alpha, epsilon, discount, alpha_decay_rate, epsilon_decay_rate)
        self.target_policy = np.ones_like(self.qtable) / len(self.Game.moves)
        self.behavior_policy = np.ones_like(self.qtable) / len(self.Game.moves)
        self.cumul_weights = np.zeros_like(self.qtable)

    def print_models(self):
        """
        This function prints the qtable, the target policy and the behavior policy
        (without scientific notation)
        """
        np.set_printoptions(suppress=True)
        print(self.qtable)
        print(self.target_policy)
        print(self.behavior_policy)

    def get_action(self, state):
        """
        state - a tuple point that corresponds to a square in a grid game
        This function samples an action from the game's possible moves according to the
        probabilities the behavior policy stores for the state
        """
        probs = self.behavior_policy[self.Game.point_to_index(state),:]
        # roulette-wheel sampling: walk through the probabilities until r is used up
        r = random.random()
        for move_idx, prob in enumerate(probs):
            if r > prob:
                r -= prob
            else:
                return self.Game.moves[move_idx]

        # only reached when rounding leaves r slightly above the summed probabilities;
        # `random` is numpy.random in this notebook, so the upper bound is exclusive
        return self.Game.moves[random.randint(0,len(self.Game.moves))]

    def update_model(self,episode_story):
        """
        episode_story - a list of (state, action, reward) tuples, in the order they
          occurred, where state and action are qtable indices
        This function updates the qtable and both policies using off-policy MC methods and
          weighted importance sampling, walking backwards through the episode.
        """
        n_moves = len(self.Game.moves)
        G = 0
        W = 1

        # The importance weight needs the probabilities the actions were actually sampled
        # with. The behavior policy is rewritten inside the loop below, so keep a copy of it
        # as it is when this update starts.
        episode_behavior = self.behavior_policy.copy()

        #for each (state action reward) tuple starting from terminal state time step (T) - 1
        for state, action, reward in reversed(episode_story):
            G = self.discount*G + reward
            self.cumul_weights[state,action] += W
            self.qtable[state,action] += (W / self.cumul_weights[state,action]) * (G - self.qtable[state,action])

            max_action_idx = np.argmax(self.qtable[state])

            #update policy deterministically (greedily) w.r.t the qtable
            self.target_policy[state] = 0
            self.target_policy[state][max_action_idx] = 1

            #update e-greedy policy: every action gets the explore probability ...
            self.behavior_policy[state] = self.epsilon/n_moves
            #... and the optimal action gets exploit probability + explore probability
            self.behavior_policy[state][max_action_idx] = 1 - self.epsilon + self.epsilon/n_moves

            # The greedy target policy gives a non-greedy action probability 0, so the
            # importance weight of every earlier time step is 0: nothing left to learn
            # from the rest of this episode.
            if action != max_action_idx:
                break

            # target probability of the greedy action is 1, so the ratio is 1 / b(a|s)
            W = W / episode_behavior[state,action]
        

## Temporal Difference

Temporal Difference agents combine ideas from Monte Carlo methods and Dynamic Programming. If we want to update our qtable before the episode ends, we use temporal difference learning. The idea is to look ahead only a finite number of steps and to factor only the next n steps into the return estimate for time step t. The popular Q-Learning method is an example of a one-step temporal difference learning method.



### one-step TDAgent

In [None]:
class TDAgent:
    """
    One-step temporal difference agent: plays a grid game and hands every single
    transition to its model as soon as the step has been made.
    """
    def __init__(self, Game, model):
        self.Game = Game
        self.model = model
        self.target_policy = None

    def get_target_policy(self):
        """
        This function copies the model's target policy onto the agent
        """
        self.target_policy = self.model.target_policy

    def print_models(self):
        """
        This function asks the model to print itself
        """
        self.model.print_models()

    def play_episode(self):
        """
        This function plays one episode of the grid game. After every time step the
        model is updated with (state index, action index, reward, new state index),
        where the new state is the square the agent actually ended up on. When the
        episode is over alpha and epsilon are decayed and the tuple
        (number of time steps, summed reward) is returned.
        """
        game, model = self.Game, self.model
        steps = 0
        total_reward = 0

        #keep stepping until the game reports a terminal state
        while not game.is_episode_terminal():
            steps += 1
            origin = game.agent_pos
            move = model.get_action(game.agent_pos)
            destination = tuple(np.array(game.agent_pos) + np.array(move))

            if game.illegal_move(destination):
                #going off the board is punished with a very bad reward
                reward, off_board = -100, True
            else:
                #legal move: reward r for agent going to state s'
                reward, off_board = game.get_reward(destination), False
            game.update_state(tuple(destination), off_board)

            #convert to qtable indices
            state_idx = game.point_to_index(origin)
            action_idx = game.moves.index(move)
            next_state_idx = game.point_to_index(game.agent_pos)

            #pass s_t, a_t, r_t+1,s_t+1 to update the qtable
            model.update_model((state_idx, action_idx, reward, next_state_idx))

            total_reward += reward

        model.alpha_decay()
        model.epsilon_decay()
        return (steps, total_reward)

    def play_game(self, episodes, output=False):
        """
        episodes - an integer that corresponds to the number of times your agent plays the game
        output - not used by this function
        This function draws the game, plays the requested number of episodes (printing the
        reward and number of time steps of each one and refreshing the game in between),
        prints all scores, draws the refreshed game again and plots the time steps per episode.
        """
        scores = []
        speeds = []

        self.Game.draw()

        for episode in range(episodes):
            t, score = self.play_episode()
            scores.append(score)
            speeds.append(t)
            self.Game.refresh_game()

            print("Reward for Episode: ",episode," -> ",score, "\t time steps: ",t)

        print("Player scores for every episode: ",scores)
        self.Game.refresh_game()
        self.Game.draw()

        plt.plot(np.arange(episodes),speeds)

### n-step TDAgent

In [None]:
class nstepTDAgent:
    """
    This class implements an n-step temporal difference agent that can play grid games

    Game - the grid game being played; this class reads Game.agent_pos and Game.moves and calls
        point_to_index, illegal_move, get_reward, update_state, is_episode_terminal, refresh_game and draw
    model - the n-step qtable model; this class reads model.n, writes model.T and calls
        get_action, update_model, alpha_decay, epsilon_decay and print_models
    """
    def __init__(self, Game, model):
        self.Game = Game
        self.model = model
        self.target_policy = None
        
    def get_target_policy(self):
        """
        This function obtains the target policy derived from the qtable
        """
        self.target_policy = self.model.target_policy

    def print_models(self):
        """
        This function prints the model's tables by delegating to model.print_models
        """
        self.model.print_models()

    def play_episode(self):
        """
        This function plays an episode of the grid game and keeps track of the 
        states visited, actions made, and rewards obtained during each time step
        in their own unique lists. Once n time steps of the episode have been played,
        we update our qtable model every time step with recorded states, actions,
        and rewards from the episode. After the terminal state is reached the loop
        keeps running (without moving) until the last n-1 time steps have been updated too.
        Returns the sum of the rewards received for the moves made in the episode.
        """
        t = 0
        tau = 0
        #T is the terminal time step; it is unknown until the episode ends so start with a large placeholder
        self.model.T = 10000
        episode_reward = 0
        
        episode_rewards = []
        episode_actions = []
        episode_states = []

        #make initial move
        current_state = self.Game.agent_pos
        new_action = self.model.get_action(self.Game.agent_pos)

        state = self.Game.point_to_index(current_state)
        action = self.Game.moves.index(new_action)

        #append action_0,state_0,reward_0 to episode story lists (there is no reward at time step 0)
        episode_actions.append(action)
        episode_rewards.append(None)
        episode_states.append(state)

        #keep going until the time step being updated (tau) is the last non-terminal time step
        while tau != self.model.T - 1:
            
            if t < self.model.T:

                #Take action A, get reward R, step into s'
                new_state = tuple(np.array(self.Game.agent_pos) + np.array(new_action))

                #if move is illegal (going off the board), set reward to very bad
                if self.Game.illegal_move(new_state):
                    reward = -100
                    self.Game.update_state(tuple(new_state), True)
                else:
                #else the selected move is legal and we should get reward r for agent going to state s'
                    reward = self.Game.get_reward(new_state)
                    self.Game.update_state(tuple(new_state), False)

                #update story rewards and states
                current_state = self.Game.agent_pos
                state = self.Game.point_to_index(current_state)
                episode_rewards.append(reward)
                episode_states.append(state)

                #only count a reward when a move was actually made; the iterations after the
                #terminal time step make no move and must not re-add the last reward
                episode_reward += reward
                
                if self.Game.is_episode_terminal():
                    #Obtain terminal time step
                    self.model.T = t + 1
                else:
                    #choose the action for the next time step
                    new_action = self.model.get_action(self.Game.agent_pos)
                    action = self.Game.moves.index(new_action)
                    episode_actions.append(action)

            #update model once n time steps occur so we can do n-step updates
            tau = t - self.model.n + 1
            if tau >= 0:
                self.model.update_model(tau,episode_states,episode_actions,episode_rewards)

            t += 1
        
        self.model.alpha_decay()
        self.model.epsilon_decay()
        return episode_reward
                
    
    def play_game(self, episodes, output=False):
        """
        episodes - an integer that corresponds to the number of times your agent plays the game
        output - accepted for interface compatibility; not read by this method
        This function has your agent play the game and update its model of the game
        """
        player_scores = []
        #number of times player plays the game is episodes.

        self.Game.draw()
        
        for i in range(episodes):
            #in each episode, the player needs to complete the task in T steps (t = 0,1,...,t-2,t-1,T)
            episode_reward = self.play_episode()
            player_scores.append(episode_reward)
            self.Game.refresh_game()
            
            print("Reward for Episode: ",i," -> ",episode_reward)

        print("Player scores for every episode: ",player_scores)
        self.Game.refresh_game()
        self.Game.draw()

### Q-Learning QTable Model

In [None]:
class QLearningQTable(QTable):
    """
    Qtable model that learns with the one-step temporal difference qlearning update and
    keeps an e-greedy policy (target_policy) which is also used to pick actions
    """
    def __init__(self, Game, alpha=0.1, epsilon=0.5, discount=0.5, alpha_decay_rate = 0.9, epsilon_decay_rate=0.9):
        super().__init__(Game, alpha, epsilon, discount, alpha_decay_rate, epsilon_decay_rate)
        #every move starts out equally likely in every state
        self.target_policy = np.ones_like(self.qtable) / len(self.Game.moves)

    def print_models(self):
        """
        This function prints the qtable followed by the policy, without scientific notation
        """
        np.set_printoptions(suppress=True)
        for table in (self.qtable, self.target_policy):
            print(table)
        
    def get_action(self, state):
        """
        state - a tuple point that corresponds to a square in a grid game
        This function samples one of the game's moves according to the policy row of the given state
        """
        move_probs = self.target_policy[self.Game.point_to_index(state),:]
        #walk through the cumulative distribution until the random draw is used up
        remaining = random.random()
        for idx, prob in enumerate(move_probs):
            if remaining > prob:
                remaining -= prob
                continue
            return self.Game.moves[idx]

        #only reached if the probabilities did not cover the draw; fall back to a uniform pick
        return self.Game.moves[random.randint(0,len(self.Game.moves))]
    
    def update_model(self,SARS):
        """
        SARS - a tuple corresponding to state_t, action_t, reward_t, new_state_t+1 for time step t
        This function applies the qlearning update to (state_t, action_t) and then refreshes
        the e-greedy policy row of state_t
        """
        state, action, reward, new_state = SARS
        n_moves = len(self.Game.moves)

        #target is the reward plus the discounted value of the best action in the new state
        td_target = reward + self.discount*np.max(self.qtable[new_state])
        self.qtable[state,action] += self.alpha * (td_target - self.qtable[state,action])

        greedy_idx = np.argmax(self.qtable[state])
        #every action gets the explore probability ...
        self.target_policy[state][:n_moves] = self.epsilon/n_moves
        #... and the best action additionally gets the exploit probability
        self.target_policy[state][greedy_idx] = 1 - self.epsilon + self.epsilon/n_moves
        
#             print("Reward for Time Step: ",len(episode_story)-t," -> ",G, "\t State-Action: ",state,",",action)
        


### n-step Sarsa QTable Model

In [None]:
class nstepSarsaQTable(QTable):
    """
    Qtable model that learns with the n-step temporal difference Sarsa update, weighted by the
    importance sampling ratio between target_policy and behavior_policy
    """
    def __init__(self, Game, n, alpha=0.1, epsilon=0.5, discount=0.5, alpha_decay_rate = 0.9, epsilon_decay_rate=0.9):
        super().__init__(Game, alpha, epsilon, discount, alpha_decay_rate, epsilon_decay_rate)
        n_moves = len(self.Game.moves)
        #both policies start out uniform over the moves
        self.behavior_policy = np.ones_like(self.qtable) / n_moves
        self.target_policy = np.ones_like(self.qtable) / n_moves
        #n is the number of lookahead steps, T the terminal time step (placeholder until known)
        self.n = n
        self.T = 10000

    def print_models(self):
        """
        This function prints the qtable, the behavior policy and the target policy
        """
        np.set_printoptions(suppress=True)
        for table in (self.qtable, self.behavior_policy, self.target_policy):
            print(table)
        
    def get_action(self, state):
        """
        state - a tuple point that corresponds to a square in a grid game
        This function samples one of the game's moves according to the behavior policy row of the state
        """
        move_probs = self.behavior_policy[self.Game.point_to_index(state),:]
        #walk through the cumulative distribution until the random draw is used up
        remaining = random.random()
        for idx, prob in enumerate(move_probs):
            if remaining > prob:
                remaining -= prob
                continue
            return self.Game.moves[idx]

        #only reached if the probabilities did not cover the draw; fall back to a uniform pick
        return self.Game.moves[random.randint(0,len(self.Game.moves))]
    
    def update_model(self,tau,states,actions,rewards):
        """
        tau - an integer corresponding to the time step that we want to update (state_tau, action_tau) qvalue
        states - a list of state qtable indices that represents the agent's state trajectory through an episode
        actions - a list of action qtable indices that represents the agent's action trajectory through an episode
        rewards - a list of rewards obtained by the agent during an episode
        This function updates the qvalue of (state_tau, action_tau) with the n-step Sarsa rule and
        refreshes the e-greedy target policy row of state_tau
        """
        horizon = tau + self.n
        s_tau, a_tau = states[tau], actions[tau]
        n_moves = len(self.Game.moves)

        #importance sampling ratio over time steps tau+1 .. min(tau+n, T-1)
        last_ratio_step = min(horizon, self.T-1)
        rho = np.prod([self.target_policy[states[i],actions[i]] / self.behavior_policy[states[i],actions[i]]
                       for i in range(tau+1, last_ratio_step+1)])

        #discounted sum of the rewards at time steps tau+1 .. min(tau+n, T)
        last_reward_step = min(horizon, self.T)
        G = np.sum([self.discount**(i-tau-1)*rewards[i]
                    for i in range(tau+1, last_reward_step+1)])

        #bootstrap from the qtable when the lookahead stops before the terminal time step
        if horizon < self.T:
            G += self.discount**self.n * self.qtable[states[horizon],actions[horizon]]

        #n-step Sarsa update rule for state_tau, action_tau
        self.qtable[s_tau,a_tau] += self.alpha * rho * (G - self.qtable[s_tau,a_tau])

        greedy_idx = np.argmax(self.qtable[s_tau])
        #every action gets the explore probability ...
        self.target_policy[s_tau][:n_moves] = self.epsilon/n_moves
        #... and the best action additionally gets the exploit probability
        self.target_policy[s_tau][greedy_idx] = 1 - self.epsilon + self.epsilon/n_moves
        
#             print("Reward for Time Step: ",len(episode_story)-t," -> ",G, "\t State-Action: ",state,",",action)
        

### n-step Tree Backup QTable Model

In [None]:
class nstepBackupQTable(QTable):
    """
    This class represents the qtable with n-step temporal difference Tree Backup update rules
    """
    def __init__(self, Game, n, alpha=0.1, epsilon=0.5, discount=0.5, alpha_decay_rate = 0.9, epsilon_decay_rate=0.9):
        QTable.__init__(self, Game, alpha, epsilon, discount, alpha_decay_rate, epsilon_decay_rate)
        self.behavior_policy = np.ones_like(self.qtable) / len(self.Game.moves)
        self.target_policy = np.ones_like(self.qtable) / len(self.Game.moves)
        #n is the number of lookahead steps, T the terminal time step (placeholder until known)
        self.n = n
        self.T = 10000

    def print_models(self):
        """
        This function prints the qtable, the behavior policy and the target policy
        """
        np.set_printoptions(suppress=True)
        print(self.qtable)
        print(self.behavior_policy)
        print(self.target_policy)
        
    def get_action(self, state):
        """
        state - a tuple point that corresponds to a square in a grid game
        This function obtains an action from the games possible moves based on the agent's behavior policy
        """
        probs = self.behavior_policy[self.Game.point_to_index(state),:]
        r = random.random()
        for move_idx, prob in enumerate(probs):
            if r > prob:
                r -= prob
            else:
                return self.Game.moves[move_idx]

        #only reached if the probabilities did not cover the draw; fall back to a uniform pick
        return self.Game.moves[random.randint(0,len(self.Game.moves))]
    
    def update_model(self,tau,states,actions,rewards):
        """
        tau - an integer corresponding to the time step that we want to update (state_tau, action_tau) qvalue
        states - a list of state qtable indices that represents the agent's state trajectory through an episode
        actions - a list of action qtable indices that represents the agent's action trajectory through an episode
        rewards - a list of rewards obtained by the agent during an episode
        This function updates the qvalue of (state_tau, action_tau) using the n-step Tree Backup update
        rule and refreshes the e-greedy target policy row of state_tau
        """
        n_moves = len(self.Game.moves)
        #t is the most recent time step covered by this update (tau + n - 1)
        t = tau + self.n - 1

        #if the lookahead reaches the terminal time step, start from the terminal reward
        if t + 1 >= self.T:
            G = rewards[self.T]
        else:
            #else start from the expected sarsa return at time step t+1
            G = rewards[t+1] + self.discount * \
            np.sum([self.target_policy[states[t+1],a]*self.qtable[states[t+1],a] for a in range(0,n_moves)])
        
        #back the return up through every time step k = min(t, T-1), ..., tau+1 (both ends included):
        #actions not taken contribute their qvalue, the action taken contributes the return so far
        for k in range(min(t, self.T-1), tau, -1):
            G = rewards[k] + self.discount * \
            np.sum([self.target_policy[states[k],a]*self.qtable[states[k],a] for a in range(0,n_moves) if a != actions[k]]) \
            + self.discount * self.target_policy[states[k],actions[k]] * G
        
        #n-step tree backup update
        self.qtable[states[tau],actions[tau]] += self.alpha * (G - self.qtable[states[tau],actions[tau]])

        max_action_idx = np.argmax(self.qtable[states[tau]])
        #update e-greedy policy
        for a in range(0,n_moves):
            #update sub-optimal actions in state to be chosen with explore probability
            self.target_policy[states[tau]][a] = self.epsilon/n_moves
        #update optimal action in state to be chosen with exploit probability + explore probability
        self.target_policy[states[tau]][max_action_idx] = 1 - self.epsilon + self.epsilon/n_moves
        
#             print("Reward for Time Step: ",len(episode_story)-t," -> ",G, "\t State-Action: ",state,",",action)
        

## DynaQ

DynaQ is a planning agent that updates its qtable from both real experience and simulated experience drawn from a learned model of the environment. This makes the agent more powerful, because its qtable updates are not limited to the actions it actually took while collecting real experience.

### DynaQAgent

In [None]:
class DynaQAgent:
    """
    Agent that plays grid games while handing every real transition to a planning model
    (integrated planning, acting, and learning)
    """
    def __init__(self, Game, model):
        self.Game = Game
        self.model = model
        self.target_policy = None
        
    def get_target_policy(self):
        """
        This function copies the model's target policy onto the agent
        """
        self.target_policy = self.model.target_policy

    def print_models(self):
        """
        This function prints the model's tables by delegating to model.print_models
        """
        self.model.print_models()

    def play_episode(self):
        """
        This function plays one episode of the grid game. After every move the model is
        updated with (state, action, reward, new state), all converted to qtable indices.
        When the episode is over the model's alpha and epsilon are decayed.
        Returns the sum of the rewards received during the episode.
        """
        game, model = self.Game, self.model
        total_reward = 0

        while not game.is_episode_terminal():
            origin = game.agent_pos
            #pick a move (up, left, down or right) and work out where it leads
            move = model.get_action(game.agent_pos)
            target = tuple(np.array(game.agent_pos) + np.array(move))

            if game.illegal_move(target):
                #illegal move (going off the board) is punished heavily
                step_reward = -100
                game.update_state(target, True)
            else:
                #legal move earns the reward of the square the agent steps into
                step_reward = game.get_reward(target)
                game.update_state(target, False)

            #s_t, a_t, r_t+1, s_t+1 as qtable indices
            transition = (
                game.point_to_index(origin),
                game.moves.index(move),
                step_reward,
                game.point_to_index(game.agent_pos),
            )
            model.update_model(transition)

            total_reward += step_reward

        model.alpha_decay()
        model.epsilon_decay()
        return total_reward
                
    
    def play_game(self, episodes, output=False):
        """
        episodes - an integer that corresponds to the number of times your agent plays the game
        output - accepted for interface compatibility; not read by this method
        This function plays the requested number of episodes and prints the reward of each one
        """
        scores = []

        self.Game.draw()

        for episode_idx in range(episodes):
            total_reward = self.play_episode()
            scores.append(total_reward)
            #put the board back into its starting configuration for the next episode
            self.Game.refresh_game()

            print("Reward for Episode: ",episode_idx," -> ",total_reward)

        print("Player scores for every episode: ",scores)
        self.Game.refresh_game()
        self.Game.draw()

### DynaQTable

In [None]:
class DynaQTable(QTable):
    """
    This class represents the qtable with DynaQ update rules: every real transition is used for a
    qlearning update, stored in an environment model, and followed by n simulated (planning)
    qlearning updates replayed from that model
    """
    def __init__(self, Game, n, alpha=0.1, epsilon=0.5, discount=0.5, alpha_decay_rate = 0.9, epsilon_decay_rate=0.9):
        QTable.__init__(self, Game, alpha, epsilon, discount, alpha_decay_rate, epsilon_decay_rate)
        self.target_policy = np.ones_like(self.qtable) / len(self.Game.moves)
        #model(s,a) = (r,s'); the reward is stored as a float so non-integer rewards are not truncated
        self.environment_model = np.zeros(self.qtable.shape,dtype = 'f8,i8')
        #n is the number of planning updates performed after every real update
        self.n = n
        #state action pairs that have been tried for real, i.e. the pairs the model knows about
        self.observed = []

    def print_models(self):
        """
        This function prints the qtable followed by the policy, without scientific notation
        """
        np.set_printoptions(suppress=True)
        print(self.qtable)
        print(self.target_policy)
        
    def get_action(self, state):
        """
        state - a tuple point that corresponds to a square in a grid game
        This function obtains an action from the games possible moves based on the agent's e-greedy policy
        """
        probs = self.target_policy[self.Game.point_to_index(state),:]
        r = random.random()
        for move_idx, prob in enumerate(probs):
            if r > prob:
                r -= prob
            else:
                return self.Game.moves[move_idx]

        #only reached if the probabilities did not cover the draw; fall back to a uniform pick
        return self.Game.moves[random.randint(0,len(self.Game.moves))]

    def _qlearning_update(self, state, action, reward, new_state):
        """
        This helper applies one qlearning update to the qvalue of (state, action)
        """
        maxQ_new_state = np.max(self.qtable[new_state])
        self.qtable[state,action] += self.alpha * (reward + self.discount*maxQ_new_state - self.qtable[state,action])

    def _update_policy(self, state):
        """
        This helper makes the policy row of state e-greedy with respect to the current qtable
        """
        n_moves = len(self.Game.moves)
        max_action_idx = np.argmax(self.qtable[state])
        for a in range(0,n_moves):
            #sub-optimal actions in state are chosen with explore probability
            self.target_policy[state][a] = self.epsilon/n_moves
        #optimal action in state is chosen with exploit probability + explore probability
        self.target_policy[state][max_action_idx] = 1 - self.epsilon + self.epsilon/n_moves
    
    def update_model(self,SARS):
        """
        SARS - a tuple corresponding to state_t, action_t, reward_t, new_state_t+1 for time step t
        This function updates the qtable with the real transition (qlearning), records the transition
        in the environment model, and then performs n planning updates on transitions sampled from
        the model. The e-greedy policy is refreshed for every state whose qvalues were changed.
        """

        state, action, reward, new_state = SARS

        #learn from the real experience and remember it in the model
        self._qlearning_update(state, action, reward, new_state)
        self.environment_model[state,action] = (reward,new_state)

        if tuple((state,action)) not in self.observed:
            self.observed.append(tuple((state,action)))

        #the policy of the state that was actually visited must follow its new qvalues
        self._update_policy(state)

        #generate simulated experience from your model of the environment to update your qtable
        for _ in range(0,self.n):
            #select random state action pair already observed; separate names are used so that the
            #real transition above is not overwritten (`random` is numpy.random, upper bound exclusive)
            sim_state,sim_action = self.observed[random.randint(0,len(self.observed))]
            #get model reward and next state from state action pair sampled
            sim_reward,sim_new_state = self.environment_model[sim_state,sim_action]

            #update qtable with dynaQ update rules (qlearning)
            self._qlearning_update(sim_state, sim_action, sim_reward, sim_new_state)
            #keep the policy of the planned state in step with its new qvalues
            self._update_policy(sim_state)
        
#             print("Reward for Time Step: ",len(episode_story)-t," -> ",G, "\t State-Action: ",state,",",action)
        

# Testing

This is where we make our game examples and test all the code from above

## Make New Grid

In [None]:
def make_maxgame_grid(board_size):
    """
    board_size - a tuple (rows, columns) describing the board
    This function builds a reward matrix holding the values 0 .. rows*columns-1 in random order
    and uses the square that received the value 0 as the start state.
    Returns board_size, start_state (row, column), reward_matrix
    """
    rows, cols = board_size
    cell_values = list(np.arange(0,cols*rows))
    #random permutation of all cell values, read row by row
    shuffled = sample(cell_values, len(cell_values))

    #the agent starts on the square holding the reward 0
    start_state = divmod(shuffled.index(0), cols)
    reward_matrix = np.reshape(shuffled, (rows,cols))

    return board_size, start_state, reward_matrix

## Target Game Example

### MC On-Policy Testing

In [None]:
#3x3 targeting game: every step costs 1, the target square (2,2) costs nothing
reward_matrix = np.full((3, 3), -1.0)
reward_matrix[2, 2] = 0
print(reward_matrix)
target_game = Targeting_Game((3, 3), (0, 0), (2, 2), reward_matrix)

on_policy_qtable = MC_OnPolicy_FirstVisit_QTable(
    target_game,
    alpha=0.2, epsilon=0.5, discount=0.5,
    alpha_decay_rate=0.9, epsilon_decay_rate=0.9,
)

agent = MCAgent(target_game, on_policy_qtable)
agent.play_game(1000)
agent.print_models()

### MC Off-Policy Testing

In [None]:
#3x3 targeting game: every step costs 1, the target square (2,2) costs nothing
reward_matrix = np.full((3, 3), -1.0)
reward_matrix[2, 2] = 0
print(reward_matrix)
target_game = Targeting_Game((3, 3), (0, 0), (2, 2), reward_matrix)

off_policy_qtable = MC_OffPolicy_QTable(
    target_game,
    alpha=0.1, epsilon=0.4, discount=0.2,
    alpha_decay_rate=0.9, epsilon_decay_rate=1,
)

agent = MCAgent(target_game, off_policy_qtable)
agent.play_game(1000)
agent.print_models()

### TD Q-Learning Testing

In [None]:
#5x5 targeting game: every step costs 1, the target square (4,4) costs nothing
reward_matrix = np.full((5, 5), -1.0)
reward_matrix[4, 4] = 0
print(reward_matrix)
target_game = Targeting_Game((5, 5), (0, 0), (4, 4), reward_matrix)

qlearning_qtable = QLearningQTable(
    target_game,
    alpha=0.2, epsilon=0.8, discount=0.95,
    alpha_decay_rate=0.9, epsilon_decay_rate=0.99,
)

agent = TDAgent(target_game, qlearning_qtable)
agent.play_game(1000)
agent.print_models()

### n-step Sarsa Testing

In [None]:
#5x5 targeting game: every step costs 1, the target square (3,3) costs nothing
reward_matrix = np.full((5, 5), -1.0)
reward_matrix[3, 3] = 0
print(reward_matrix)
target_game = Targeting_Game((5, 5), (0, 0), (3, 3), reward_matrix)

nstep_sarsa_qtable = nstepSarsaQTable(
    target_game, n=3,
    alpha=0.2, epsilon=0.8, discount=0.2,
    alpha_decay_rate=0.99, epsilon_decay_rate=0.99,
)

agent = nstepTDAgent(target_game, nstep_sarsa_qtable)
agent.play_game(1000)
agent.print_models()

### n-step Tree Backup Testing

In [None]:
#5x5 targeting game: every step costs 1, the target square (3,3) costs nothing
reward_matrix = np.full((5, 5), -1.0)
reward_matrix[3, 3] = 0
print(reward_matrix)
target_game = Targeting_Game((5, 5), (0, 0), (3, 3), reward_matrix)

nstep_backup_qtable = nstepBackupQTable(
    target_game, n=1,
    alpha=0.2, epsilon=0.8, discount=0.2,
    alpha_decay_rate=0.99, epsilon_decay_rate=0.99,
)

agent = nstepTDAgent(target_game, nstep_backup_qtable)
agent.play_game(1000)
agent.print_models()

### DynaQ Testing

In [None]:
#5x5 targeting game: every step costs 1, the target square (3,3) costs nothing
reward_matrix = np.full((5, 5), -1.0)
reward_matrix[3, 3] = 0
print(reward_matrix)
target_game = Targeting_Game((5, 5), (0, 0), (3, 3), reward_matrix)

#n is the number of planning updates performed after every real move
dynaq_qtable = DynaQTable(
    target_game, n=100,
    alpha=0.2, epsilon=0.5, discount=0.2,
    alpha_decay_rate=0.9, epsilon_decay_rate=0.9,
)

agent = DynaQAgent(target_game, dynaq_qtable)
agent.play_game(1000)
agent.print_models()

## Collection Game Example

In [None]:
#5x5 collection game: every step costs 1, the two squares to collect cost nothing
reward_matrix = np.full((5, 5), -1.0)
for collect_square in [(2, 2), (4, 4)]:
    reward_matrix[collect_square] = 0

collection_game = Collection_Game((5, 5), (0, 0), [(2, 2), (4, 4)], reward_matrix)

qlearning_qtable = QLearningQTable(
    collection_game,
    alpha=0.2, epsilon=0.8, discount=0.95,
    alpha_decay_rate=0.9, epsilon_decay_rate=0.99,
)

agent = TDAgent(collection_game, qlearning_qtable)
agent.play_game(1000)
agent.print_models()


## FindMax Game Example

In [None]:
#build a random 4x4 FindMax board; board_size, start_state and reward_matrix are reused by the FindMax cells below
board_size, start_state, reward_matrix = make_maxgame_grid((4,4))

In [None]:
max_game = FindMax_Game(board_size, start_state, reward_matrix)

#this cell previously called the old DynaQ interface (positional alpha/discount on the qtable,
#epsilon and planning steps on the agent); it now uses the current DynaQTable/DynaQAgent
#signatures with the same alpha, epsilon, discount and a single planning step per move
qtable = DynaQTable(max_game, n = 1, \
                    alpha=0.2, epsilon=0.5, discount=0.5)
agent = DynaQAgent(max_game, qtable)
agent.play_game(200)

agent.get_target_policy()
#print_models prints the qtable and the policy with scientific notation suppressed
agent.print_models()

In [None]:
#FindMax game on the board generated by make_maxgame_grid, played by a qlearning agent
max_game = FindMax_Game(board_size, start_state, reward_matrix)

qlearning_qtable = QLearningQTable(
    max_game,
    alpha=0.2, epsilon=0.8, discount=0.95,
    alpha_decay_rate=0.9, epsilon_decay_rate=0.99,
)

agent = TDAgent(max_game, qlearning_qtable)
agent.play_game(1000)
agent.print_models()

In [None]:
#5x5 board whose rewards count up from 1 to 25 row by row
reward_matrix = np.arange(1, 26).reshape(5, 5)

max_game = FindMax_Game((5, 5), (2, 0), reward_matrix)

qlearning_qtable = QLearningQTable(
    max_game,
    alpha=0.2, epsilon=0.8, discount=0.95,
    alpha_decay_rate=0.9, epsilon_decay_rate=0.99,
)

agent = TDAgent(max_game, qlearning_qtable)
agent.play_game(1000)
agent.print_models()

## MaxPath Game Example

In [None]:
#5x5 board whose rewards count up from 1 to 25 row by row
reward_matrix = np.arange(1, 26).reshape(5, 5)

max_game = MaxPath_Game((5, 5), (0, 0), reward_matrix)

qlearning_qtable = QLearningQTable(
    max_game,
    alpha=0.2, epsilon=0.8, discount=0.95,
    alpha_decay_rate=0.9, epsilon_decay_rate=0.99,
)

agent = TDAgent(max_game, qlearning_qtable)
agent.play_game(1000)
agent.print_models()

# Old Code

In [None]:
class DynaQAgentOLD:
    """
    Older DynaQ-style agent kept under "Old Code": it plays grid games, updates a qtable from the
    moves it really makes, and additionally performs planning updates from a table of remembered rewards.

    Game - the grid game being played
    QTable - a qtable object exposing matrix, update_qtable, get_target_policy and target_policy
    epsilon - exploration probability used when rebuilding the e-greedy behavior policy
    policy_update_steps - the behavior policy is rebuilt once every this many episodes
    n_planning_steps - number of planning updates done each time the behavior policy is rebuilt
    epsilon_decay_rate - factor epsilon is multiplied by after each behavior policy rebuild
    """
    def __init__(self, Game, QTable, epsilon, policy_update_steps=50, n_planning_steps = 50, epsilon_decay_rate=1.0):
        self.Game = Game
        self.QTable = QTable
        #planning_model[state, action] holds the last reward seen for that state action pair
        self.planning_model = np.zeros_like(self.QTable.matrix)
        #the behavior policy starts out uniform over the moves
        self.behavior_policy = np.ones_like(self.QTable.matrix) / len(self.Game.moves)
        self.target_policy = None
        self.epsilon = epsilon
        self.policy_update_steps = policy_update_steps
        self.n_planning_steps = n_planning_steps
        self.epsilon_decay_rate = epsilon_decay_rate # \in [0.5, 1]
        
    def get_target_policy(self):
        """
        This function obtains the deterministic (greedy) policy from the qtable
        """
        self.QTable.get_target_policy()
        self.target_policy = self.QTable.target_policy
    
    def update_models(self, state, action, reward, new_state):
        """
        state - a tuple point corresponding to a square in a grid game
        action - a tuple move corresponding to an action made in a grid game
        reward - the reward derived from the game's reward matrix for specific action taken in specific state
        new_state - a tuple point corresponding to the square in the grid game that your agent moved to 
            from taking the action described above in the state described above
        This function updates the qtable by calling update_qtable and passing the necessary information about the game environment,
        and records the reward in the planning model
        """
        new_state_idx = self.Game.point_to_index(new_state)
        state_idx = self.Game.point_to_index(state)
        action_idx = self.Game.moves.index(action)
        self.QTable.update_qtable(state_idx, action_idx, reward, new_state_idx)
        self.planning_model[state_idx, action_idx] = reward
    
    def update_planning(self):
        """
        This function performs n_planning_steps qtable updates on randomly chosen state action
        pairs, using the reward remembered in the planning model for each pair
        """
        for n in range(self.n_planning_steps):
            #`random` is numpy.random (see the file's imports), so the upper bounds are exclusive
            random_state_idx = random.randint(0, self.Game.width*self.Game.height)
            random_action_idx = random.randint(0,len(self.Game.moves))
            reward = self.planning_model[random_state_idx, random_action_idx]
            #NOTE(review): the successor index is computed from the agent's current position, not from
            #random_state_idx, and is not checked against the board edges - confirm whether this is intended
            new_state_idx = self.Game.point_to_index(tuple(np.array(self.Game.agent_pos) + np.array(self.Game.moves[random_action_idx])))
            self.QTable.update_qtable(random_state_idx, random_action_idx, reward, new_state_idx)

    def get_action(self, state):
        """
        state - a tuple point that corresponds to a square in a grid game
        This function obtains an action from the games possible moves based on the agent's behavior policy
        """
        probs = self.behavior_policy[self.Game.point_to_index(state),:]
        #walk through the cumulative distribution until the random draw is used up
        r = random.random()
        for move_idx, prob in enumerate(probs):
            if r > prob:
                r -= prob
            else:
                return self.Game.moves[move_idx]

        #only reached if the probabilities did not cover the draw; fall back to a uniform pick
        return self.Game.moves[random.randint(0,len(self.Game.moves))]


    def epsilon_decay (self):
      """
      This function multiplies epsilon by epsilon_decay_rate
      """
      self.epsilon = self.epsilon*self.epsilon_decay_rate


    def update_behavior_policy(self):
        """
        This function rebuilds the behavior policy of every state so that it is e-greedy with
        respect to the current qtable (the qtable must be updated first). Afterwards it runs the
        planning updates, decays epsilon and prints the policies and the qtable.
        """
        for state_idx in range(len(self.behavior_policy)):
            max_action_idx = np.argmax(self.QTable.matrix[state_idx])
        
            #every action gets the explore probability, the best action additionally the exploit probability
            for x in range(0,len(self.Game.moves)):
                self.behavior_policy[state_idx][x] = self.epsilon/len(self.Game.moves)
            self.behavior_policy[state_idx][max_action_idx] = 1-self.epsilon + self.epsilon/len(self.Game.moves)
        
        self.update_planning()
        self.epsilon_decay()
        self.print_policies()
        print(self.QTable.matrix)

        
    def update_terminal_state_returns(self):
        """
        This function is called when an episode has ended. It repeatedly puts the agent back on
        the square the episode ended on, samples a move from there and updates the models with
        the outcome, and finally resets QTable.incidence to zeros.
        """
        #learn max lookahead for actions in the terminal state
        terminal_state = self.Game.agent_pos
        for iters in range(5*len(self.Game.moves)):
            #undo the previous sampled move so every sample starts from the terminal square
            self.Game.agent_pos = terminal_state

            #get possible moves going up, left, down, and right
            new_action = self.get_action(self.Game.agent_pos)
            new_state = tuple(np.array(self.Game.agent_pos) + np.array(new_action))

            #if move is illegal (going off the board), set reward to very bad
            if self.Game.illegal_move(new_state):
                reward = -10000
                #an illegal move is recorded as leading back to the square the agent is on
                self.update_models(self.Game.agent_pos, new_action, reward, self.Game.agent_pos)
                self.Game.update_state(tuple(new_state), True)
            else:
            #else the selected move is legal and we should get reward r for agent going to state s'
                reward = self.Game.get_reward(new_state)
                self.update_models(self.Game.agent_pos, new_action, reward, new_state)
                self.Game.update_state(tuple(new_state), False)

        self.QTable.incidence = np.zeros_like(self.QTable.matrix)

    def print_policies(self):
        """
        This function prints the behavior policy, the target policy and the planning model
        """
        print("Behavior Policy")
        print(self.behavior_policy)
        print("Target Policy")
        print(self.target_policy)
        print("Planning Model")
        print(self.planning_model)

    def play_game(self, episodes, output=False):
        """
        episodes - an integer that corresponds to the number of times your agent plays the game
        output - when True, the agent's position is printed before every move
        This function has your agent play the game and update its model of the game
        """
        player_scores = []
        #number of times player plays the game is episodes.

        self.Game.draw()
        
        for i in range(episodes):
            #in each episode, the player needs to complete the task in T steps (t = 0,1,...,t-2,t-1,T)

            t = 0
            episode_reward = 0
            
            #rebuild the behavior policy on the last episode of every block of policy_update_steps episodes
            if (i % self.policy_update_steps) + 1 == self.policy_update_steps:
                print("update at step: ",i)
                self.update_behavior_policy()

            #while agent is not in a terminal state
            while not self.Game.is_episode_terminal():
                #Take action A, get reward R, step into s'
                #Find a that is max(s',a)

                if output:
                    print("Player's position",self.Game.agent_pos)

                t += 1

                #get possible moves going up, left, down, and right
                new_action = self.get_action(self.Game.agent_pos)
                new_state = tuple(np.array(self.Game.agent_pos) + np.array(new_action))

                #if move is illegal (going off the board), set reward to very bad
                if self.Game.illegal_move(new_state):
                    reward = -10000
                    #an illegal move is recorded as leading back to the square the agent is on
                    self.update_models(self.Game.agent_pos, new_action, reward, self.Game.agent_pos)
                    self.Game.update_state(tuple(new_state), True)
                else:
                #else the selected move is legal and we should get reward r for agent going to state s'
                    reward = self.Game.get_reward(new_state)
                    self.update_models(self.Game.agent_pos, new_action, reward, new_state)
                    self.Game.update_state(tuple(new_state), False)

                episode_reward += reward

                
#                 print("Player's possible moves: ",self.Game.get_moves(self.Game.agent_pos))
#                 print("Player's chosen move",new_state)
#                 print("Player's new position",self.Game.agent_pos)
#                 print("Player's reward for this move",reward)
#                 print("Number of moves made to perform task: ",t)

                # if self.Game.terminal_state is not None and self.Game.agent_pos in self.Game.terminal_state:
                #     print("Player found the target square: ",self.Game.terminal_state)

#                 self.Game.draw()
                
            #the episode is over: learn the values of the moves available from the final square
            self.update_terminal_state_returns()

            player_scores.append(episode_reward)

            print("Player finished task in :",t, " moves")
            self.Game.refresh_game()

        print("Player scores for every episode: ",player_scores)
        self.Game.refresh_game()
        self.Game.draw()

In [None]:
class DynaQTableOLD:
    """
    Tabular action-value store updated with the Q-learning rule.

    The table has one row per board square (height*width rows) and one column
    per move in Game.moves.
    """
    def __init__(self, Game, alpha, discount, alpha_decay_rate = 0.6):
        n_states = Game.height*Game.width
        n_actions = len(Game.moves)
        self.Game = Game
        self.alpha = alpha
        self.alpha_decay_rate = alpha_decay_rate
        self.discount = discount
        self.matrix = np.zeros([n_states, n_actions])
        # start from a uniform policy over the available moves
        self.target_policy = np.ones_like(self.matrix) / n_actions

    def alpha_decay(self):
        """Shrink the learning rate by multiplying it with alpha_decay_rate."""
        self.alpha = self.alpha*self.alpha_decay_rate

    def get_target_policy(self):
        """
        Rewrite self.target_policy in place as the greedy policy of the qtable:
        probability 1 on the highest-valued action of every state, 0 elsewhere.
        """
        n_actions = len(self.Game.moves)
        greedy_actions = np.argmax(self.matrix, axis=1)
        self.target_policy[:, :n_actions] = 0
        self.target_policy[np.arange(len(greedy_actions)), greedy_actions] = 1

    def get_return(self, state, action):
        """
        state - integer row index of a square in the grid game
        action - integer column index of a move in the grid game
        Returns the value currently stored in the qtable for (state, action).
        """
        return self.matrix[state, action]

    def max_lookahead(self, state):
        """
        state - integer index of s', the square the agent lands on
        Returns the stored value of the best action available from s'.
        """
        best_action = np.argmax(self.matrix[state, :])
        return self.get_return(state, best_action)

    def update_qtable(self, state, action, reward, new_state):
        """
        state - integer index of the square the action was taken from
        action - integer index (0, 1, 2, 3) of the move that was taken
        reward - reward received for taking action in state
        new_state - integer index of the square reached by that move
        Moves the stored value of (state, action) towards
        reward + discount * max_lookahead(new_state) by a step of size alpha.
        """
        old_value = self.matrix[state, action]
        td_target = reward + self.discount * self.max_lookahead(new_state)
        self.matrix[state, action] = old_value + self.alpha * (td_target - old_value)
      

In [None]:
class QLearningTable:
    """
    This class implements the qtable object with qlearning update rules.

    On top of the plain table it keeps an `incidence` matrix that marks which
    (state, action) pairs have already been updated. A marked pair is skipped
    by update_qtable until `incidence` is cleared again (QLearningAgent does
    this at the end of update_terminal_state_returns).
    """
    def __init__(self, Game, alpha, discount, alpha_decay_rate = 0.6):
        self.Game = Game
        self.matrix = np.zeros([Game.height*Game.width, len(Game.moves)])
        self.alpha = alpha
        self.discount = discount
        # start from a uniform policy over the available moves
        self.target_policy = np.ones_like(self.matrix) / len(Game.moves)
        # 1 where (state, action) was already updated since the last reset
        self.incidence = np.zeros_like(self.matrix)
        self.alpha_decay_rate = alpha_decay_rate

    def alpha_decay(self):
        """Shrink the learning rate by multiplying it with alpha_decay_rate."""
        self.alpha = self.alpha*self.alpha_decay_rate

    def get_target_policy(self):
        """
        This function gets the deterministic (greedy) policy derived from the qtable.
        self.target_policy is rewritten in place: 1 on the best action of each state, 0 elsewhere.
        """
        n_actions = len(self.Game.moves)
        greedy_actions = np.argmax(self.matrix, axis=1)
        self.target_policy[:, :n_actions] = 0
        self.target_policy[np.arange(len(greedy_actions)), greedy_actions] = 1

    def get_return(self, state, action):
        """
        state - some integer index corresponding to a tuple point in the grid game
        action - some integer index corresponding to a possible action taken in the grid game
        This function returns the expected return from the qtable for a specific action made in a specific state
        """
        return self.matrix[state, action]

    def max_lookahead(self, state):
        """
        state - this is s', the index of the square our agent is going to be in after taking action a in state s
        This function returns the value of the best action available from s' (state)
        """
        aprime = np.argmax(self.matrix[state, :])
        return self.get_return(state, aprime)

    def update_qtable(self, state, action, reward, new_state):
        """
        state - index of the square the agent performed the action from
        action - integer (0, 1, 2, 3) index of the move (up, left, down, right) the agent made in that state
        reward - the reward for the above (state, action) pair
        new_state - index of the square the agent reached by making that move
        Applies one qlearning update to (state, action), unless that pair was
        already updated since `incidence` was last cleared.
        """
        if self.incidence[state, action] != 0:
            return
        current_value = self.matrix[state, action]
        td_target = reward + self.discount * self.max_lookahead(new_state)
        self.matrix[state, action] = current_value + self.alpha * (td_target - current_value)
        # Assignment, not comparison: record the visit so the guard above takes effect.
        self.incidence[state, action] = 1
      

In [None]:
class QLearningAgent:
    """
    This class implements a temporal difference off-policy (qlearning) agent that can play grid games.

    The agent picks moves from `behavior_policy` (one probability row per square)
    and learns action values in `QTable`. Every `policy_update_steps` episodes the
    behavior policy is rebuilt epsilon-greedily from the qtable.
    """
    def __init__(self, Game, QTable, epsilon, policy_update_steps=50, epsilon_decay_rate=1.0):
        self.Game = Game
        self.QTable = QTable
        # start from a uniform behavior policy over the available moves
        self.behavior_policy = np.ones_like(self.QTable.matrix) / len(self.Game.moves)
        self.target_policy = None
        self.epsilon = epsilon
        self.policy_update_steps = policy_update_steps
        self.epsilon_decay_rate = epsilon_decay_rate # \in [0.5, 1]

    def get_target_policy(self):
        """
        This function obtains the deterministic (greedy) policy from the qtable
        and stores it in self.target_policy (nothing is returned).
        """
        self.QTable.get_target_policy()
        self.target_policy = self.QTable.target_policy

    def update_qtable(self, state, action, reward, new_state):
        """
        state - a tuple point corresponding to a square in a grid game
        action - a tuple move corresponding to an action made in a grid game
        reward - the reward derived from the game's reward matrix for specific action taken in specific state
        new_state - a tuple point corresponding to the square in the grid game that your agent moved to
            from taking the action described above in the state described above
        This function converts the points/move to table indices and forwards them to QTable.update_qtable
        """
        new_state_idx = self.Game.point_to_index(new_state)
        state_idx = self.Game.point_to_index(state)
        action_idx = self.Game.moves.index(action)
        self.QTable.update_qtable(state_idx, action_idx, reward, new_state_idx)

    def get_action(self, state):
        """
        state - a tuple point that corresponds to a square in a grid game
        This function samples a move from the game's possible moves according to the
        agent's behavior policy row for that square.
        """
        probs = self.behavior_policy[self.Game.point_to_index(state), :]
        r = np.random.random()
        # Walk the cumulative distribution until the random draw is covered.
        for move_idx, prob in enumerate(probs):
            if r > prob:
                r -= prob
            else:
                return self.Game.moves[move_idx]

        # Only reached when the row sums to less than the draw (e.g. float rounding).
        # Fall back to a uniformly random move; numpy's randint excludes the upper bound.
        return self.Game.moves[np.random.randint(0, len(self.Game.moves))]

    def epsilon_decay(self):
        """Shrink epsilon by multiplying it with epsilon_decay_rate."""
        self.epsilon = self.epsilon*self.epsilon_decay_rate

    def update_behavior_policy(self):
        """
        Rebuilds the behavior policy epsilon-greedily from the current qtable (the qtable must be
        updated first), then decays epsilon and alpha and prints the policies and the qtable.
        """
        n_moves = len(self.Game.moves)
        explore_prob = self.epsilon/n_moves
        for state_idx in range(len(self.behavior_policy)):
            max_action_idx = np.argmax(self.QTable.matrix[state_idx])
            self.behavior_policy[state_idx][:n_moves] = explore_prob
            self.behavior_policy[state_idx][max_action_idx] = 1-self.epsilon + explore_prob
        self.epsilon_decay()
        self.QTable.alpha_decay()
        self.print_policies()
        print(self.QTable.matrix)

    def _take_step(self):
        """
        Samples one move from the agent's current square, updates the qtable for it,
        lets the game apply it and returns the reward of the move.
        """
        position = self.Game.agent_pos
        new_action = self.get_action(position)
        new_state = tuple(np.array(position) + np.array(new_action))

        if self.Game.illegal_move(new_state):
            #move is illegal (going off the board): very bad reward, agent stays where it is
            reward = -10000
            self.update_qtable(position, new_action, reward, position)
            self.Game.update_state(new_state, True)
        else:
            #move is legal: reward r for agent going to state s'
            reward = self.Game.get_reward(new_state)
            self.update_qtable(position, new_action, reward, new_state)
            self.Game.update_state(new_state, False)
        return reward

    def update_terminal_state_returns(self):
        """
        Learns values for the actions available in the square the episode ended on:
        repeatedly puts the agent back on that square and takes one step from it,
        then clears the qtable's incidence matrix.
        """
        terminal_state = self.Game.agent_pos
        for _ in range(5*len(self.Game.moves)):
            self.Game.agent_pos = terminal_state
            self._take_step()

        self.QTable.incidence = np.zeros_like(self.QTable.matrix)

    def print_policies(self):
        """Prints the behavior policy and the target policy."""
        print("Behavior Policy")
        print(self.behavior_policy)
        print("Target Policy")
        print(self.target_policy)

    def play_game(self, episodes, output=False):
        """
        episodes - an integer that corresponds to the number of times your agent plays the game
        output - when True, the agent's position is printed before every move
        This function has your agent play the game and update its model of the game
        """
        player_scores = []

        self.Game.draw()

        for i in range(episodes):
            t = 0
            episode_reward = 0

            #rebuild the behavior policy on every policy_update_steps-th episode
            if (i % self.policy_update_steps) + 1 == self.policy_update_steps:
                print("update at step: ",i)
                self.update_behavior_policy()

            #while agent is not in a terminal state: take action A, get reward R, step into s'
            while not self.Game.is_episode_terminal():
                if output:
                    print("Player's position",self.Game.agent_pos)

                t += 1
                episode_reward += self._take_step()

            self.update_terminal_state_returns()

            player_scores.append(episode_reward)

            print("Player finished task in :",t, " moves")
            self.Game.refresh_game()

        print("Player scores for every episode: ",player_scores)
        self.Game.refresh_game()
        self.Game.draw()

In [None]:
# Scratch cell: build a 3x3 Targeting_Game and wire a qtable, an agent and a behavior table to it.
# The tuple arguments are presumably board size, start square and target square - confirm against Targeting_Game.
reward_matrix = np.asarray([[7,8,9], [4,5,6], [1,2,3]])
Game = Targeting_Game((3,3), (0,0), (2,2), reward_matrix)
# QLearningTable(Game, alpha, discount): alpha = 0.1, discount = 0.25
QLearnTable = QLearningTable(Game, 0.1, 0.25)
# NOTE(review): QLearningAgent.__init__ in this file has a required `epsilon` parameter that is
# not passed here, so this call raises TypeError as written - confirm the intended value.
QLearningA = QLearningAgent(Game, QLearnTable)
# NOTE(review): OffPolicyBehaviorTable is defined in the cell below; that cell has to run first.
OPBT = OffPolicyBehaviorTable(Game, QLearningA)
OPBT.Game.draw()

In [None]:
import operator # not using
class OffPolicyBehaviorTable:
    """
    Behavior table for an off-policy agent: one row per state, one column per move.

    The table is derived from the agent's qtable. The agent is expected to expose
    `epsilon`, `get_target_policy()` and its qtable either as `QTable` (as
    QLearningAgent does) or as `model`.
    """

    def __init__(self, Game, QLearningAgent):
        self.Game = Game
        self.QLearnA = QLearningAgent
        self.behavior_policy_table = np.ones_like(self._q_matrix()) / len(self.Game.moves)
        # Some agents store the policy on themselves and return None from get_target_policy().
        policy = self.QLearnA.get_target_policy()
        if policy is None:
            policy = getattr(self.QLearnA, "target_policy", None)
        self.target_policy = policy

    def _q_matrix(self):
        "Returns the agent's qtable matrix, whether the agent calls its table `QTable` or `model`"
        table = getattr(self.QLearnA, "QTable", None)
        if table is None:
            table = self.QLearnA.model
        return table.matrix

    def _destination(self, actionidx):
        "Returns the square the agent would reach from its current position with the given action index"
        move = self.Game.moves[actionidx]
        return tuple(p + d for p, d in zip(self.Game.agent_pos, move))

    def update(self, stateidx):
        "Updates behavior table with the probability values of a given state index, must update QTable first"
        max_action_idx = np.argmax(self._q_matrix()[stateidx])
        self.behavior_policy_table[stateidx][:] = 0
        self.behavior_policy_table[stateidx][max_action_idx] = 1-self.QLearnA.epsilon

    def getAction(self, stateidx):
        "Returns an action index from a state index based on the behavior table"
        n_actions = len(self.behavior_policy_table[0])
        if 1-self.QLearnA.epsilon > np.random.random():
            actionidx = np.argmax(self.behavior_policy_table[stateidx])
        else:
            # numpy's randint excludes the upper bound, so pass n_actions to reach every action
            actionidx = np.random.randint(0, n_actions)
        # Re-draw until the move keeps the agent on the board.
        while self.Game.illegal_move(self._destination(actionidx)):
            actionidx = np.random.randint(0, n_actions)
        return actionidx

    #Note, if we change epsilon to equal zero at the end, getAction will return the
    #actions that should be present in the target policy, or the optimal solution to the problem

In [None]:
def _human_turn(Game, read_key):
    """
    Asks for one key press (w = up, s = down, a = left, d = right) and moves the
    agent when the chosen square is among the game's possible moves.

    Returns (possible_moves, new_pos): the moves offered by the game and the square
    the player tried to move to (the current square if the key was not w/s/a/d).
    """
    key_to_delta = {"w": (-1, 0), "s": (1, 0), "d": (0, 1), "a": (0, -1)}
    possible_moves = Game.get_moves(Game.agent_pos)
    key = read_key("Play the game by entering w s a or d")
    new_pos = Game.agent_pos

    delta = key_to_delta.get(key)
    if delta is not None:
        new_pos = (Game.agent_pos[0] + delta[0], Game.agent_pos[1] + delta[1])
        if new_pos in possible_moves:
            Game.update_agent_pos(new_pos)
    return possible_moves, new_pos


def play_game_human(Game, epochs, output=False, read_key=None):
    """
    Lets a human player play the game with the w/s/a/d keys.

    Game - game object; this function uses its agent_pos, remaining_prizes, start and stop
        attributes and its get_moves, update_agent_pos, remove_prize, draw and refresh_game methods
    epochs - number of times the player plays the game
    output - when True, extra information about every move is printed
    read_key - optional callable taking the prompt string and returning the pressed key;
        defaults to the builtin input

    In every epoch the player first completes the collection task (visit every remaining
    prize) and then the target task (walk from Game.start to Game.stop).
    """
    if read_key is None:
        read_key = input

    # Move counts per epoch; collected for inspection, not returned.
    player_collection_scores = []
    player_targeting_scores = []

    for i in range(epochs):

        # COLLECTION TASK

        collection_moves_made = 0
        Game.draw()
        while len(Game.remaining_prizes) != 0:
            if output:
                print("Number of moves made to perform collection task: ",collection_moves_made)
                print("Player's position",Game.agent_pos)

            collection_moves_made += 1

            possible_moves, new_pos = _human_turn(Game, read_key)

            if output:
                print("Number of moves made to perform collection task: ",collection_moves_made)
                print("Player's possible moves: ",possible_moves)
                print("Player's chosen move", new_pos)
                print("Player's new position",Game.agent_pos)

            if Game.agent_pos in Game.remaining_prizes:
                print("Removing prize: ",Game.agent_pos)
                Game.remove_prize(Game.agent_pos)
                print("Remaining prizes: ", Game.remaining_prizes)

            Game.draw()

        player_collection_scores.append(collection_moves_made)

        print("Player finished collection task in :",collection_moves_made, " moves")

        # TARGET TASK

        Game.update_agent_pos(Game.start)
        targeting_moves_made = 0
        Game.draw()
        while Game.agent_pos != Game.stop:
            if output:
                print("Number of moves made to perform targeting task: ",targeting_moves_made)
                print("Player's position",Game.agent_pos)

            targeting_moves_made += 1

            _human_turn(Game, read_key)

            # Same comparison as the loop condition, so the message appears on arrival.
            if Game.agent_pos == Game.stop:
                print("Player found the target square: ",Game.stop)

            Game.draw()

        player_targeting_scores.append(targeting_moves_made)

        print("Player finished targeting task in :",targeting_moves_made, " moves")
        Game.refresh_game()

    Game.refresh_game()
    Game.draw()

In [None]:
def epsilonFunc(epsilon, epsilonEnd, epochs, epochNumber):
    """
    Linear epsilon schedule: returns `epsilon` at epochNumber 0 and `epsilonEnd`
    at epochNumber == epochs, interpolating linearly in between.
    (An earlier multiplicative schedule, epsilon * 0.9647, is no longer used.)
    """
    remaining_fraction = (epochs - epochNumber)/epochs
    span = epsilon-epsilonEnd
    return span*remaining_fraction + epsilonEnd

In [None]:
# Scratch cell: collect per-run accuracies for two tasks and summarise them per row.
# c stands in for the (task a, task b) accuracy pairs that one run produces.
c = [(1,2),(3,4),(5,6)]

n_runs = 30
n_accuracies = int(6000/500)
# rows = accuracy checkpoints, columns = runs
a = np.zeros((n_accuracies,n_runs))
b = np.zeros((n_accuracies,n_runs))
for n in range(n_runs):

    #task_accuracies is c = run_network
    for i,(ai,bi) in enumerate(c):
        a[i,n] = ai
        b[i,n] = bi

print(a)
print(b)

#we want mean and std of each row for each task
a_mean = [np.mean(row) for row in a]
a_std = [np.std(row) for row in a]

b_mean = [np.mean(row) for row in b]
# b_std is a list, so it has to be filled, not called
b_std = [np.std(row) for row in b]

In [None]:
# Scratch arithmetic: true division, so this evaluates to the float 13.0.
# Presumably the number of accuracy checkpoints including an initial one - confirm.
6000/500 + 1