### Assignment 5, Value Iteration

Your homework must be implemented in this Notebook file. 
You can add as many cells as you want. However, you are not allowed to touch the code below the line "=============".


In [2]:
# Problem settings required by the assignment.
# The location values may be changed for testing purposes.
stoneLocation = [2, 8]                  # [row, column] of the impassable stone
positiveTerminalLocation = [0, 9, 2]    # [row, column, reward value]
negativeTerminalLocation = [1, 9, -2]   # [row, column, reward value]
iteration = 20                          # number of value-iteration sweeps
noise = 0.15                            # noise value handed to Grid
discount = 0.91                         # discount value handed to Grid

In [10]:
import numpy as np
import sys

class Grid:
    """Rectangular grid world solved with synchronous value iteration.

    Attributes:
        rows, cols: grid dimensions.
        utilityGrid: float array of shape (rows, cols). Terminal cells hold
            their reward, stone cells hold NaN, every other cell starts at 0.
        policyGrid: array of one-character strings of shape (rows, cols).
            Regular cells hold one of 'n', 'e', 's', 'w' (initially 'n');
            terminal cells and stones hold '-'.
        discount: factor applied to the expected utility of every action.
        noise: probability that an action slips sideways; it is split evenly
            between the two directions perpendicular to the intended one.
    """

    #(row, column) displacement produced by each compass action
    #the key order n, e, s, w is also the tie-breaking order used by bellman()
    _OFFSETS = {'n': (-1, 0), 'e': (0, 1), 's': (1, 0), 'w': (0, -1)}

    #the two perpendicular directions an action may slip into because of noise
    _SLIPS = {'n': ('w', 'e'), 'e': ('n', 's'), 's': ('e', 'w'), 'w': ('n', 's')}

    def __init__(self,rows,cols,discount,noise,terminalCoordsWithReward,stoneCoords):
        """Build the utility and policy grids.

        Args:
            rows: number of grid rows.
            cols: number of grid columns.
            discount: discount factor used by bellman().
            noise: total probability of slipping sideways.
            terminalCoordsWithReward: iterable of (row, column, reward).
            stoneCoords: iterable of (row, column) for impassable cells.
        """
        #save dimensions for reference
        self.rows = rows
        self.cols = cols

        #set up utility grid
        #nonterminal and terminal cells will have their utility or reward value set accordingly
        #stone cells will have their utility value set to NaN (not a number)
        self.utilityGrid = np.zeros(shape=(rows, cols))
        for coord in terminalCoordsWithReward:
            self.utilityGrid[coord[0], coord[1]] = coord[2]
        for coord in stoneCoords:
            self.utilityGrid[coord[0], coord[1]] = np.nan

        #set up policy grid
        #terminal cells and stones will have the character '-' as their policy
        #a plain unicode ndarray is used instead of the legacy np.chararray
        self.policyGrid = np.full((rows, cols), 'n', dtype='<U1')
        for coord in terminalCoordsWithReward:
            self.policyGrid[coord[0], coord[1]] = '-'
        for coord in stoneCoords:
            self.policyGrid[coord[0], coord[1]] = '-'

        #environment factors
        self.discount = discount
        self.noise = noise

    def printUtilityAndPolicy(self):
        """Print the utility and policy grids combined, one grid row per line.

        Every cell is left-aligned in a field of width 8:
        regular cells print as ``utility(policy)``, stones as ``STONE`` and
        terminal cells as their bare reward. sys.stdout.write() is used
        instead of print() so that cells can be emitted one by one without
        automatic newlines.
        """
        cellFormat = '{:8}'       #consistent spacing
        floatFormat = '{:04.2f}'  #consistent float representation
        for row in range(self.rows):
            for col in range(self.cols):
                policy = self.policyGrid[row][col]
                utility = self.utilityGrid[row][col]
                if policy != '-':
                    #regular passing cell
                    text = floatFormat.format(utility) + '(' + str(policy) + ')'
                elif np.isnan(utility):
                    #NaN never compares equal to anything, so np.isnan is the only valid test
                    text = "STONE"
                else:
                    #terminal cell
                    text = floatFormat.format(utility)
                sys.stdout.write(cellFormat.format(text))
            sys.stdout.write("\n")
            sys.stdout.flush()
        return

    def causesCollision(self, coord, action):
        """Return True if taking `action` from `coord` leaves the grid or hits a stone.

        Returns False otherwise, including for an unrecognised action.
        """
        offset = self._OFFSETS.get(action)
        if offset is None:
            return False
        row = coord[0] + offset[0]
        col = coord[1] + offset[1]
        if not (0 <= row < self.rows and 0 <= col < self.cols):
            return True
        #a stone is a '-' cell whose utility is NaN ('-' with a number is a terminal)
        return bool(self.policyGrid[row][col] == '-' and np.isnan(self.utilityGrid[row][col]))

    def bellman(self,coordinate):
        """Return (policy, utility) for `coordinate` from the current utility grid.

        For each action the expected utility is the probability-weighted sum
        of the destination utilities (the intended direction with probability
        1 - noise, each perpendicular direction with noise / 2), multiplied
        by the discount. A move that would collide leaves the agent in place.
        The best action is returned; ties keep the earliest of n, e, s, w.
        """
        row = coordinate[0]; col = coordinate[1]
        intendedProbability = 1 - self.noise
        slipProbability = self.noise / 2
        bestAction = None
        bestUtility = float("-inf")
        for action in self._OFFSETS:
            #fixed outcome order keeps the floating point sum reproducible between runs
            firstSlip, secondSlip = self._SLIPS[action]
            outcomes = ((action, intendedProbability),
                        (firstSlip, slipProbability),
                        (secondSlip, slipProbability))
            utilityForAction = 0
            for direction, probability in outcomes:
                if self.causesCollision(coordinate, direction):
                    destRow, destCol = row, col
                else:
                    deltaRow, deltaCol = self._OFFSETS[direction]
                    destRow, destCol = row + deltaRow, col + deltaCol
                utilityForAction += self.utilityGrid[destRow, destCol] * probability
            utilityForAction *= self.discount #apply discount
            if utilityForAction > bestUtility:
                bestAction = action
                bestUtility = utilityForAction
        return (bestAction, bestUtility)

    def determinePolicy(self, iterations = 20, printEachIteration = False):
        """Run `iterations` sweeps of value iteration over the grid.

        Each sweep applies bellman() to every cell whose policy is not '-'
        and stores the results in temporary copies, so all cells of a sweep
        are computed from the previous sweep's utilities. The copies then
        replace the policy and utility grids. When `printEachIteration` is
        true the grid is printed after every sweep.
        """
        for i in range(iterations):
            nextPolicy = self.policyGrid.copy()
            nextUtility = self.utilityGrid.copy()
            for row in range(self.rows):
                for col in range(self.cols):
                    if self.policyGrid[row][col] == '-':
                        continue #terminals and stones are never updated
                    action, utility = self.bellman((row, col))
                    nextPolicy[row][col] = action
                    nextUtility[row][col] = utility
            self.policyGrid = nextPolicy
            self.utilityGrid = nextUtility
            if printEachIteration:
                print("Iteration #{:d}".format(i+1))
                self.printUtilityAndPolicy()
        return

#required display function
#builds a 10x10 grid from the module-level settings (discount, noise, terminal and stone
#locations), runs value iteration on it and prints the resulting utilities and policy
#iterations: number of sweeps to run; when omitted (None) the module-level `iteration`
#            setting is used, so changing that setting actually changes the run
#printEachIteration: when True the grid is printed after every sweep, otherwise only once
#                    after the last sweep
def start(iterations = None, printEachIteration = True):
    if iterations is None:
        iterations = iteration
    rows = 10
    cols = 10
    #sets collapse identical locations should the two terminals be configured on the same cell
    terminals = {tuple(positiveTerminalLocation), tuple(negativeTerminalLocation)}
    stones = {tuple(stoneLocation)}
    grid = Grid(rows, cols, discount, noise, terminals, stones)
    grid.determinePolicy(iterations, printEachIteration)
    if not printEachIteration:
        grid.printUtilityAndPolicy()
    return

You can insert as many cells as you want above.
You are not allowed to modify the code below this line.
# ===============================

In [11]:
#you need to implement print_result function to print out the result according to the required format
#start() is called with its default arguments, so the utility/policy grid is printed after every iteration
start()

Iteration #1
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 1.55(e) 2.00    
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(w) -2.00   
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) STONE   0.00(s) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 
Iteration #2
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 1.20(e) 1.65(e) 2.00    
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 1.06(n) -2.00   
0.

Iteration #11
0.49(e) 0.65(e) 0.82(e) 0.95(e) 1.09(e) 1.23(e) 1.38(e) 1.56(e) 1.76(e) 2.00    
0.32(e) 0.57(e) 0.69(e) 0.86(e) 0.98(e) 1.11(e) 1.24(n) 1.38(n) 1.32(n) -2.00   
0.24(e) 0.34(e) 0.58(e) 0.70(e) 0.86(e) 0.98(n) 1.11(n) 1.22(n) STONE   0.63(s) 
0.00(n) 0.25(e) 0.34(e) 0.58(e) 0.70(n) 0.86(n) 0.97(n) 1.07(n) 0.93(w) 0.79(w) 
0.00(n) 0.00(n) 0.25(e) 0.34(n) 0.58(n) 0.70(n) 0.85(n) 0.92(n) 0.81(n) 0.65(w) 
0.00(n) 0.00(n) 0.00(n) 0.25(n) 0.34(n) 0.58(n) 0.69(n) 0.79(n) 0.65(n) 0.53(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.25(n) 0.33(n) 0.56(n) 0.59(n) 0.51(n) 0.32(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.24(n) 0.31(n) 0.46(n) 0.28(n) 0.19(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.22(n) 0.20(n) 0.18(n) 0.00(n) 
0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.00(n) 0.13(n) 0.00(n) 0.00(n) 
Iteration #12
0.56(e) 0.72(e) 0.84(e) 0.96(e) 1.09(e) 1.23(e) 1.39(e) 1.56(e) 1.76(e) 2.00    
0.49(e) 0.60(e) 0.76(e) 0.87(e) 0.99(e) 1.11(e) 1.24(n) 1.38(n) 1.32(n) -2.00   


# sample output of one status
## 0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n)
## 0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n)
## 0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),2.00(n),0.00(n),0.00(n),0.00(n)
## 0.00(n),0.00(n),(STONE),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n)
## 0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),-2.00(n),0.00(n),0.00(n),0.00(n)
## 0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n)
## 0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n)
## 0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n)
## 0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n)
## 0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n),0.00(n)

