In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import math

In [25]:
#n-armed Bandit problem

# Reseed NumPy's global generator. No explicit seed is passed, so results are
# presumably different on every run -- pass a fixed integer here if
# reproducible output is needed.
np.random.seed()

# One column per bandit arm:
#   row 0 -> mean of the arm's reward distribution
#   row 1 -> variance of the arm's reward distribution (handed to value() as actionVar)
#   row 2 -> per-arm step counter k, starts at 0 and is overwritten with the k
#            returned by value() each time the arm is selected
banditsReward = np.array([[1,1.5,2,2,1.75],[5,1,1,2,10],[0,0,0,0,0]])
# Initial value estimates: one random draw per arm, using row 0 as the mean and
# row 1 as the second argument of np.random.normal.
# NOTE(review): np.random.normal's second argument is a standard deviation, while
# row 1 is described as a variance -- confirm which one is intended.
banditsValue = np.random.normal(banditsReward[0,:],banditsReward[1,:])

#print(banditsValue, np.argmax(banditsValue))

def value(actionValue, actionMean, actionVar, k):
    """Sample one reward for an action and fold it into its running-average value.

    Update rule (incremental sample mean):
        value_{k+1} = value_k + (1 / (k + 1)) * (reward_{k+1} - value_k)

    Parameters
    ----------
    actionValue : current value estimate of the action.
    actionMean  : mean of the action's (normal) reward distribution.
    actionVar   : variance of the action's reward distribution (must be >= 0).
    k           : number of updates already applied to this action.

    Returns
    -------
    (new value estimate, k + 1)

    Raises
    ------
    ValueError if ``actionVar`` is negative.
    """
    if np.any(np.less(actionVar, 0)):
        raise ValueError("actionVar must be non-negative")

    # np.random.normal expects the standard deviation as its second argument,
    # so the variance has to be converted before sampling.
    reward = np.random.normal(actionMean, np.sqrt(actionVar))
    k += 1
    actionValue = actionValue + ((1/k)*(reward - actionValue))

    return actionValue, k

def greedy(banditsReward, banditsValue, timeSteps):
    """Play the bandit for ``timeSteps`` pulls, always choosing the best-looking arm.

    ``banditsReward`` holds one column per arm (row 0 and row 1 are forwarded to
    ``value`` as ``actionMean`` and ``actionVar``; row 2 is the per-arm counter
    ``k``). ``banditsValue`` holds the current value estimate of every arm.
    Both arrays are updated in place; nothing is returned. When the run is
    over, the per-arm counters and the value estimates are printed.
    """
    for _ in range(timeSteps):
        # Index of the arm with the highest current estimate (the greedy action).
        best = np.argmax(banditsValue)
        arm = banditsReward[:, best]
        estimate, pulls = value(banditsValue[best], arm[0], arm[1], arm[2])
        banditsValue[best] = estimate
        banditsReward[2, best] = pulls

    print(banditsReward[2,:], banditsValue)
    
 
def eGreedy(banditsReward, banditsValue, timeSteps, eps=0.1):
    """Play the bandit for ``timeSteps`` pulls with an epsilon-greedy policy.

    On every step the arm with the highest value estimate is selected, except
    that with probability ``eps`` a different arm is chosen uniformly at random
    from the non-greedy arms. The selected arm is then updated through ``value``.

    Parameters
    ----------
    banditsReward : array with one column per arm; row 0 and row 1 are forwarded
                    to ``value`` as ``actionMean`` and ``actionVar``, row 2 is
                    the per-arm counter ``k``. Row 2 is updated in place.
    banditsValue  : current value estimate of every arm, updated in place.
    timeSteps     : number of pulls to perform.
    eps           : exploration probability (defaults to the previously
                    hard-coded 0.1).

    Returns nothing; prints the per-arm counters and the value estimates.
    """
    n_arms = len(banditsValue)

    for _ in range(timeSteps):
        hI = np.argmax(banditsValue)  # index of the highest valued (greedy) action

        # Exploring means switching to a *different* arm. With a single arm there
        # is nothing to switch to, and searching for one would never terminate,
        # so the greedy arm is kept in that case.
        if np.random.uniform(0.0, 1.0) < eps and n_arms > 1:
            other = random.randrange(n_arms)
            while other == hI:  # redraw until it is not the greedy arm
                other = random.randrange(n_arms)
            hI = other

        # Single update path for both the exploring and the greedy case.
        banditsValue[hI], banditsReward[2,hI] = value(banditsValue[hI], banditsReward[0,hI], banditsReward[1,hI], banditsReward[2,hI])

    print(banditsReward[2,:], banditsValue)


# TODO: come up with a function that averages the values over many trials.

# Run 100 epsilon-greedy pulls on the bandit defined above; eGreedy prints the
# per-arm counters followed by the final value estimates.
eGreedy(banditsReward, banditsValue, 100)



[  2.   1.  30.   7.  60.] [-0.41516281  0.26996689  2.20563575  1.6917657   2.55613141]


In [132]:
#for i in range(banditsReward.shape[1]):
#    mu = banditsReward[0,i]
#    variance = banditsReward[1,i]
#    sigma = math.sqrt(variance)
#    x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
#    plt.plot(x,mlab.normpdf(x, mu, sigma))
#    plt.show()

In [165]:
#Grid problem with greedy approach

# Seed the random generator. No explicit seed is passed, so runs are
# presumably not reproducible.
np.random.seed()

# Reward grid (5 rows x 10 columns): every cell is -1 except the door at
# row 3, column 9, which is 100.
reward = np.full((5,10),-1)
reward[3,9] = 100

# Initially the value estimates equal the rewards.
# Note: this rebinds the name `value`, which was the bandit update function
# defined earlier in this notebook.
# NOTE(review): the grid printed in this cell's output contains only whole
# numbers, which suggests `value` has an integer dtype and fractional updates
# are truncated on assignment -- confirm this is intended.
value = reward.copy()

# Update counter k for each location in the grid, initially all zeros.
k = np.zeros((5,10))

# Random start location, stored as [row, column].
location = [np.random.randint(0,reward.shape[0]), np.random.randint(0,reward.shape[1])]

# Offset applied for each action index:
#   0 = stay in place, 1 = north (-1 row), 2 = south (+1 row),
#   3 = west (-1 column), 4 = east (+1 column)
actions = [0,-1,1,-1,1]

#input for this function is single value from arrays
def gridValue(value, reward, k):
    """Advance the counter of one grid cell and return its updated value.

    All three arguments are single numbers taken from the grid arrays.
    Returns ``(new_value, k + 1)`` where
    ``new_value = value + (1 / (k + 1)) * (reward + value)``.

    NOTE(review): the step adds ``reward + value``; the bandit update earlier in
    this notebook uses ``reward - value``. The walkers currently depend on this
    form, so confirm the intent before changing it.
    """
    k += 1
    step = (1/k)*(reward + value)
    return value + step, k

def all_same(items):
    """Return True when every element of ``items`` equals the first one.

    An empty sequence counts as "all the same" and yields True.
    """
    for entry in items:
        if not (entry == items[0]):
            return False
    return True

def findTempValues():
    """Estimate the value of every action available from the current location.

    Reads the module-level ``location``, ``actions``, ``value``, ``reward`` and
    ``k``; none of them is modified. For each action index (0 = stay, 1 = north,
    2 = south, 3 = west, 4 = east) the target cell is passed through
    ``gridValue`` and the resulting value is collected. A move that would leave
    the grid gets the sentinel -1000 so it is never the best choice.

    Returns the list of candidate values in action order. If the "stay" value
    is exactly 100 the scan stops and the list holds only that single entry.
    """
    row, col = location[0], location[1]
    last_row, last_col = reward.shape[0] - 1, reward.shape[1] - 1

    # action index -> (moves along the row axis?, blocked by the grid edge?)
    layout = {
        1: (True, row == 0),          # north
        2: (True, row == last_row),   # south
        3: (False, col == 0),         # west
        4: (False, col == last_col),  # east
    }

    candidates = []
    for idx in range(len(actions)):
        if idx == 0:
            cell = (row, col)
        elif idx in layout:
            vertical, blocked = layout[idx]
            if blocked:
                candidates.append(-1000)  # makes it impossible to select an impossible action
                continue
            cell = (row + actions[idx], col) if vertical else (row, col + actions[idx])
        else:
            continue

        # Only the looked-ahead value is used; the incremented counter is discarded.
        estimate, _ = gridValue(value[cell], reward[cell], k[cell])
        candidates.append(estimate)
        if idx == 0 and estimate == 100:
            break

    return candidates
                
 
def gridGreedy():
    """Walk the grid greedily for at most 20 steps, stopping early at the door.

    Each step looks up the candidate values of all actions. If they are all
    equal (as happens at the start) a random action index is drawn; otherwise
    the highest-valued action is taken. The module-level ``location`` is moved,
    and ``value`` / ``k`` of the cell the agent ends up on are updated through
    ``gridValue``. Reaching the cell whose reward is 100 prints a message plus
    the value grid and the step index, then ends the walk.
    """
    for j in range(20):
        tempValues = findTempValues()

        if all_same(tempValues):
            # No action stands out: choose one at random.
            move = np.random.randint(0, len(tempValues))
        else:
            # Greedy choice: index of the highest-valued action.
            move = np.argmax(tempValues)

        # Indices 1/2 move along rows, 3/4 along columns, 0 stays put.
        if move in (1, 2):
            location[0] = location[0] + actions[move]
        elif move in (3, 4):
            location[1] = location[1] + actions[move]

        row, col = location[0], location[1]
        value[row, col], k[row, col] = gridValue(value[row, col], reward[row, col], k[row, col])

        if reward[row, col] == 100:
            print("Found the Door!")
            print(value, j)
            break

#e-greedy grid algorithm
def gridEgreedy():
    """Walk the grid with an epsilon-greedy policy for at most 100 steps.

    With probability ``eps`` (0.1) a random legal action is taken, i.e. one
    whose candidate value is not the -1000 off-grid sentinel; otherwise the
    highest-valued action is taken. The module-level ``location`` is moved, and
    ``value`` / ``k`` of the cell the agent ends up on are updated through
    ``gridValue``. Reaching the cell whose reward is 100 prints a message plus
    the value grid and the step index, then ends the walk.
    """
    eps = 0.1  # exploration probability

    for j in range(100):
        tempValues = findTempValues()

        if np.random.uniform(0.0, 1.0) < eps:
            # Explore: redraw until the action does not leave the grid.
            move = np.random.randint(0, len(tempValues))
            while tempValues[move] == -1000:
                move = np.random.randint(0, len(tempValues))
        else:
            # Exploit: index of the highest-valued action.
            move = np.argmax(tempValues)

        # Indices 1/2 move along rows, 3/4 along columns, 0 stays put.
        if move in (1, 2):
            location[0] = location[0] + actions[move]
        elif move in (3, 4):
            location[1] = location[1] + actions[move]

        row, col = location[0], location[1]
        value[row, col], k[row, col] = gridValue(value[row, col], reward[row, col], k[row, col])

        if reward[row, col] == 100:
            print("Found the Door!")
            print(value, j)
            break
                
# Run a single episode (range(1)): put the agent on a random grid cell, then let
# the epsilon-greedy walker search for the door. `value` and `k` are not reset
# here, so they carry over from whatever ran before.
for i in range(1):
    location = [np.random.randint(0,reward.shape[0]), np.random.randint(0,reward.shape[1])]
    gridEgreedy()

Found the Door!
[[ -9  -7  -7  -5  -5  -3  -3  -3  -3  -1]
 [ -9  -7  -7  -5  -5  -3  -3  -3  -3  -1]
 [ -9  -7  -7  -7  -5  -3  -3  -3  -3  -3]
 [ -9  -7  -7  -5  -5  -3  -3  -1  -1 300]
 [ -7  -7  -5  -5  -3  -3  -5  -1  -1  -1]] 86


In [None]:
#Grid problem with Q learning approach

# NOTE(review): the statements below repeat the setup of the greedy grid cell
# verbatim; nothing specific to Q learning is visible in this part of the cell.

# Seed the random generator. No explicit seed is passed, so runs are
# presumably not reproducible.
np.random.seed()

# Reward grid (5 rows x 10 columns): every cell is -1 except the door at
# row 3, column 9, which is 100.
reward = np.full((5,10),-1)
reward[3,9] = 100

# Initially the value estimates equal the rewards.
value = reward.copy()

# Update counter k for each location in the grid, initially all zeros.
k = np.zeros((5,10))

# Random start location, stored as [row, column].
location = [np.random.randint(0,reward.shape[0]), np.random.randint(0,reward.shape[1])]

# Offset applied for each action index:
#   0 = stay in place, 1 = north (-1 row), 2 = south (+1 row),
#   3 = west (-1 column), 4 = east (+1 column)
actions = [0,-1,1,-1,1]

#input for this function is single value from arrays
def gridValue(value, reward, k):
    """Advance the counter of one grid cell and return its updated value.

    All three arguments are single numbers taken from the grid arrays.
    Returns ``(new_value, k + 1)`` where
    ``new_value = value + (1 / (k + 1)) * (reward + value)``.

    NOTE(review): the step adds ``reward + value``; the bandit update earlier in
    this notebook uses ``reward - value``. Confirm the intent before changing it.
    """
    k += 1
    step = (1/k)*(reward + value)
    return value + step, k

def all_same(items):
    """Return True when every element of ``items`` equals the first one.

    An empty sequence counts as "all the same" and yields True.
    """
    for entry in items:
        if not (entry == items[0]):
            return False
    return True

def findTempValues():
    """Estimate the value of every action available from the current location.

    Reads the module-level ``location``, ``actions``, ``value``, ``reward`` and
    ``k``; none of them is modified. For each action index (0 = stay, 1 = north,
    2 = south, 3 = west, 4 = east) the target cell is passed through
    ``gridValue`` and the resulting value is collected. A move that would leave
    the grid gets the sentinel -1000 so it is never the best choice.

    Returns the list of candidate values in action order. If the "stay" value
    is exactly 100 the scan stops and the list holds only that single entry.
    """
    row, col = location[0], location[1]
    last_row, last_col = reward.shape[0] - 1, reward.shape[1] - 1

    # action index -> (moves along the row axis?, blocked by the grid edge?)
    layout = {
        1: (True, row == 0),          # north
        2: (True, row == last_row),   # south
        3: (False, col == 0),         # west
        4: (False, col == last_col),  # east
    }

    candidates = []
    for idx in range(len(actions)):
        if idx == 0:
            cell = (row, col)
        elif idx in layout:
            vertical, blocked = layout[idx]
            if blocked:
                candidates.append(-1000)  # makes it impossible to select an impossible action
                continue
            cell = (row + actions[idx], col) if vertical else (row, col + actions[idx])
        else:
            continue

        # Only the looked-ahead value is used; the incremented counter is discarded.
        estimate, _ = gridValue(value[cell], reward[cell], k[cell])
        candidates.append(estimate)
        if idx == 0 and estimate == 100:
            break

    return candidates