# reinforcement learning



In [0]:
# following the link : 
# https://github.com/adeshpande3/ReinforcementLearning
# Got the practice problem from here: http://i.stack.imgur.com/JHdT2.png

In [7]:
import numpy as np

# This program is to create an RL agent and see if it can learn to perform 
# correctly in a simple environment. The environment will be a 1-D path
# that has a reward at one end of the path. This program uses the Monte Carlo
# technique for adjusting the value function. 

# Monte Carlo is different from
# TD in that you update the value function after each episode, not after 
# each move. This is because we need to observe the total reward until
# the episode finishes, in order to make the value function update. 

# +1 reward at the right, 0 reward everywhere else
# If agent reaches either end, the episode terminates
path = np.array([0,0,0,0,0,0,1])

class Agent:
	def __init__(self):
		self.valueFunction = np.array([0.5,0.5,0.5,0.5,0.5,0.5,0.5]) #Value function at every position is the same, initially
		self.curLoc = 3 #Starting index is at the middle
		self.alphaRate = .1

		# V(s) = V(s) + alpha*(G(t) - V(s))
	def simulateEpisode(self):
		positionsVisited = []
		while True:
			positionsVisited.append(self.curLoc)
			probability = np.random.rand()
			if (probability >= .5): # Go Right
				self.curLoc += 1;
				if (self.curLoc == 6):
					# You get a +1 reward if you reach the rightmost space
					self.updateValueFunction(positionsVisited, 1)
					return	
			else: # Go Left
				self.curLoc -= 1;
				if (self.curLoc == 0):
					# You don't get any reward if you reach the leftmost space
					self.updateValueFunction(positionsVisited, 0)
					return

	def updateValueFunction(self, positions, reward):
		if (reward == 1):
			for pos in positions:
				self.valueFunction[pos] += self.alphaRate*(1 - self.valueFunction[pos])
		else:
			for pos in positions:
				self.valueFunction[pos] += self.alphaRate*(-self.valueFunction[pos])

numEpisodes = 1000
my_agent = Agent()
for i in range(0,numEpisodes):
	my_agent.simulateEpisode()
	my_agent.curLoc = 3

print ("The value function is ", my_agent.valueFunction[1:6])

The value function is  [0.11059653 0.13697192 0.15103669 0.30226373 0.76784331]


# QLearning

In [4]:
import numpy as np
import sys
# This program is to create an RL agent and see if it can learn to perform 
# correctly in a simple environment. The environment will be a 1-D path
# that has a reward at one end of the path. This program uses a Q Learning
# approach to find the optimal policy. 

# +1 reward at the right, 0 reward everywhere else
# If agent reaches either end, the episode terminates
path = np.array([0,0,0,0,0,0,1])

class Agent:
	def __init__(self):
		self.QFunction = np.full((path.size, 2), .5) #Action value function at every position is the same, initially
		self.curLoc = 3 #Starting index is at the middle
		self.alphaRate = .1

		# Q(s,a) = Q(s,a) + alpha*(R(t+1) + argmax(Q) - Q(s,a))
	def simulateEpisode(self):
		while True:
			probability = np.random.rand()
			epsilon = .01
			if (probability <= epsilon): #Take random action (epsilon greedy policy)
				probability = np.random.rand()
				if (probability >= .5):
					action = 1; #Go Right
				else:
					action = 0; #Go Left
			else:
				action = np.argmax(self.QFunction[self.curLoc,:]) #Act greedily with respect to the Q function.

			if (action == 1): # Go Right
				self.curLoc += 1;
				if (self.curLoc == 6):
					# You get a +1 reward if you reach the rightmost space
					self.updateQFunction(self.curLoc, action, 1, -1)
					#self.QFunction[self.curLoc-1, action] += self.alphaRate*
					#(1 + np.argmax(self.QFunction[self.curLoc,:]) - self.QFunction[self.curLoc-1, action])
					return
				else:
					self.updateQFunction(self.curLoc, action, 0, -1)
					#self.QFunction[self.curLoc-1, action] += self.alphaRate*
					#(np.argmax(self.QFunction[self.curLoc,:]) - self.QFunction[self.curLoc-1, action])
					
			else: # Go Left
				self.curLoc -= 1;
				if (self.curLoc == 0):
					# You don't get any reward if you reach the leftmost space
					self.updateQFunction(self.curLoc, action, 0, 1)
					#self.QFunction[self.curLoc+1, action] += self.alphaRate*(-self.valueFunction[self.curLoc+1])
					return
				else:
					self.updateQFunction(self.curLoc, action, 0, 1)
					#self.QFunction[self.curLoc+1, action] += self.alphaRate*(self.valueFunction[self.curLoc] - self.valueFunction[self.curLoc+1]


	def updateQFunction(self, location, action, reward, direction):
		self.QFunction[location + direction, action] += self.alphaRate*(reward + np.argmax(self.QFunction[location,:]) - self.QFunction[location + direction, action])

numEpisodes = 100
my_agent = Agent()
for i in range(1,numEpisodes):
	my_agent.simulateEpisode()
	my_agent.curLoc = 3

print ("The value function is ", my_agent.QFunction[1:6])


The value function is  [[0.2657205  0.295245  ]
 [0.46542604 0.45      ]
 [0.29819745 0.99996842]
 [0.5042305  0.99996946]
 [0.45       0.99997223]]


# RandomPathTDZero

In [6]:
import numpy as np

# This program is to create an RL agent and see if it can learn to perform 
# correctly in a simple environment. The environment will be a 1-D path
# that has a reward at one end of the path. This program uses the TD(0)
# technique for adjusting the value function

# +1 reward at the right, 0 reward everywhere else
# If agent reaches either end, the episode terminates

# Got the practice problem from here: http://i.stack.imgur.com/JHdT2.png
path = np.array([0,0,0,0,0,0,1])

class Agent:
	def __init__(self):
		self.valueFunction = np.array([0.5,0.5,0.5,0.5,0.5,0.5,0.5]) #Value function at every position is the same, initially
		self.curLoc = 3 #Starting index is at the middle
		self.alphaRate = .1

		# V(s) = V(s) + alpha*(R(t+1) + V(s+1) - V(s))
	def simulateEpisode(self):
		while True:
			probability = np.random.rand()
			if (probability >= .5): # Go Right
				self.curLoc += 1;
				if (self.curLoc == 6):
					# You get a +1 reward if you reach the rightmost space
					self.valueFunction[self.curLoc-1] += self.alphaRate*(1 + self.valueFunction[self.curLoc] - self.valueFunction[self.curLoc-1])
					return
				else:
					self.valueFunction[self.curLoc-1] += self.alphaRate*(self.valueFunction[self.curLoc] - self.valueFunction[self.curLoc-1])
					
			else: # Go Left
				self.curLoc -= 1;
				if (self.curLoc == 0):
					# You don't get any reward if you reach the leftmost space
					self.valueFunction[self.curLoc+1] += self.alphaRate*(-self.valueFunction[self.curLoc+1])
					return
				else:
					self.valueFunction[self.curLoc+1] += self.alphaRate*(self.valueFunction[self.curLoc] - self.valueFunction[self.curLoc+1])


numEpisodes = 1000
my_agent = Agent()
for i in range(1,numEpisodes):
	my_agent.simulateEpisode()
	my_agent.curLoc = 3

print ("The value function is ", my_agent.valueFunction[1:6])
v_function = my_agent.valueFunction[1:6]
v_function = v_function/sum(v_function)

The value function is  [0.32216381 0.4963546  0.69080181 1.01838557 1.29670835]
