<a href="https://colab.research.google.com/github/ozakiryota/cart_pole/blob/main/cartpole_qlearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cart-Pole


Install OpenAI Gym (https://gym.openai.com)

In [1]:
# NOTE(review): later cells import gym.wrappers.Monitor and unpack env.step() into four values;
# newer gym releases are believed to have changed both -- consider pinning a compatible version here (TODO confirm which).
!pip install gym



Install the packages for visualizing Gym

In [2]:
# System and Python packages backing the virtual display that is started later via pyvirtualdisplay.Display()
# (presumably xvfb is the display server that pyvirtualdisplay drives -- confirm).
!apt update
!apt install xvfb
!pip install pyvirtualdisplay

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:6 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Ign:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports I

## Q-learning

Import Gym

In [3]:
import gym

Import the packages for visualizing Gym

In [4]:
import base64
import io
from gym.wrappers import Monitor
from IPython import display
from pyvirtualdisplay import Display

Import required packages

In [5]:
import numpy as np

Brain class

In [6]:
class Brain:
	"""Tabular Q-learning over a discretized observation vector.

	Each of the first ``num_states`` observation components is binned into
	``list_state_reso[i]`` buckets spanning ``list_state_range[i]``; the
	per-component bucket numbers are combined into one flat row index of
	``q_table``. Actions are chosen epsilon-greedily while training.

	Args:
		num_states: number of observation components to discretize.
		list_state_range: per-component ``[low, high]`` pairs used for binning.
		list_state_reso: per-component number of buckets.
		num_actions: number of discrete actions (columns of ``q_table``).
		gamma: discount factor applied to the best next-state Q-value.
		r: multiplicative decay applied to epsilon after each training action.
		lr: learning rate of the Q-table update.
	"""

	# Epsilon is only decayed while it is strictly above this value.
	EPS_FLOOR = 0.1

	def __init__(self, num_states, list_state_range, list_state_reso, num_actions, gamma, r, lr):
		self.num_states = num_states
		self.list_state_range = list_state_range
		self.list_state_reso = list_state_reso
		self.num_actions = num_actions

		self.eps = 1.0  # for epsilon greedy algorithm
		self.gamma = gamma
		self.r = r
		self.lr = lr

		# One row per flattened discrete state, randomly initialized in [0, 1).
		self.q_table = np.random.rand(np.prod(list_state_reso), num_actions)

		# The bin edges and the flattening strides depend only on the constructor
		# arguments, so they are computed once here rather than on every lookup.
		# They are NOT refreshed if list_state_range / list_state_reso are
		# mutated after construction.
		self._bin_edges = [
			self.bins(list_state_range[i][0], list_state_range[i][1], list_state_reso[i])
			for i in range(num_states)
		]
		self._strides = [int(np.prod(list_state_reso[:i])) for i in range(num_states)]

	def bins(self, clip_min, clip_max, num):
		"""Return the ``num - 1`` interior edges splitting ``[clip_min, clip_max]`` into ``num`` equal buckets."""
		return np.linspace(clip_min, clip_max, num + 1)[1:-1]

	def getStateIndex(self, observation):
		"""Map an observation to its flat row index in ``q_table``.

		Values below the first edge fall in bucket 0 and values at or above the
		last edge fall in the last bucket, so out-of-range inputs are clamped.
		"""
		return int(sum(
			int(np.digitize(observation[i], bins=self._bin_edges[i])) * self._strides[i]
			for i in range(self.num_states)
		))

	def updateQtable(self, obs, action, reward, next_obs):
		"""Apply one Q-learning update for the transition (obs, action, reward, next_obs).

		NOTE(review): the target always bootstraps from ``next_obs``; this method
		has no way to know whether ``next_obs`` is terminal.
		"""
		# Resolve each table row once instead of re-discretizing obs twice.
		state = self.getStateIndex(obs)
		next_state = self.getStateIndex(next_obs)

		q = self.q_table[state, action]
		next_q_max = np.max(self.q_table[next_state])
		self.q_table[state, action] = q + self.lr*(reward + self.gamma*next_q_max - q)

	def getAction(self, obs, is_training):
		"""Pick an action for ``obs``.

		While training, a uniformly random action is taken with probability
		``eps``; otherwise the action with the highest Q-value is returned.
		Epsilon is decayed by ``r`` after every training call until it drops to
		``EPS_FLOOR`` or below.
		"""
		if is_training and np.random.rand() < self.eps:
			action = np.random.randint(self.num_actions)
		else:
			action = np.argmax(self.q_table[self.getStateIndex(obs)])
		## update eps
		if is_training and self.eps > self.EPS_FLOOR:
			self.eps *= self.r
		return action

Agent class

In [7]:
class Agent:
	"""Facade over a ``Brain``: owns one instance and forwards learning and action queries to it."""

	def __init__(self, num_states, list_state_range, list_state_reso, num_actions, gamma, r, lr):
		"""Build the underlying ``Brain`` with exactly the arguments received."""
		brain_args = (num_states, list_state_range, list_state_reso, num_actions, gamma, r, lr)
		self.brain = Brain(*brain_args)

	def updateQtable(self, obs, action, reward, next_obs):
		"""Forward one observed transition to the brain's Q-table update."""
		learner = self.brain
		learner.updateQtable(obs, action, reward, next_obs)

	def getAction(self, obs, is_training):
		"""Return the action the brain selects for ``obs``."""
		return self.brain.getAction(obs, is_training)

Environment class

In [8]:
class Environment:
	"""Wraps a video-recording CartPole-v0 environment together with a Q-learning ``Agent``.

	Args:
		num_episodes: number of episodes run by ``train``.
		max_step: upper bound on the number of steps taken per episode.
		gamma: discount factor forwarded to the agent.
		r: epsilon decay factor forwarded to the agent.
		lr: learning rate forwarded to the agent.
	"""

	def __init__(self, num_episodes, max_step, gamma, r, lr):
		## parameters
		self.num_episodes = num_episodes
		self.max_step = max_step
		## environment
		# Videos go to ./videos/; video_callable selects every 100th episode.
		self.env = Monitor(gym.make('CartPole-v0'), './videos/', video_callable=(lambda ep: ep % 100 == 0), force=True)
		## agent
		num_states = self.env.observation_space.shape[0]	# position, velocity, angle, angular velocity
		list_state_range = []
		for i in range(num_states):
			list_state_range.append([self.env.observation_space.low[i], self.env.observation_space.high[i]])
		# Replace the declared bounds of components 1 and 3 (the velocities) with
		# hand-picked ranges -- presumably the declared bounds are too wide to bin usefully.
		list_state_range[1] = [-3.0, 3.0]
		list_state_range[3] = [-0.5, 0.5]
		print("list_state_range = ", list_state_range)
		list_state_reso = [4, 4, 6, 6]
		num_actions = self.env.action_space.n

		self.agent = Agent(num_states, list_state_range, list_state_reso, num_actions, gamma, r, lr)

	def train(self):
		"""Run ``num_episodes`` training episodes, updating the agent after every step.

		Reward shaping: every step yields +1, except that an episode ending
		before its final allowed step (``max_step - 1``) yields -100 on the
		terminating step. Episodes that end on the final allowed step are
		counted as completed. Prints a per-episode summary and the completed total.
		"""
		num_completed_episodes = 0

		for episode in range(self.num_episodes):
			obs = self.env.reset()
			episode_reward = 0

			for step in range(self.max_step):
				## get action
				action = self.agent.getAction(obs, is_training=True)
				## observe next step
				next_obs, _, is_done, _ = self.env.step(action)
				## get reward
				if is_done:
					# Compare against this instance's own limit rather than a
					# notebook-level global of the same name.
					if step < self.max_step - 1:
						reward = -100
					else:
						reward = 1
						num_completed_episodes += 1
				else:
					reward = 1
				episode_reward += reward
				## update
				self.agent.updateQtable(obs, action, reward, next_obs)
				## to next step
				obs = next_obs

				if is_done:
					print('{0} Episode: Finished after {1} time steps with reward {2}'.format(episode, step+1, episode_reward))
					break
		print("num_completed_episodes = ", num_completed_episodes)

	def evaluate(self):
		"""Run a single episode with exploration disabled and print how many steps it lasted."""
		obs = self.env.reset()

		for step in range(self.max_step):
			## get action
			action = self.agent.getAction(obs, is_training=False)
			## observe next step
			next_obs, _, is_done, _ = self.env.step(action)
			## to next step
			obs = next_obs

			if is_done:
				print('Evaluation: Finished after {} time steps'.format(step+1))
				break

Prepare showing videos of the results

In [9]:
def show_video(env):
	"""Embed every video recorded by ``env`` into the notebook output.

	Args:
		env: a recording environment exposing ``reset()`` and a ``videos``
			sequence whose entries have the video file path at index 0.

	The environment is reset first (presumably so the recorder finalizes the
	video in progress -- confirm), then each file is read, base64-encoded and
	rendered as an inline HTML ``<video>`` element.
	"""
	env.reset()
	for frame in env.videos:
		print("frame = ", frame)
		# Read-only binary mode inside a context manager: the handle is always
		# closed, and files without write permission can still be shown.
		with open(frame[0], 'rb') as video_file:
			video = video_file.read()
		encoded = base64.b64encode(video)

		display.display(display.HTML(data="""
			<video alt="" controls>
			<source src="data:video/mp4;base64,{0}" type="video/mp4" />
			</video>
			""".format(encoded.decode('ascii')))
		)

In [None]:
## display
# Start the virtual display before the environment is constructed below.
virtual_display = Display()
virtual_display.start()

## parameters
# Number of training episodes.
num_episodes = 500
# Upper bound on steps per episode (used for both training and evaluation).
max_step = 200
# Discount factor applied to the best next-state Q-value.
gamma = 0.9
# Multiplicative decay applied to epsilon after each training action.
r = 0.99
# Learning rate of the Q-table update.
lr = 0.5

## run
# Train, run one episode without exploration, then embed the recorded videos.
cartpole_env = Environment(num_episodes, max_step, gamma, r, lr)
cartpole_env.train()
cartpole_env.evaluate()
show_video(cartpole_env.env)

list_state_range =  [[-4.8, 4.8], [-3.0, 3.0], [-0.41887903, 0.41887903], [-0.5, 0.5]]
0 Episode: Finished after 17 time steps with reward -84
1 Episode: Finished after 21 time steps with reward -80
2 Episode: Finished after 16 time steps with reward -85
3 Episode: Finished after 45 time steps with reward -56
4 Episode: Finished after 29 time steps with reward -72
5 Episode: Finished after 28 time steps with reward -73
6 Episode: Finished after 18 time steps with reward -83
7 Episode: Finished after 30 time steps with reward -71


## References
- [minnano_rl/section_2/01_simple_reinforcement_learning.ipynb](https://github.com/yukinaga/minnano_rl/blob/main/section_2/01_simple_reinforcement_learning.ipynb)
- [第10回　CartPole課題をQ学習で制御する](https://book.mynavi.jp/manatee/detail/id=88997)