<a href="https://colab.research.google.com/github/pkshcherbakov/Data_Science/blob/main/RL/CrossEntropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.0 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [None]:
import gym
import numpy.random

import numpy as np
import random
import time
import optuna
import pandas as pd
from optuna.visualization import plot_optimization_history

# Problem dimensions used to size the agent's (state_n x action_n) policy table.
# NOTE(review): presumably these match the Taxi-v3 action / observation space sizes —
# confirm against env.action_space.n and env.observation_space.n.
action_n = 6
state_n = 500
# Shared environment instance; train_agents() samples every trajectory from it.
env = gym.make('Taxi-v3')


class RandomAgent():
	"""Baseline policy that ignores the state and picks an action uniformly at random."""

	def __init__(self, action_n):
		# Number of discrete actions; sampled actions lie in 0 .. action_n - 1.
		self.action_n = action_n

	def get_action(self, state):
		"""Return a uniformly sampled action index in ``[0, action_n)``.

		``state`` is accepted so the call signature matches the other agent,
		but it does not influence the choice.
		"""
		return np.random.randint(0, self.action_n)


class CrossEntropyAgent():
	"""Tabular stochastic policy trained with the cross-entropy method.

	``model[state]`` holds the probability of each action in that state.
	"""

	def __init__(self, state_n, action_n):
		self.state_n = state_n
		self.action_n = action_n
		# Start from the uniform policy: every action equally likely in every state.
		self.model = np.full((state_n, action_n), 1.0) / action_n

	def get_action(self, state):
		"""Sample an action index for ``state`` from the current policy row."""
		action_ids = np.arange(self.action_n)
		probabilities = self.model[state]
		return int(np.random.choice(action_ids, p=probabilities))

	def fit(self, elite_trajectories):
		"""Replace the policy with the empirical action frequencies of the elite set.

		Each trajectory is a dict with parallel ``'states'`` and ``'actions'``
		sequences. States that never occur in the elite trajectories keep
		their current action distribution.
		"""
		# Count how often each (state, action) pair was taken by elite episodes.
		counts = np.zeros((self.state_n, self.action_n))
		for trajectory in elite_trajectories:
			pairs = zip(trajectory['states'], trajectory['actions'])
			for state, action in pairs:
				counts[state, action] += 1

		visits_per_state = counts.sum(axis=1)
		visited = visits_per_state > 0

		# Unvisited rows carry over from the old policy; visited rows are normalized counts.
		updated = self.model.copy()
		updated[visited] = counts[visited] / visits_per_state[visited][:, None]

		self.model = updated


def get_trajectory(env, agent, max_len=200, visualize=False):
	"""Roll out one episode of ``agent`` in ``env`` and record it.

	Works with both Gym step/reset conventions: the legacy one
	(``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
	newer one (``reset() -> (obs, info)``,
	``step() -> (obs, reward, terminated, truncated, info)``).

	Args:
		env: Environment exposing ``reset()``, ``step(action)`` and, when
			``visualize`` is set, ``render()``.
		agent: Object exposing ``get_action(state)``.
		max_len: Maximum number of steps before the rollout is cut off.
		visualize: If true, pause 0.5 s and render the environment after each step.

	Returns:
		Dict with parallel lists ``'states'``, ``'actions'`` and ``'rewards'``;
		``states[i]`` is the state in which ``actions[i]`` was chosen.
	"""
	trajectory = {'states': [], 'actions': [], 'rewards': []}

	reset_result = env.reset()
	# Newer API: reset() returns (observation, info_dict). Require the dict so a
	# legacy environment with tuple-valued observations is not misread.
	if (isinstance(reset_result, tuple) and len(reset_result) == 2
			and isinstance(reset_result[1], dict)):
		state = reset_result[0]
	else:
		state = reset_result

	for _ in range(max_len):
		trajectory['states'].append(state)

		action = agent.get_action(state)
		trajectory['actions'].append(action)

		step_result = env.step(action)
		if len(step_result) == 5:
			# Newer API splits episode end into termination and truncation.
			next_state, reward, terminated, truncated, _ = step_result
			done = terminated or truncated
		else:
			next_state, reward, done, _ = step_result
		trajectory['rewards'].append(reward)

		state = next_state

		if visualize:
			time.sleep(0.5)
			env.render()

		if done:
			break

	return trajectory


def train_agents(q_param, iteration_n, trajectory_n):
	"""Train a fresh CrossEntropyAgent and report how its last evaluated policy scored.

	Every iteration samples ``trajectory_n`` episodes from the module-level
	``env`` with the current policy, keeps the episodes whose total reward is
	strictly above the ``q_param`` quantile of that batch, and refits the
	policy on them.

	Args:
		q_param: Quantile level passed to ``np.quantile`` to set the elite threshold.
		iteration_n: Number of evaluate/improve iterations; must be at least 1.
		trajectory_n: Number of episodes sampled per iteration; must be at least 1.

	Returns:
		Mean total reward of the batch sampled in the final iteration, i.e.
		measured before that iteration's policy update is applied.

	Raises:
		ValueError: If ``iteration_n`` or ``trajectory_n`` is below 1, since
			there would be no batch to report a mean reward for.
	"""
	if iteration_n < 1:
		raise ValueError('iteration_n must be at least 1')
	if trajectory_n < 1:
		raise ValueError('trajectory_n must be at least 1')

	agent = CrossEntropyAgent(state_n, action_n)

	for _ in range(iteration_n):
		# policy evaluation
		trajectories = [get_trajectory(env, agent) for _ in range(trajectory_n)]
		total_rewards = [np.sum(trajectory['rewards']) for trajectory in trajectories]
		mean_total_reward = np.mean(total_rewards)

		# policy improvement: reuse the totals computed above instead of re-summing.
		quantile = np.quantile(total_rewards, q_param)
		elite_trajectories = [
			trajectory
			for trajectory, total_reward in zip(trajectories, total_rewards)
			if total_reward > quantile
		]

		agent.fit(elite_trajectories)

	return mean_total_reward


def objective(trial):
	"""Optuna objective: train with sampled hyperparameters and return the mean reward.

	Samples ``q_param``, ``iteration_n`` and ``trajectory_n`` from the trial,
	times one ``train_agents`` run with them, appends a row describing the
	trial to the module-level ``results_df``, and returns the score.
	"""
	params = {
		'q_param': trial.suggest_float('q_param', 0.45, 0.65),
		'iteration_n': trial.suggest_int('iteration_n', 15, 35),
		'trajectory_n': trial.suggest_int('trajectory_n', 1500, 4000),
	}

	started_at = time.time()
	score = train_agents(**params)
	elapsed = time.time() - started_at

	# Key order mirrors the column order of results_df.
	record = {
		'trial_number': trial.number,
		**params,
		'mean_total_reward': score,
		'execution_time': elapsed,
	}
	# Append as a new row labelled with the next integer position.
	results_df.loc[len(results_df)] = record
	return score


# Per-trial log; objective() appends one row to this frame for every trial it runs.
results_df = pd.DataFrame(
	columns=['trial_number', 'q_param', 'iteration_n', 'trajectory_n', 'mean_total_reward', 'execution_time'])
# Search for the hyperparameters that maximize the value returned by objective().
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

best_params = study.best_params
best_total_reward = study.best_value
# plot_slice only builds a figure; because this is not the cell's last expression,
# it has to be shown explicitly or the plot is silently discarded.
optuna.visualization.plot_slice(study).show()
print('Best params:', best_params)


  deprecation(
  deprecation(


In [None]:
# Start a new study and run another 20 trials with the same objective.
# NOTE(review): objective() appends to the existing results_df, which is not reset
# here, so this run's rows (with trial numbers starting again from 0) are added
# after any earlier ones — confirm that is intended.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Rebind the summary variables to the best trial of this new study.
best_params = study.best_params
best_total_reward = study.best_value

In [2]:
import pandas as pd

# Hard-coded results table: one entry per Optuna trial (trial_number 0-19),
# listed in order of mean_total_reward, highest first.
# NOTE(review): presumably copied (rounded to two decimals) from results_df of an
# earlier run — confirm the source.
# NOTE(review): execution_time is presumably in seconds, since objective() measures
# it as a difference of time.time() values — confirm.
result = {
    'trial_number': [12, 17, 15, 1, 18, 16, 6, 2, 13, 8, 11, 4, 19, 7, 5, 9, 3, 14, 10, 0],
    'q_param': [0.5, 0.46, 0.49, 0.52, 0.45, 0.48, 0.49, 0.47, 0.53, 0.63, 0.51, 0.55, 0.47, 0.57, 0.63, 0.63, 0.61, 0.53, 0.51, 0.46],
    'iteration_n': [28, 31, 30, 26, 34, 32, 25, 31, 29, 35, 23, 28, 27, 33, 18, 28, 25, 22, 21, 15],
    'trajectory_n': [2662, 2926, 3197, 3181, 2865, 2401, 2808, 2304, 3381, 3524, 2686, 3783, 2041, 2068, 3181, 3853, 2979, 2546, 1618, 2007],
    'mean_total_reward': [7.97, 7.93, 7.91, 7.9, 7.89, 7.86, 7.85, 7.84, 7.84, 7.84, 7.81, 7.79, 7.78, 7.76, 7.74, 7.72, 7.69, 7.64, 7.6, 4.16],
    'execution_time': [280.98, 334.26, 327.2, 309.3, 347.08, 259.03, 282.91, 261.6, 330.37, 320.75, 268.31, 365.09, 214.81, 204.54, 236.42, 310.18, 242.45, 232.15, 157.35, 198.33]
}

# Each dict key becomes a column; the lists supply the rows.
df = pd.DataFrame(result)
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,trial_number,q_param,iteration_n,trajectory_n,mean_total_reward,execution_time
0,12,0.5,28,2662,7.97,280.98
1,17,0.46,31,2926,7.93,334.26
2,15,0.49,30,3197,7.91,327.2
3,1,0.52,26,3181,7.9,309.3
4,18,0.45,34,2865,7.89,347.08
5,16,0.48,32,2401,7.86,259.03
6,6,0.49,25,2808,7.85,282.91
7,2,0.47,31,2304,7.84,261.6
8,13,0.53,29,3381,7.84,330.37
9,8,0.63,35,3524,7.84,320.75
