<p> Aluno: Victor Gabriel Tenório Oliveira </p>

---

https://github.com/pablo-sampaio/rl_facil/tree/main/cap02

In [4]:
from enum import Enum
import numpy as np
import matplotlib.pyplot as plt

import sys

from os import path
sys.path.append( path.dirname( path.dirname( path.abspath("__main__") ) ) )

# Clona para permitir importar o ambiente MultiArmedBanditEnv do repositório da aula
#!git clone https://github.com/pablo-sampaio/rl_facil > /dev/null 2>&1
from util.bandit_envs import MultiArmedBanditEnv

#### Funções que pré-calculam uma lista com os valores de epsilon

In [5]:
def linear_decay(initial_epsilon, final_epsilon, total_steps, should_plot=False) -> list[float]:
    decay_factor = (initial_epsilon - final_epsilon) / total_steps
    epsilon_values = np.linspace(initial_epsilon, final_epsilon, total_steps)

    if should_plot:
        x = -np.linspace(initial_epsilon, final_epsilon, total_steps)
        plt.plot(x, epsilon_values, label='Epsilon Values')
        plt.legend()
        plt.show()

    return list(epsilon_values)

def exponential_decay(initial_epsilon, final_epsilon, total_steps, should_plot=False) -> list[float]:

    SMALL_CONSTANT = 1e-8
    IS_ZERO = final_epsilon == 0.0
    final_epsilon = SMALL_CONSTANT if IS_ZERO else final_epsilon

    # Cria uma progressão geométrica
    # (cada valor é um múltiplo constante do valor anterior)
    epsilon_values = np.geomspace(initial_epsilon, final_epsilon, total_steps)
    # decay_factor = epsilon_values[1] / epsilon_values[0]

    if should_plot:
        x = -np.linspace(initial_epsilon, final_epsilon, total_steps)
        plt.plot(x, epsilon_values, label='Epsilon Values')
        plt.legend()
        plt.show()

    return list(epsilon_values)

# https://stackoverflow.com/questions/55725139/fit-sigmoid-function-s-shape-curve-to-data-using-python
def sigmoid_logistic_decay(initial_epsilon, final_epsilon, total_steps, should_plot=False) -> list[float]:
    k = 10.0
    x = np.linspace(-k, k, total_steps)
    sigmoid_values = 1 / (1 + np.exp(-x))
    # print("sigmoid values -> ", sigmoid_values)
    epsilon_values = initial_epsilon + (final_epsilon - initial_epsilon) * sigmoid_values

    if should_plot:
        plt.plot(x, sigmoid_values, label='Sigmoid Values')
        plt.plot(x, epsilon_values, label='Epsilon Values')
        plt.legend()
        plt.show()

    return list(epsilon_values)


In [6]:
initial_epsilon = 1.0
final_epsilon = 0.0
total_steps = 10000

should_plot = False

# teste
linear_epsilon_values = linear_decay(initial_epsilon, final_epsilon, total_steps, should_plot)
exponential_epsilon_values = exponential_decay(initial_epsilon, final_epsilon, total_steps, should_plot)
sigmoid_logistic_epsilon_values = sigmoid_logistic_decay(initial_epsilon, final_epsilon, total_steps, should_plot)

print("=--------=")
print(linear_epsilon_values)
print("=--------=")
print(exponential_epsilon_values)
print("=--------=")
print(sigmoid_logistic_epsilon_values)
# print(sigmoid_logistic_epsilon_values[0])
# print(sigmoid_logistic_epsilon_values[-1])
print("=--------=")

=--------=
[1.0, 0.9998999899989999, 0.9997999799979999, 0.9996999699969997, 0.9995999599959996, 0.9994999499949995, 0.9993999399939995, 0.9992999299929993, 0.9991999199919992, 0.9990999099909991, 0.998999899989999, 0.9988998899889989, 0.9987998799879988, 0.9986998699869987, 0.9985998599859987, 0.9984998499849985, 0.9983998399839984, 0.9982998299829983, 0.9981998199819982, 0.9980998099809981, 0.997999799979998, 0.9978997899789979, 0.9977997799779978, 0.9976997699769977, 0.9975997599759976, 0.9974997499749975, 0.9973997399739974, 0.9972997299729973, 0.9971997199719972, 0.9970997099709971, 0.996999699969997, 0.9968996899689969, 0.9967996799679968, 0.9966996699669967, 0.9965996599659966, 0.9964996499649965, 0.9963996399639964, 0.9962996299629963, 0.9961996199619962, 0.9960996099609961, 0.995999599959996, 0.9958995899589959, 0.9957995799579958, 0.9956995699569957, 0.9955995599559956, 0.9954995499549955, 0.9953995399539954, 0.9952995299529953, 0.9951995199519952, 0.995099509950995, 0.994999

#### Declara um tipo e um mapeamento para deixar o algoritmo mais fácil de ler e branchless

In [7]:
class DecayType(Enum):
    LINEAR = 'linear'                     # DecayType.LINEAR
    EXPONENTIAL = 'exponential'           # DecayType.EXPONENTIAL
    SIGMOID_LOGISTIC = 'sigmoid-logistic' # DecayType.SIGMOID_LOGISTIC

ALL_DECAY_TYPES = list( DecayType.__members__.values() )

decay_functions = {
    # Name -> Function
    DecayType.LINEAR: linear_decay,
    DecayType.EXPONENTIAL: exponential_decay,
    DecayType.SIGMOID_LOGISTIC: sigmoid_logistic_decay,
}

#### Implementação do decaying_epsilon_greedy

In [8]:
def decaying_epsilon_greedy(
        env: MultiArmedBanditEnv,
        total_steps: int,
        initial_epsilon: float = 1.0,
        final_epsilon: float = 0.0,
        decay_type: DecayType = DecayType.LINEAR
        ):
    assert (final_epsilon <= initial_epsilon)

    NUM_ACITONS = env.get_num_actions()

    # Arrays declaration
    Q = [0.0] * NUM_ACITONS                 # Calculated value of each action
    reward_per_step = [None] * total_steps  # Reward per step
    action_count  = [0] * NUM_ACITONS       # Count of times each action was taken

    # Prepare decay
    calculate_epsilons_function = decay_functions[decay_type]
    epsilon_values = calculate_epsilons_function(initial_epsilon, final_epsilon, total_steps)

    env.reset()
    for i in range(total_steps):
        current_epsilon = epsilon_values[i]
        a = np.random.randint(NUM_ACITONS) if (np.random.random() <= current_epsilon) else np.argmax(Q)

        # Apply action
        r = env.step(a)

        # Update arrays
        reward_per_step[i] = r
        action_count[a] += 1

        delta = r - Q[a]
        Q[a] += (1/action_count[a]) * delta
        # alternativa equivalente: Q[a] = ((action_count[a]-1)*Q[a] + r) / action_count[a]

    return reward_per_step, Q

In [9]:
if __name__ == '__main__':
    BANDIT_PROBABILITIES = [0.2, 0.5, 0.75]
    env = MultiArmedBanditEnv(BANDIT_PROBABILITIES)

    for epsilon in [1.0, 0.1, 0.01]:
        # print(f"=-------= [initial_epsilon = {epsilon}] =-------=")
        for decay_type in ALL_DECAY_TYPES:
            # print(f"=-------= [decay_type = {decay_type}] =-------=")
            for test in range(3):
                rewards, _ = decaying_epsilon_greedy(
                        env,
                        total_steps = 10000,
                        initial_epsilon = epsilon,
                        final_epsilon = 0.0,
                        decay_type = decay_type
                    )
                print(f"decaying_epsilon_greedy ({epsilon}, {decay_type}) - soma de recompensas: {sum(rewards)}")
            print("", end="\n")
            # print("=-------= [end of decay_type] =-------=")
        # print("=-------= [end of initial_epsilon] =-------=")

decaying_epsilon_greedy (1.0, DecayType.LINEAR) - soma de recompensas: 6199.0
decaying_epsilon_greedy (1.0, DecayType.LINEAR) - soma de recompensas: 6181.0
decaying_epsilon_greedy (1.0, DecayType.LINEAR) - soma de recompensas: 6220.0

decaying_epsilon_greedy (1.0, DecayType.EXPONENTIAL) - soma de recompensas: 7326.0
decaying_epsilon_greedy (1.0, DecayType.EXPONENTIAL) - soma de recompensas: 7300.0
decaying_epsilon_greedy (1.0, DecayType.EXPONENTIAL) - soma de recompensas: 7393.0

decaying_epsilon_greedy (1.0, DecayType.SIGMOID_LOGISTIC) - soma de recompensas: 6250.0
decaying_epsilon_greedy (1.0, DecayType.SIGMOID_LOGISTIC) - soma de recompensas: 6227.0
decaying_epsilon_greedy (1.0, DecayType.SIGMOID_LOGISTIC) - soma de recompensas: 6234.0

decaying_epsilon_greedy (0.1, DecayType.LINEAR) - soma de recompensas: 7388.0
decaying_epsilon_greedy (0.1, DecayType.LINEAR) - soma de recompensas: 7224.0
decaying_epsilon_greedy (0.1, DecayType.LINEAR) - soma de recompensas: 7323.0

decaying_epsilo