# In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# SARSA TABLE
class SarsaTable():
    """Tabular SARSA agent with an epsilon-greedy behaviour policy.

    Q-values are stored in ``temp_q_table``, a dict that maps
    ``str(observation)`` to a list holding one Q-value per action.
    Rows are created lazily (filled with zeros) the first time a state
    is seen, either while choosing an action or while learning.
    """

    # Amount subtracted from epsilon after every call to choose_action.
    EPSILON_DECAY = 1e-3
    # Epsilon is a probability, so it is never allowed to drop below this.
    EPSILON_MIN = 0.0

    def __init__(self, num_actions, learning_rate, discount_factor, epsilon_greedy):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            num_actions: Number of discrete actions (an int).
            learning_rate: Step size applied to the TD error.
            discount_factor: Discount applied to the next state's Q-value.
            epsilon_greedy: Initial probability of taking a random action.
        """
        self.actions = num_actions  # number of actions (an int, not a list)
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon_greedy
        self.temp_q_table = dict()

    def _q_values(self, state_key):
        """Return the Q-value row of ``state_key``, creating a zero row if new."""
        if state_key not in self.temp_q_table:
            self.temp_q_table[state_key] = [0] * self.actions
        return self.temp_q_table[state_key]

    # Select an action
    def choose_action(self, observation):
        """Pick an action epsilon-greedily and decay epsilon.

        With probability ``1 - epsilon`` one of the actions with the
        highest Q-value is returned (ties are broken uniformly at
        random); otherwise a uniformly random action is returned.

        Args:
            observation: Current state; its ``str()`` is the table key.

        Returns:
            The index of the selected action.
        """
        q_values = self._q_values(str(observation))

        if np.random.rand() > self.epsilon:
            # Exploit: choose randomly among all maximising actions.
            best_actions = np.flatnonzero(np.asarray(q_values) == np.max(q_values))
            action = np.random.choice(best_actions)
        else:
            # Explore: uniformly random action index.
            action = np.random.choice(self.actions)

        # Linear decay, clamped so epsilon never becomes negative.
        self.epsilon = max(self.EPSILON_MIN, self.epsilon - self.EPSILON_DECAY)

        return action

    # Run one learning step
    def learn(self, s, a, r, s_, a_, done):
        """Apply one SARSA update to ``Q(s, a)``.

        The target is ``r + gamma * Q(s_, a_)`` for a non-terminal
        transition and just ``r`` when ``done`` is true.  States that
        were never seen before get a zero row instead of raising
        ``KeyError``.

        Args:
            s: State in which ``a`` was taken.
            a: Index of the action taken in ``s``.
            r: Reward received.
            s_: Next state.
            a_: Index of the action chosen in ``s_`` (on-policy).
            done: True when ``s_`` is terminal.
        """
        state_row = self._q_values(str(s))
        q_pred = state_row[a]

        if not done:
            # Next state is not terminal: bootstrap from Q(s_, a_).
            q_target = r + self.gamma * self._q_values(str(s_))[a_]
        else:
            q_target = r

        state_row[a] += self.lr * (q_target - q_pred)  # update


# Q TABLE
class QTable():
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Q-values are stored in ``temp_q_table``, a dict that maps
    ``str(observation)`` to a list holding one Q-value per action.
    Rows are created lazily (filled with zeros) the first time a state
    is seen, either while choosing an action or while learning.
    """

    # Amount subtracted from epsilon after every call to choose_action.
    EPSILON_DECAY = 1e-3
    # Epsilon is a probability, so it is never allowed to drop below this.
    EPSILON_MIN = 0.0

    def __init__(self, num_actions, learning_rate, discount_factor, epsilon_greedy):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            num_actions: Number of discrete actions (an int).
            learning_rate: Step size applied to the TD error.
            discount_factor: Discount applied to the next state's value.
            epsilon_greedy: Initial probability of taking a random action.
        """
        self.actions = num_actions  # number of actions (an int, not a list)
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon_greedy
        self.temp_q_table = dict()

    def _q_values(self, state_key):
        """Return the Q-value row of ``state_key``, creating a zero row if new."""
        if state_key not in self.temp_q_table:
            self.temp_q_table[state_key] = [0] * self.actions
        return self.temp_q_table[state_key]

    # Select an action
    def choose_action(self, observation):
        """Pick an action epsilon-greedily and decay epsilon.

        With probability ``1 - epsilon`` one of the actions with the
        highest Q-value is returned (ties are broken uniformly at
        random); otherwise a uniformly random action is returned.

        Args:
            observation: Current state; its ``str()`` is the table key.

        Returns:
            The index of the selected action.
        """
        q_values = self._q_values(str(observation))

        if np.random.rand() > self.epsilon:
            # Exploit: choose randomly among all maximising actions.
            best_actions = np.flatnonzero(np.asarray(q_values) == np.max(q_values))
            action = np.random.choice(best_actions)
        else:
            # Explore: uniformly random action index.
            action = np.random.choice(self.actions)

        # Linear decay, clamped so epsilon never becomes negative.
        self.epsilon = max(self.EPSILON_MIN, self.epsilon - self.EPSILON_DECAY)

        return action

    # Run one learning step
    def learn(self, s, a, r, s_, a_, done):
        """Apply one Q-learning update to ``Q(s, a)``.

        The target is ``r + gamma * max(Q(s_, .))`` for a non-terminal
        transition and just ``r`` when ``done`` is true.  States that
        were never seen before get a zero row instead of raising
        ``KeyError`` (the next state is typically new at this point).

        Args:
            s: State in which ``a`` was taken.
            a: Index of the action taken in ``s``.
            r: Reward received.
            s_: Next state.
            a_: Unused; accepted so the signature matches the SARSA agents.
            done: True when ``s_`` is terminal.
        """
        state_row = self._q_values(str(s))
        q_pred = state_row[a]

        if not done:
            # Next state is not terminal: bootstrap from the greedy value.
            q_target = r + self.gamma * np.max(self._q_values(str(s_)))
        else:
            q_target = r

        state_row[a] += self.lr * (q_target - q_pred)  # update

# Deep SARSA implementation
class DeepSarsa():
    """SARSA agent whose Q-function is approximated by a small Keras MLP.

    The network receives one observation and outputs one Q-value per
    action.  The size of the output layer follows ``num_actions``.
    """

    # Amount subtracted from epsilon after every call to choose_action.
    EPSILON_DECAY = 1e-4
    # Epsilon is a probability, so it is never allowed to drop below this.
    EPSILON_MIN = 0.0

    def __init__(self, num_actions, learning_rate, discount_factor, epsilon_greedy):
        """Store the hyper-parameters and build the Q-network.

        Args:
            num_actions: Number of discrete actions (an int); also the
                width of the network's output layer.
            learning_rate: Learning rate passed to the Adam optimizer.
            discount_factor: Discount applied to the next state's Q-value.
            epsilon_greedy: Initial probability of taking a random action.
        """
        self.actions = num_actions  # number of actions (an int, not a list)
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon_greedy

        # Initialize a neural network
        self.prepare_brain()

    def choose_action(self, observation):
        """Pick an action epsilon-greedily from the network's Q-values.

        Prints the observation, the predicted Q-values, the selected
        action and the decayed epsilon as a side effect.

        Args:
            observation: Current state; wrapped into a batch of size one
                before being fed to the model.

        Returns:
            The index of the selected action.
        """
        observation = np.array([observation])
        predict_present_q = self.model.predict(observation)

        print("현재 관측은 {} 입니다.".format(observation))
        print("관측에 따른 Q value 결과는 {} 입니다.".format(predict_present_q))

        if np.random.rand() > self.epsilon:
            # Exploit: action with the largest predicted Q-value.
            temp_state_action = np.argmax(predict_present_q)
            action = temp_state_action
            print("현재 관측 결과 중, 가장 큰 값은 {} 입니다.".format(temp_state_action))
        else:
            # Explore: uniformly random action index.
            action = np.random.choice(self.actions)
            print("랜덤 결과는, {} 입니다.".format(action))

        # Linear decay, clamped so epsilon never becomes negative.
        self.epsilon = max(self.EPSILON_MIN, self.epsilon - self.EPSILON_DECAY)
        print("입실론 값", self.epsilon)

        return action

    # Initialize a neural network
    def prepare_brain(self):
        """Build and compile the Q-network (two hidden layers of 128 units).

        The output layer has ``self.actions`` linear units so the network
        always matches the action space given to the constructor.
        """
        self.model = tf.keras.Sequential([
            layers.Dense(128, activation='relu'),
            layers.Dense(128, activation='relu'),
            layers.Dense(self.actions, activation='linear')])

        self.optimizer = tf.keras.optimizers.Adam(self.lr)
        self.model.compile(optimizer=self.optimizer, loss='mse', metrics=['mae'])

    # Train neural network
    def learn(self, s, a, r, s_, a_, done):
        """Fit the network for one step towards the SARSA target of ``(s, a)``.

        The training label equals the current prediction for ``s`` with
        only the entry of action ``a`` replaced by the target, so the
        other outputs contribute no error.

        Args:
            s: State in which ``a`` was taken.
            a: Index of the action taken in ``s``.
            r: Reward received.
            s_: Next state.
            a_: Index of the action chosen in ``s_`` (on-policy).
            done: True when ``s_`` is terminal.
        """
        s = np.array([s])
        s_ = np.array([s_])
        present_q = self.model.predict(s)

        if not done:
            # Only evaluate the next state when its value is actually needed.
            predict_next_q = self.model.predict(s_)
            temp_q_target = r + self.gamma * predict_next_q[0][a_]
        else:
            temp_q_target = r

        # Keep every Q-value except the one of action a unchanged.
        label = np.array(present_q, copy=True)
        label[0][a] = temp_q_target

        # Train for a single step.
        self.model.fit(s, label)
