<a href="https://colab.research.google.com/github/pheeree/projectRL_2023/blob/main/%08env_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install GYM by OpenAI

In [12]:
pip install gym



# 2. Import Library

In [13]:
import gym
from gym import spaces
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd

# 3. Modeling RL
## (1) Set Env.

배치로부터 나온 리워드를 바탕으로 다음 환경의 리워드에 제한을 추가하는 동적 환경을 구성하고, 액션에 따른 리워드를 설정함

In [11]:
# Dynamic environment class definition - 2
class DynamicEnv(gym.Env):
    """Three-step episodic environment whose payoffs are rescaled per batch.

    Every episode starts at stage 0 and ends once stage 3 is reached. At
    each step the agent picks action 0 or 1 and receives a stochastic
    reward whose size depends on ``reward_scale``. The scale is recomputed
    from the previous batch's average episode reward by
    ``update_reward_scale``.

    Uses the legacy gym interface: ``reset()`` returns the observation only
    and ``step()`` returns ``(observation, reward, done, info)``.
    """

    def __init__(self):
        super().__init__()
        self.observation_space = spaces.Discrete(4)  # four states (0, 1, 2, 3)
        self.action_space = spaces.Discrete(2)  # two choices (0: keep, 1: change)
        self.stage = 0
        self.action_prob = 0.3  # chance that action 1 yields its positive payoff
        self.reward_scale = 1.0  # divides gains and is added to losses in step()
        # Number of reset() calls since the last scale update; used as the
        # batch size when update_reward_scale() is not given one explicitly.
        self._episodes_since_update = 0

    def step(self, action):
        """Advance one stage and return ``(stage, reward, done, info)``.

        The same reward scheme applies at every stage, including the last:
        action 1 pays ``10 / reward_scale`` with probability ``action_prob``
        and ``-10 + reward_scale`` otherwise; action 0 pays
        ``5 / reward_scale`` with probability ``1 - action_prob`` and
        ``-5 + reward_scale`` otherwise.
        """
        if action == 1:
            win_prob, gain, loss = self.action_prob, 10, -10
        else:
            win_prob, gain, loss = 1 - self.action_prob, 5, -5

        if random.random() < win_prob:
            reward = gain / self.reward_scale
        else:
            reward = loss + self.reward_scale

        self.stage += 1
        done = self.stage == 3  # stage 3 always terminates the episode

        return self.stage, reward, done, {}

    def reset(self):
        """Start a new episode at stage 0 and return the initial observation."""
        self.stage = 0
        self._episodes_since_update += 1
        return self.stage

    def update_reward_scale(self, total_reward_batch, batch_size=None):
        """Set ``reward_scale`` to sqrt(|average episode reward|) of a batch.

        Args:
            total_reward_batch: Sum of episode rewards over the batch.
            batch_size: Number of episodes in the batch. When omitted, the
                number of ``reset()`` calls since the previous update is
                used, so the method no longer depends on a module-level
                ``batch_size`` variable.

        The current scale is kept when no episodes were recorded or when the
        new scale would be zero or non-finite, because ``step()`` divides by
        ``reward_scale``.
        """
        if batch_size is None:
            batch_size = self._episodes_since_update
        self._episodes_since_update = 0

        if batch_size <= 0:
            return

        new_scale = np.sqrt(abs(total_reward_batch / batch_size))
        if new_scale > 0 and np.isfinite(new_scale):
            self.reward_scale = new_scale

## (2) Set Agent

In [14]:
# Agent class definition
class ReinforcementAgent:
    """Tabular agent with SARSA and Q-learning updates.

    Action selection is driven by the sign of the most recent reward, not by
    the Q-table: ``choose_action`` never reads ``self.Q``. The table is only
    written by ``learn_sarsa`` and ``learn_q_learning``.

    Args:
        action_space: Number of discrete actions (the bias logic assumes 2).
        state_space: Number of discrete states.
        learning_rate: Step size of the temporal-difference update.
        gamma: Discount factor.
        epsilon: Probability of picking a uniformly random action.
        reward_bias: Probability of picking the action favoured by the sign
            of the last reward (action 0 after a gain, action 1 after a
            loss). NOTE(review): 0.6 is inferred from the original
            ``> 0.4`` threshold - confirm the intended value.
    """

    def __init__(self, action_space, state_space, learning_rate=0.1,
                 gamma=0.9, epsilon=0.1, reward_bias=0.6):
        self.action_space = action_space
        self.state_space = state_space
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.reward_bias = reward_bias
        self.Q = np.zeros((state_space, action_space))
        self.recent_reward = 0  # set by the training loop after every step

    def choose_action(self, state):
        """Return an action index; ``state`` is accepted but not used.

        With probability ``epsilon``, or when the last reward was exactly 0,
        the action is uniformly random. Otherwise the favoured action is
        returned with probability ``reward_bias`` and the other action
        otherwise.
        """
        explore = random.uniform(0, 1) < self.epsilon
        if explore or self.recent_reward == 0:
            return random.randint(0, self.action_space - 1)

        # The previous test `random.uniform(0.4, 1) > 0.4` was practically
        # always true, which made this branch deterministic. Draw from
        # [0, 1) so the bias is a real probability.
        favoured = 0 if self.recent_reward > 0 else 1
        if random.random() < self.reward_bias:
            return favoured
        return 1 - favoured

    def learn_sarsa(self, state, action, reward, next_state, next_action):
        """Apply the SARSA update using the value of ``(next_state, next_action)``."""
        predict = self.Q[state, action]
        target = reward + self.gamma * self.Q[next_state, next_action]
        self.Q[state, action] += self.learning_rate * (target - predict)

    def learn_q_learning(self, state, action, reward, next_state):
        """Apply the Q-learning update using the best value in ``next_state``."""
        predict = self.Q[state, action]
        target = reward + self.gamma * np.max(self.Q[next_state, :])
        self.Q[state, action] += self.learning_rate * (target - predict)

## (3) 학습
### a. SARSA

In [15]:


# Initialise the environment and the agent
env = DynamicEnv()
agent = ReinforcementAgent(env.action_space.n, env.observation_space.n)

# Batch training configuration and execution
num_batches = 1000
batch_size = 1000

for batch in range(num_batches):
    episode_data_sarsa = []
    total_reward_batch = 0

    for episode in range(batch_size):
        state = env.reset()
        episode_reward_sarsa = 0
        episode_actions_sarsa = []
        episode_rewards_sarsa = []  # per-stage rewards of this episode

        # SARSA is on-policy: the action used in the update target must be
        # the action executed on the next step. Choose once up front and
        # carry `next_action` forward instead of sampling a second time.
        action = agent.choose_action(state)
        done = False
        while not done:
            episode_actions_sarsa.append(action)
            next_state, reward, done, _ = env.step(action)
            episode_rewards_sarsa.append(reward)
            episode_reward_sarsa += reward
            # choose_action() reads recent_reward, so set it before sampling
            agent.recent_reward = reward
            next_action = agent.choose_action(next_state)
            agent.learn_sarsa(state, action, reward, next_state, next_action)
            state, action = next_state, next_action

        episode_data_sarsa.append({
            'Episode': episode + 1,
            'Stage Rewards': episode_rewards_sarsa,
            'Total Reward': episode_reward_sarsa
        })

        total_reward_batch += episode_reward_sarsa

    # Rescale the next batch's rewards from this batch's total
    env.update_reward_scale(total_reward_batch)

    # Build the per-episode data frame for this batch
    df_sarsa = pd.DataFrame(episode_data_sarsa)
    #print(f"Batch {batch + 1} SARSA Episode Data:")
    #print(df_sarsa)
    print(f"Batch {batch + 1} Total Average Reward: {total_reward_batch / batch_size}")

    # Sort episodes by total reward, best first
    df_sorted = df_sarsa.sort_values(by='Total Reward', ascending=False)

    # Mean total reward of the top 10% of episodes
    top_10_avg_reward = df_sorted['Total Reward'].head(int(batch_size*0.1)).mean()

    #print(f"Batch {batch + 1} Top 100 Episodes Average Reward: {top_10_avg_reward}")


Batch 1 Total Average Reward: -1.609
Batch 2 Total Average Reward: -3.2298863788575467
Batch 3 Total Average Reward: -4.473882050757023
Batch 4 Total Average Reward: -5.208926240404278
Batch 5 Total Average Reward: -5.008936390621173
Batch 6 Total Average Reward: -6.129016789970004
Batch 7 Total Average Reward: -5.111849069591287
Batch 8 Total Average Reward: -4.579736399237635
Batch 9 Total Average Reward: -4.460441524304132
Batch 10 Total Average Reward: -4.779030962425977
Batch 11 Total Average Reward: -4.713188932080808
Batch 12 Total Average Reward: -5.532901882641112
Batch 13 Total Average Reward: -4.92687587199279
Batch 14 Total Average Reward: -4.424967707866314
Batch 15 Total Average Reward: -4.743757377924728
Batch 16 Total Average Reward: -4.439556761282552
Batch 17 Total Average Reward: -4.6535706708722
Batch 18 Total Average Reward: -5.251902935894709
Batch 19 Total Average Reward: -5.243917105636825
Batch 20 Total Average Reward: -5.304923321351311
Batch 21 Total Average 

### b. Q-Learning

In [17]:

# Fresh environment and agent for the Q-learning run
env = DynamicEnv()
agent = ReinforcementAgent(env.action_space.n, env.observation_space.n)

# Batch training configuration
num_batches = 1000
batch_size = 1000

for batch in range(num_batches):
    episode_data_q_learning = []
    total_reward_batch = 0

    for episode in range(batch_size):
        state = env.reset()
        episode_reward_q_learning = 0
        episode_actions_q_learning = []
        episode_rewards_q_learning = []  # per-stage rewards of this episode

        # Q-learning: act, observe, update, until the environment reports done
        done = False
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)

            # Book-keeping for this step
            episode_actions_q_learning.append(action)
            episode_rewards_q_learning.append(reward)
            episode_reward_q_learning += reward

            # The agent reads recent_reward on its next choose_action() call
            agent.recent_reward = reward
            agent.learn_q_learning(state, action, reward, next_state)
            state = next_state

        episode_data_q_learning.append({
            'Episode': episode + 1,
            'Stage Rewards': episode_rewards_q_learning,
            'Total Reward': episode_reward_q_learning,
        })
        total_reward_batch += episode_reward_q_learning

    # Rescale the next batch's rewards from this batch's total
    env.update_reward_scale(total_reward_batch)

    # Per-episode results of this batch
    df_q_learning = pd.DataFrame(episode_data_q_learning)
    batch_avg_reward_q_learning = df_q_learning['Total Reward'].mean()
    #print(f"Batch {batch + 1} Q-Learning Episode Data")
    #print(df_q_learning)
    print(f"Batch {batch + 1} Q-Learning Episode Data : ", total_reward_batch / batch_size)

    # Episodes ordered by total reward, best first
    df_sorted_Q = df_q_learning.sort_values('Total Reward', ascending=False)

    # Mean total reward of the top 10% of episodes
    top_count = int(batch_size * 0.1)
    top_10_avg_reward_Q = df_sorted_Q['Total Reward'].iloc[:top_count].mean()

    #print(f"Batch {batch + 1} Top 100 Episodes Average Reward: {top_10_avg_reward_Q}")


Batch 1 Q-Learning Episode Data :  -2.667
Batch 2 Q-Learning Episode Data :  -5.175108674318449
Batch 3 Q-Learning Episode Data :  -5.002784418893757
Batch 4 Q-Learning Episode Data :  -4.860209606985793
Batch 5 Q-Learning Episode Data :  -4.7636010521059635
Batch 6 Q-Learning Episode Data :  -4.572201294601005
Batch 7 Q-Learning Episode Data :  -4.999481456412366
Batch 8 Q-Learning Episode Data :  -5.07395911172904
Batch 9 Q-Learning Episode Data :  -4.460583859765016
Batch 10 Q-Learning Episode Data :  -5.158054332489249
Batch 11 Q-Learning Episode Data :  -5.149385126607728
Batch 12 Q-Learning Episode Data :  -4.637382136591275
Batch 13 Q-Learning Episode Data :  -4.614391372119333
Batch 14 Q-Learning Episode Data :  -4.331656787985188
Batch 15 Q-Learning Episode Data :  -4.74948675841087
Batch 16 Q-Learning Episode Data :  -4.393893270981215
Batch 17 Q-Learning Episode Data :  -4.411235969695822
Batch 18 Q-Learning Episode Data :  -4.4404473011781604
Batch 19 Q-Learning Episode Dat

### c. tables

In [18]:
# Write the per-episode tables to CSV without the index column. Both frames
# are rebuilt on every batch of their training loop, so each file holds only
# the final batch of its run.
for _frame, _csv_path in (
    (df_sarsa, 'df_sarsa_env01.csv'),
    (df_q_learning, 'df_q_learning_env01.csv'),
):
    _frame.to_csv(_csv_path, index=False)