# In [None]:  (Jupyter notebook cell marker left over from export)
# 필요한 라이브러리 임포트
import gym
import numpy as np
import random
import time
from IPython.display import clear_output

# Create the environment
env = gym.make('CartPole-v1')

# Q-learning setup
# Every observation dimension is split into the same number of buckets.
state_space_size = [20 for _ in env.observation_space.high]

# Symmetric limit (-limit .. +limit) covered by the bin edges of each
# observation component; entry i is applied to observation component i.
# Presumably: cart position, cart velocity, pole angle, pole angular
# velocity -- confirm against the CartPole documentation.
# Each component gets (buckets - 1) edges, so np.digitize can return
# indices 0 .. buckets - 1.
_bin_limits = (4.8, 4, 0.418, 4)
state_bins = [
    np.linspace(-limit, limit, buckets - 1)
    for limit, buckets in zip(_bin_limits, state_space_size)
]

# Q-table: one axis per discretized observation component plus a final
# axis over the available actions, initialised to zero.
action_space_size = env.action_space.n
q_table = np.zeros((*state_space_size, action_space_size))

# Update rule coefficients
discount_rate = 0.99
learning_rate = 0.1

# Epsilon-greedy exploration schedule (starts at the maximum rate)
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

# Training budget
max_steps_per_episode = 200
num_episodes = 10000

# State discretization helper
def discretize_state(state, bins=None):
    """Map a continuous observation to a tuple of integer bucket indices.

    Args:
        state: Sequence of observation components.
        bins: Optional sequence of bin-edge sequences, one per component.
            Defaults to the module-level ``state_bins``.

    Returns:
        A tuple holding one ``int`` per component: the index returned by
        ``np.digitize`` for that component against its bin edges.

    Raises:
        ValueError: If ``state`` and ``bins`` differ in length. Without
            this check a short state would silently produce an index of
            the wrong rank.
    """
    if bins is None:
        bins = state_bins
    if len(state) != len(bins):
        raise ValueError(
            f"state has {len(state)} components but {len(bins)} bin sets were given"
        )
    return tuple(
        int(np.digitize(value, edges)) for value, edges in zip(state, bins)
    )

# Training: tabular Q-learning with an epsilon-greedy behaviour policy
rewards_all_episodes = []  # total reward collected in each episode

for episode in range(num_episodes):
    # gym < 0.26 returns the observation alone from reset(); gym >= 0.26
    # returns an (observation, info) tuple.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state_discretized = discretize_state(state)
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Epsilon-greedy choice: exploit with probability 1 - exploration_rate
        if random.uniform(0, 1) > exploration_rate:
            # Greedy action (exploitation)
            action = np.argmax(q_table[state_discretized])
        else:
            # Random action (exploration)
            action = env.action_space.sample()

        # Apply the action. gym >= 0.26 returns a 5-tuple that separates
        # true termination from time-limit truncation; older versions
        # return a 4-tuple with a single `done` flag.
        step_result = env.step(action)
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result
            # Older TimeLimit wrappers flag a pure time-limit cut-off in
            # `info`; anything else that ends the episode is terminal.
            terminated = done and not info.get('TimeLimit.truncated', False)
        new_state_discretized = discretize_state(new_state)

        # Q-value update. A terminal state has no future return, so the
        # bootstrap term must be dropped there; otherwise the value of
        # failing states is overestimated.
        if terminated:
            best_next_value = 0.0
        else:
            best_next_value = np.max(q_table[new_state_discretized])
        td_target = reward + discount_rate * best_next_value
        q_index = state_discretized + (action,)
        q_table[q_index] = q_table[q_index] * (1 - learning_rate) + \
            learning_rate * td_target

        # Advance to the next state
        state_discretized = new_state_discretized
        rewards_current_episode += reward

        if done:
            break

    # Exponentially decay the exploration rate towards its minimum
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

    # Progress report: mean reward over the last 1000 episodes
    if (episode + 1) % 1000 == 0:
        print(f"에피소드: {episode + 1}, 평균 보상: {np.mean(rewards_all_episodes[-1000:]):.2f}")

# Announce that training has finished
print("\n\n***** 학습 완료 *****\n")

# Plot the total reward obtained in each training episode
from matplotlib import pyplot as plt

episode_axis = range(num_episodes)
plt.plot(episode_axis, rewards_all_episodes)
plt.title('에피소드당 보상')
plt.ylabel('보상')
plt.xlabel('에피소드')
plt.show()

# Evaluate the learned greedy policy for a few rendered episodes
for episode in range(5):
    # gym < 0.26 returns the observation alone from reset(); gym >= 0.26
    # returns an (observation, info) tuple.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state_discretized = discretize_state(state)
    done = False
    terminated = False
    print(f"\n***** 에피소드 {episode + 1} *****\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(0.05)

        # Always take the greedy action
        action = np.argmax(q_table[state_discretized])

        # gym >= 0.26 returns a 5-tuple separating true termination from
        # time-limit truncation; older versions return a 4-tuple.
        step_result = env.step(action)
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result
            # Older TimeLimit wrappers flag a pure time-limit cut-off in
            # `info`; anything else that ends the episode is a failure.
            terminated = done and not info.get('TimeLimit.truncated', False)
        state_discretized = discretize_state(new_state)

        if done:
            break

    # Report the outcome once the episode is over. The episode counts as a
    # success when it was never terminated by the environment, i.e. the
    # step budget ran out or only a time limit stopped it. Deciding this
    # after the loop also covers the case where `done` is never raised
    # within max_steps_per_episode, which previously printed nothing.
    clear_output(wait=True)
    env.render()
    if not terminated:
        print(f"\n에피소드 {episode + 1}: 성공적으로 완료!\n")
    else:
        print(f"\n에피소드 {episode + 1}: 실패\n")
    time.sleep(2)

env.close()