<a href="https://colab.research.google.com/github/mengju06/Python/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

# Q-table

In [None]:
# Grid points run from x = 0 to x = 1 in increments of 0.05, both ends included.
num_states = 1 + int(1 / 0.05)
# Two moves are available in every state: raise x by 0.05 or lower it by 0.05.
num_actions = 2
# One row per state, one column per action; every entry starts at zero.
q_table = np.zeros(shape=(num_states, num_actions))

# 參數

In [None]:
# Q-learning hyperparameters, in order:
#   alpha   - learning rate
#   gamma   - discount factor
#   epsilon - exploration probability for the epsilon-greedy policy
alpha, gamma, epsilon = 0.1, 0.9, 0.2

# 將 x 映射到狀態索引

In [None]:
def map_to_state(x, step=0.05):
    """Convert a coordinate ``x`` into the index of its grid cell.

    Parameters
    ----------
    x : float
        Coordinate to discretise.
    step : float, optional
        Width of one grid cell. Defaults to 0.05.

    Returns
    -------
    int
        ``x / step`` truncated to an integer, with a small tolerance for
        floating-point error.
    """
    # Binary floating point can put an exact multiple of `step` just below the
    # integer it should equal (0.15 / 0.05 evaluates to 2.9999999999999996),
    # and plain truncation would then report the previous cell. The nudge is
    # far smaller than a grid cell, so off-grid values keep their cell.
    return int(x / step + 1e-9)

# 目標函數

In [None]:
def target_function(x):
    """Evaluate the objective ``-(x - 0.5)**2 - 28`` at ``x``.

    The expression is a downward-opening parabola whose largest value, -28,
    is attained at ``x = 0.5``.
    """
    deviation = x - 0.5
    return -(deviation ** 2) - 28

# Q-learning

In [None]:
# Tabular Q-learning on the 1-D grid. Every episode starts at x = 0 and moves
# one grid cell left or right per step until a move barely changes the
# function value.
num_episodes = 50
# An episode stops once a move changes the function value by less than this.
# On the 0.05 grid only the moves between 0.45/0.55 and 0.5 qualify
# (|delta f| = 0.0025).
threshold = 0.003
grid_step = 0.05  # spacing between neighbouring states
max_steps_per_episode = 10_000  # safety cap so a stuck policy cannot hang the cell
last_state = q_table.shape[0] - 1

for episode in range(num_episodes):
    state = 0  # index of x = 0

    for _ in range(max_steps_per_episode):
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
        if np.random.rand() < epsilon:
            action = np.random.choice(num_actions)
        else:
            action = np.argmax(q_table[state, :])

        # Action 0 moves one cell right, action 1 one cell left. The index is
        # stepped with integer arithmetic and clamped to the table, so it never
        # depends on a float -> index round trip.
        if action == 0:
            new_state = min(last_state, state + 1)
        else:
            new_state = max(0, state - 1)

        old_x = state * grid_step
        new_x = new_state * grid_step
        reward = target_function(new_x)

        # Q-learning update:
        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[new_state, :]))

        # Report the entry that was just updated.
        print(f"Q({state}, {action}) = {q_table[state, action]:.4f}, x: {new_x:.2f}, Function value: {target_function(new_x):.4f}")

        # Compare the function value before and after the move. A move that was
        # clamped at a boundary leaves x unchanged and must not count as
        # convergence.
        moved = new_state != state
        converged = moved and np.abs(reward - target_function(old_x)) < threshold

        # Advance to the new state.
        state = new_state

        if converged:
            print(f"Terminating episode {episode} because the function value difference is below the threshold.")
            break

Q(0, 0) = -2.8203, x: 0.05, Function value: -28.2025
Q(1, 0) = -2.8160, x: 0.10, Function value: -28.1600
Q(2, 0) = -2.8123, x: 0.15, Function value: -28.1225
Q(3, 0) = -2.8090, x: 0.20, Function value: -28.0900
Q(4, 0) = -2.8063, x: 0.25, Function value: -28.0625
Q(5, 0) = -2.8040, x: 0.30, Function value: -28.0400
Q(6, 1) = -2.8063, x: 0.25, Function value: -28.0625
Q(5, 1) = -2.8090, x: 0.20, Function value: -28.0900
Q(4, 1) = -2.8123, x: 0.15, Function value: -28.1225
Q(3, 1) = -2.8160, x: 0.10, Function value: -28.1600
Q(2, 1) = -2.8203, x: 0.05, Function value: -28.2025
Q(1, 1) = -2.8250, x: 0.00, Function value: -28.2500
Terminating episode 0 because the function value difference is below the threshold.
Q(0, 0) = -5.6119, x: 0.05, Function value: -28.2025
Q(1, 0) = -5.6035, x: 0.10, Function value: -28.1600
Q(2, 0) = -5.5961, x: 0.15, Function value: -28.1225
Q(3, 0) = -5.5897, x: 0.20, Function value: -28.0900
Q(4, 0) = -5.5842, x: 0.25, Function value: -28.0625
Q(5, 0) = -5.32

# 訓練後

In [None]:
# Set the last state's row to negative infinity in the table that is displayed.
# The masking is applied to a copy so the learned `q_table` stays intact:
# writing -inf into it would leak into later training runs through
# np.max(q_table[new_state, :]) and make this cell non-idempotent.
q_table_display = q_table.copy()
q_table_display[-1, :] = -np.inf

# Find the grid point with the largest function value by evaluating
# target_function at every grid point. The grid is built from integer indices
# because np.arange with a fractional step can produce an unexpected number
# of points.
grid_x = np.arange(q_table.shape[0]) * 0.05
best_index = int(np.argmax([target_function(x) for x in grid_x]))
max_value_x = best_index * 0.05
max_value = target_function(max_value_x)

print(f"最大值對應的x：{max_value_x:.2f}")
print(f"最大值：{max_value:.4f}")

# Tabulate the Q-values: one row per state, one column per action.
q_table_df = pd.DataFrame(q_table_display, columns=["+0.05", "-0.05"])
q_table_df.index.name = 'State'
print(q_table_df)

最大值對應的x：0.50
最大值：-28.0000
           +0.05      -0.05
State                      
0     -60.924351 -62.258141
1     -62.310485 -62.361930
2     -59.608804 -63.524227
3     -61.604013 -60.815630
4     -61.265103 -62.734819
5     -61.462643 -62.628697
6     -61.549295 -62.712031
7     -61.870706 -62.510195
8     -61.987902 -62.768877
9     -59.752577 -62.723849
10    -58.811026 -60.312007
11    -57.084486 -59.310591
12    -57.506405 -57.865983
13    -57.360293 -58.287566
14    -57.181430 -58.175206
15    -57.355274 -58.188110
16    -57.598801 -58.131527
17    -57.842599 -58.271649
18    -57.858435 -58.277562
19    -55.678194 -57.876463
20          -inf       -inf


In [None]:
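# NOTE(review): disabled 2-D draft, kept for reference. It adds 0.05 to the integer
# state indices (e.g. `state_x + 0.05`) instead of to x/y coordinates, so
# `map_to_state` can return indices outside the Q-table -- consistent with the
# IndexError recorded below this cell.
# import numpy as np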
# import pandas as pd

# # 超參數
# alpha = 0.1  # 學習速率
# gamma = 0.9  # 折扣因子
# epsilon = 0.2  # epsilon-greedy

# # 初始化 Q-table
# num_states_x = int(1 / 0.05) + 1
# num_states_y = int(1 / 0.05) + 1
# num_actions = 4  # 四種行動，x增加/減少、y增加/減少
# q_table = np.zeros((num_states_x, num_states_y, num_actions))

# # 定義映射函數
# def map_to_state(x, y):
#     state_x = int(x / 0.05)
#     state_y = int(y / 0.05)
#     return state_x, state_y

# # 定義目標函數
# def target_function(x, y):
#     return -(x - 0.5)**2 - (y - 0.5)**2 - 28

# # 訓練參數
# num_episodes = 50
# threshold = 0.003

# # 訓練迴圈
# for episode in range(num_episodes):
#     state_x, state_y = map_to_state(0, 0)
#     while True:
#         # epsilon-greedy
#         if np.random.rand() < epsilon:
#             action = np.random.choice(num_actions)
#         else:
#             action = np.argmax(q_table[state_x, state_y, :])

#         # 根據行動選擇更新 x 和 y
#         if action == 0:
#             new_x = min(1, state_x + 0.05)
#             new_y = state_y  # No change in y
#         elif action == 1:
#             new_x = max(0, state_x - 0.05)
#             new_y = state_y  # No change in y
#         elif action == 2:
#             new_x = state_x  # No change in x
#             new_y = min(1, state_y + 0.05)
#         else:
#             new_x = state_x  # No change in x
#             new_y = max(0, state_y - 0.05)

#         new_state_x, new_state_y = map_to_state(new_x, new_y)
#         reward = target_function(new_x, new_y)

#         # 更新Q值
#         if new_x <= 1 and new_y <= 1:
#             q_table[state_x, state_y, action] = (1 - alpha) * q_table[state_x, state_y, action] + alpha * (reward + gamma * np.max(q_table[new_state_x, new_state_y, :]))

#         # 轉到新狀態
#         state_x, state_y = new_state_x, new_state_y

#         # 檢查終止條件
#         if np.abs(reward - target_function(*map_to_state(new_x, new_y))) < threshold:
#             print(f"Terminating episode {episode} because the function value difference is below the threshold.")
#             break

# # 找到最大值對應的 x 和 y
# max_values = np.unravel_index(np.argmax(q_table), q_table.shape)
# max_x = max_values[0] * 0.05
# max_y = max_values[1] * 0.05
# max_value = target_function(max_x, max_y)

# print(f"最大值對應的 x：{max_x:.2f}")
# print(f"最大值對應的 y：{max_y:.2f}")
# print(f"最大值：{max_value:.4f}")


IndexError: ignored