# Reinforcement learning - Blackjack

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from BlackjackAgent import BlackjackMC

### Wstępne sprawdzenie działania środowiska

In [None]:
# Smoke test: play one episode with random actions and print every transition.
# sab=True -> follow the exact Sutton & Barto rules (the `natural` flag is then ignored).
env = gym.make("Blackjack-v1", render_mode="human", sab=True)
observation, info = env.reset()

try:
    episode_over = False
    while not episode_over:
        action = env.action_space.sample()
        next_observation, reward, terminated, truncated, info = env.step(action)

        # Print the observation the action was chosen in; it is advanced below
        # so that later iterations do not keep showing the reset() observation.
        print(f"State: {observation}, Action: {action}, Reward: {reward}")

        observation = next_observation
        episode_over = terminated or truncated
finally:
    # Release the render window even if a step raises.
    env.close()

### Trenowanie agenta

In [None]:
# Fresh environment without a render mode, used for training and the cells below.
env = gym.make("Blackjack-v1", sab=True)

# Hyperparameters are handed straight to the project's BlackjackMC agent.
agent = BlackjackMC(
    env,
    discount_factor=0.9,
    epsilon=0.1,
)

# NOTE(review): the argument is presumably the number of training episodes - confirm in BlackjackAgent.
agent.train(500_000)

### Testy

##### Krzywa uczenia

In [None]:
# Moving average of agent.reward_history over a 1000-sample window
# (presumably one entry per training episode - confirm in BlackjackAgent).
window = 1000
averaging_kernel = np.full(window, 1 / window)
rolling_avg = np.convolve(agent.reward_history, averaging_kernel, mode='valid')

plt.plot(rolling_avg)
plt.grid(True)
plt.xlabel("Epizod")
plt.ylabel("Średnia nagroda")
plt.title("Średnia nagroda agenta w czasie")
plt.show()

##### Polityka podejmowania decyzji

In [None]:
# Tabulate the agent's chosen action for every (player sum, dealer card) pair,
# once with a usable ace (flag 1) and once without (flag 0).
player_sums = range(4, 22)    # rows: player sum 4-21
dealer_cards = range(1, 11)   # columns: dealer card 1-10

grid_shape = (len(player_sums), len(dealer_cards))
usable_ace = np.zeros(grid_shape)
no_usable_ace = np.zeros(grid_shape)

for row, player_total in enumerate(player_sums):
    for col, dealer_card in enumerate(dealer_cards):
        # Query the usable-ace variant first, then the no-ace variant, for each cell.
        for ace_flag, grid in ((1, usable_ace), (0, no_usable_ace)):
            grid[row, col] = agent.get_action((player_total, dealer_card, ace_flag))

In [None]:
# Draw both policy grids side by side: rows are player sums 4-21 and
# columns are dealer cards 1-10, as filled in by the grid-building cell.
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(8, 6)

panels = (
    (ax1, usable_ace, 'Jeśli gracz posiada używalnego asa'),
    (ax2, no_usable_ace, 'Jeśli gracz nie posiada używalnego asa'),
)
for ax, policy_grid, title in panels:
    # origin='lower' puts row 0 (player sum 4) at the bottom so the image agrees
    # with the ascending y-axis; the default origin='upper' would draw the grid
    # upside down relative to the tick labels. The half-unit padding in `extent`
    # centres every cell on its integer tick.
    ax.imshow(
        policy_grid,
        cmap='gray',
        origin='lower',
        extent=[0.5, 10.5, 3.5, 21.5],
    )
    ax.set_title(title)
    ax.set_xlabel("Karta krupiera")
    ax.set_xticks(ticks=np.arange(1, 11, 1))
    ax.set_ylabel("Suma wartości kart gracza")
    ax.set_yticks(ticks=np.arange(4, 22, 1))

##### Performance agenta

In [None]:
def simulate(num_episodes: int, policy, environment=None):
    """Play complete episodes with ``policy`` and tally the outcomes.

    Args:
        num_episodes: Number of episodes to play.
        policy: Callable that maps the current observation to an action.
        environment: Object exposing ``reset()`` (returning ``(observation, info)``)
            and ``step(action)`` (returning the five-value
            ``(observation, reward, terminated, truncated, info)`` tuple).
            Defaults to the notebook-level ``env``.

    Returns:
        A ``(wins, draws, losses)`` tuple of episode counts, classified by the
        sign of the reward received on the final step of each episode.
    """
    if environment is None:
        environment = env  # fall back to the notebook's global environment

    wins, draws, losses = 0, 0, 0
    for _ in range(num_episodes):
        state, _info = environment.reset()
        done = False
        while not done:
            action = policy(state)
            state, reward, terminated, truncated, _info = environment.step(action)
            # An episode is over on truncation as well as on termination;
            # ignoring `truncated` could loop forever on a time-limited env.
            done = terminated or truncated

        # Classify by sign rather than `== 1`, so a positive payout other than
        # exactly 1 (e.g. a 1.5 natural bonus) is not counted as a loss.
        if reward > 0:
            wins += 1
        elif reward == 0:
            draws += 1
        else:
            losses += 1
    return wins, draws, losses

In [None]:
# Evaluate the trained agent's policy over a fixed number of episodes
# and report each outcome as a count and a percentage.
num_episodes = 1000
wins, draws, losses = simulate(num_episodes, agent.get_action)

print("Wytrenowany agent:")
for label, count in (("Wygrane", wins), ("Remisy", draws), ("Przegrane", losses)):
    print(f"{label}: {count} ({(count / num_episodes * 100):.2f}%)")

In [None]:
# Baseline: the same evaluation with actions sampled uniformly from the
# environment's action space, ignoring the observation.
num_episodes = 1000
wins, draws, losses = simulate(num_episodes, lambda _state: env.action_space.sample())

print("Losowy agent:")
for label, count in (("Wygrane", wins), ("Remisy", draws), ("Przegrane", losses)):
    print(f"{label}: {count} ({(count / num_episodes * 100):.2f}%)")