## 1. Importy a příprava

In [None]:
import numpy as np
import random
import pickle
from pathlib import Path
from multipong.ai import RLPongEnv, encode_state, update_q_value, get_q_value

# Seed both random number generators (stdlib `random` and NumPy's global one)
# with the same fixed value so that repeated runs are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

## 2. Inicializace prostředí a hyperparametrů

In [None]:
# --- Environment -----------------------------------------------------------
env = RLPongEnv(width=400, height=300, paddle_height=60, ball_speed=4)

# --- Q-learning hyperparameters ---------------------------------------------
# Update-rule parameters (passed on to update_q_value during training).
alpha = 0.15      # learning rate
gamma = 0.95      # discount factor

# Exploration schedule.
epsilon = 0.2     # initial exploration rate
decay = 0.995     # multiplicative epsilon decay (used to reduce exploration over time)

# Training length and state discretisation.
episodes = 2000
num_bins = 8      # number of bins handed to encode_state for discretisation

# --- Action space and (initially empty) Q-table -------------------------------
ACTIONS = list(range(3))  # 0=stay, 1=up, 2=down
Q = dict()

# Echo the configuration so it is recorded next to the results.
print(
    "üéÆ Tr√©nink RL agenta na RLPongEnv",
    f"üìä Hyperparametry: Œ±={alpha}, Œ≥={gamma}, Œµ={epsilon}",
    f"üîÅ Epizod: {episodes}",
    sep="\n",
)

## 3. Tréninková smyčka Q-learningu

In [None]:
# Tabular Q-learning with an epsilon-greedy policy.
# Collects the total reward of every episode in `rewards_per_episode` and
# shrinks the exploration rate by `decay` after each episode.
rewards_per_episode = []
current_epsilon = epsilon

for ep in range(episodes):
    # Start a fresh episode and discretise its initial state.
    state = env.reset()
    state_key = encode_state(state, env, num_bins=num_bins)
    total_reward, steps, done = 0, 0, False

    # Run until the environment reports the end or the 500-step cap is reached.
    while not (done or steps >= 500):
        # Epsilon-greedy: explore with probability current_epsilon,
        # otherwise take the action with the highest Q-value.
        explore = random.random() < current_epsilon
        if explore:
            action_idx = random.randrange(len(ACTIONS))
        else:
            action_idx = int(np.argmax(get_q_value(Q, state_key)))

        action = ACTIONS[action_idx]
        next_state, reward, done = env.step(action)
        next_state_key = encode_state(next_state, env, num_bins=num_bins)

        # Q-learning update for the transition just observed.
        # NOTE(review): `done` is not passed to update_q_value -- confirm that
        # terminal transitions are handled as intended.
        update_q_value(Q, state_key, action_idx, next_state_key, reward, alpha=alpha, gamma=gamma)

        total_reward += reward
        steps += 1
        state_key = next_state_key

    rewards_per_episode.append(total_reward)
    current_epsilon *= decay  # explore less as training progresses

    # Progress report every 200 episodes (mean over the last 100 episodes).
    if (ep + 1) % 200 != 0:
        continue
    avg_reward = np.mean(rewards_per_episode[-100:])
    print(f"Ep {ep+1:4d}/{episodes} | Avg reward (100): {avg_reward:7.2f} | Œµ: {current_epsilon:.4f}")

print(f"‚úÖ Tr√©nink dokonƒçen! Q-tabulka m√° {len(Q)} stav≈Ø.")

## 4. Analýza výsledků

In [None]:
# Summary statistics of the training run: Q-table size, number of actions,
# mean reward of the first / last 100 episodes and the reward extremes.
print("\nüìà Statistika tr√©ninku:")
print(f"  Poƒçet stav≈Ø v Q-tabulce: {len(Q)}")
print(f"  Poƒçet akc√≠: {len(ACTIONS)}")

# Compare the start of training with its end.
first_rewards = rewards_per_episode[:100]
print(f"  Pr≈Ømƒõrn√° odmƒõna (prvn√≠ch 100): {np.mean(first_rewards):.2f}")
last_rewards = rewards_per_episode[-100:]
print(f"  Pr≈Ømƒõrn√° odmƒõna (posledn√≠ch 100): {np.mean(last_rewards):.2f}")

# Worst and best single-episode reward over the whole run.
lowest_reward = np.min(rewards_per_episode)
print(f"  Min. odmƒõna: {lowest_reward:.2f}")
highest_reward = np.max(rewards_per_episode)
print(f"  Max. odmƒõna: {highest_reward:.2f}")

## 5. Vizualizace křivky učení

In [None]:
def _moving_average(values, window):
    """Return the trailing moving average of *values*.

    Element ``i`` of the result is the mean of the ``window`` most recent
    entries ending at index ``i``; near the start, where fewer than
    ``window`` entries exist, all available entries are averaged.
    """
    return [
        np.mean(values[max(0, end - window):end])
        for end in range(1, len(values) + 1)
    ]


# Only the import is guarded: an ImportError raised later (while plotting)
# must not be misreported as "matplotlib is not installed".
try:
    import matplotlib.pyplot as plt
except ImportError:
    print("‚ö†Ô∏è matplotlib nen√≠ nainstalov√°n, grafy se nevykresluji")
else:
    # Smoothed curve: moving average over exactly `window` episodes.
    window = 50
    smoothed = _moving_average(rewards_per_episode, window)

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Plot 1: raw and smoothed per-episode rewards.
    # NOTE(review): the title text looks like a typo for "learning curve" -- confirm.
    axes[0].plot(rewards_per_episode, alpha=0.3, label="Surov√© odmƒõny")
    axes[0].plot(smoothed, linewidth=2, label="Vyhlazeno (klouzav√Ω pr≈Ømƒõr)")
    axes[0].set_xlabel("Epizoda")
    axes[0].set_ylabel("Celkov√° odmƒõna za epizodu")
    axes[0].set_title("Kr√°tka uƒçen√≠ RL agenta")
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Plot 2: running mean over the last `window2` episodes. The window now
    # really spans 100 episodes, matching the axis label and title.
    window2 = 100
    running_avg = _moving_average(rewards_per_episode, window2)
    axes[1].plot(running_avg, linewidth=2, color="green")
    axes[1].set_xlabel("Epizoda")
    axes[1].set_ylabel("Pr≈Ømƒõrn√° odmƒõna (100 epizod)")
    axes[1].set_title("Konvergence - pr≈Ømƒõr posledn√≠ch 100 epizod")
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
    print("‚úÖ Grafy vykresleny")

## 6. Evaluace natrénovaného agenta

In [None]:
# Evaluate the trained agent with a purely greedy policy (no exploration):
# in every step the action with the highest Q-value is taken.
eval_episodes = 100
eval_rewards = []

for ep in range(eval_episodes):
    state = env.reset()
    state_key = encode_state(state, env, num_bins=num_bins)
    total_reward, steps, done = 0, 0, False

    # Same 500-step cap per episode as during training.
    while not (done or steps >= 500):
        # Greedy choice: index of the best-valued action for this state.
        action_idx = int(np.argmax(get_q_value(Q, state_key)))
        action = ACTIONS[action_idx]

        next_state, reward, done = env.step(action)
        state_key = encode_state(next_state, env, num_bins=num_bins)

        total_reward += reward
        steps += 1

    eval_rewards.append(total_reward)

# Summary of the evaluation run.
print("\nüéØ EVALUACE (bez explorace):")
print(f"  Epizod: {eval_episodes}")
summary_rows = (
    ("  Pr≈Ømƒõrn√° odmƒõna", np.mean),
    ("  Std. dev", np.std),
    ("  Min", np.min),
    ("  Max", np.max),
)
for label, statistic in summary_rows:
    print(f"{label}: {statistic(eval_rewards):.2f}")

## 7. Uložení modelu

In [None]:
# Make sure the target directory for models exists (the path is relative to
# the current working directory).
models_dir = Path("multipong/ai/models")
models_dir.mkdir(parents=True, exist_ok=True)

# Serialise the Q-table with pickle.
model_path = models_dir.joinpath("q_table_pong.pkl")
with model_path.open("wb") as model_file:
    pickle.dump(Q, model_file)

# Report where the model was written and its size on disk.
size_kb = model_path.stat().st_size / 1024
print(f"‚úÖ Model ulo≈æen: {model_path}")
print(f"   Velikost: {size_kb:.1f} KB")

## 8. Načtení a test

In [None]:
# Round-trip check: read the pickled Q-table back and compare it with the
# in-memory one.
# Only unpickle files you trust -- pickle can execute arbitrary code on load.
with model_path.open("rb") as model_file:
    Q_loaded = pickle.load(model_file)

print("‚úÖ Model naƒçten zpƒõt!")
print(f"   Stav≈Ø v Q-tabulce: {len(Q_loaded)}")

# NOTE(review): if the Q-table values are NumPy arrays, dict `==` may raise
# ValueError (ambiguous truth value) -- confirm the value type.
tables_match = Q == Q_loaded
print(f"   Kontrola: Q-tabulka je identick√°: {tables_match}")