In [4]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
import pandas as pd

# Reproducibility: policies are sampled from NumPy's global RNG and the
# environment draws its own random start states, so both are seeded here.
SEED = 0
np.random.seed(SEED)

# Initialise the environment
env = gym.make("CartPole-v1")
env.reset(seed=SEED)  # seed the env RNG once; later reset() calls without a seed keep using it
obs_size = env.observation_space.shape[0]  # 4
n_actions = env.action_space.n  # 2 (left/right)

# Cross-entropy method hyperparameters
N = 100  # number of policies sampled per round
elite_frac = 0.2
# round() guards against float truncation of the product (e.g. 100 * 0.29),
# and the floor of 1 keeps the elite set non-empty: with n_elite == 0 the
# slice [-n_elite:] used for elite selection would pick *every* policy.
n_elite = max(1, int(round(N * elite_frac)))
n_iterations = 30
max_episode_length = 500

# Policy parameters (linear model: action = argmax(W @ obs)).
# mu / std are the element-wise mean and standard deviation of the Gaussian
# from which candidate weight matrices are drawn.
mu = np.zeros((n_actions, obs_size))
std = np.ones((n_actions, obs_size))

# Mean population reward per iteration, kept for plotting
mean_rewards = []

# Run a policy for one episode
def evaluate_policy(weights, environment=None, max_steps=None):
    """Roll out a linear argmax policy for one episode and return its total reward.

    The action at every step is ``argmax(weights @ obs)``.

    Parameters
    ----------
    weights : array of shape (n_actions, obs_size)
        Weight matrix of the linear policy.
    environment : optional
        Object exposing ``reset()`` and a five-value ``step(action)``.
        Defaults to the module-level ``env``.
    max_steps : int, optional
        Upper bound on the number of steps. Defaults to the module-level
        ``max_episode_length``.

    Returns
    -------
    float
        Sum of the rewards collected until the episode terminated, was
        truncated, or ``max_steps`` was reached.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if environment is None:
        environment = env
    if max_steps is None:
        max_steps = max_episode_length

    reset_result = environment.reset()
    # reset() returns (obs, info) as a tuple in Gym v0.26+; a bare observation
    # is tolerated as well.
    obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    total_reward = 0.0
    for _ in range(max_steps):
        # Cast to a built-in int so the environment never receives a NumPy scalar.
        action = int(np.argmax(weights @ obs))
        obs, reward, terminated, truncated, _ = environment.step(action)
        total_reward += reward
        if terminated or truncated:
            break
    return total_reward

# Training loop: sample -> evaluate -> keep the elite -> refit the sampling distribution
for iteration in range(n_iterations):
    # 1. Draw N candidate weight matrices from the current Gaussian
    policies = [np.random.normal(loc=mu, scale=std) for _ in range(N)]

    # 2. Score every candidate with one episode each
    rewards = list(map(evaluate_policy, policies))

    # 3. Keep the n_elite best candidates (argsort is ascending, so they sit at the end)
    elite_indices = np.argsort(rewards)[-n_elite:]
    elite_policies = np.stack(policies)[elite_indices]

    # 4. Refit mean and spread to the elite set, element-wise over the weight matrix
    mu = np.mean(elite_policies, axis=0)
    std = np.std(elite_policies, axis=0)

    # Logging: average reward over the whole sampled population, not just the elite
    avg_reward = np.mean(rewards)
    mean_rewards.append(avg_reward)
    print(f"Iteration {iteration+1}: Ø Reward = {avg_reward:.2f}")

# Visualise the result: mean population reward per CEM iteration, written to a PNG
plot_path = "cem_cartpole_plot.png"
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(mean_rewards)
ax.set_xlabel("Iteration")
ax.set_ylabel("Durchschnittlicher Reward")
ax.set_title("CEM mit CartPole (lineare Policy)")
ax.grid(True)
fig.savefig(plot_path)
plt.close(fig)  # release the figure once it has been written to disk

# Test the final policy: the mean of the last elite set is used as the point estimate
best_policy = mu
# Reuse evaluate_policy instead of a hand-copied rollout loop, so the test
# episode is guaranteed to follow the same logic as the training rollouts.
total_reward = evaluate_policy(best_policy)

# Last expression of the cell: displayed as the cell output
plot_path, total_reward


Iteration 1: Ø Reward = 67.91
Iteration 2: Ø Reward = 206.60
Iteration 3: Ø Reward = 281.57
Iteration 4: Ø Reward = 361.57
Iteration 5: Ø Reward = 388.00
Iteration 6: Ø Reward = 448.25
Iteration 7: Ø Reward = 476.22
Iteration 8: Ø Reward = 468.65
Iteration 9: Ø Reward = 469.06
Iteration 10: Ø Reward = 482.04
Iteration 11: Ø Reward = 470.92
Iteration 12: Ø Reward = 495.04
Iteration 13: Ø Reward = 499.99
Iteration 14: Ø Reward = 499.59
Iteration 15: Ø Reward = 500.00
Iteration 16: Ø Reward = 498.72
Iteration 17: Ø Reward = 500.00
Iteration 18: Ø Reward = 500.00
Iteration 19: Ø Reward = 499.27
Iteration 20: Ø Reward = 500.00
Iteration 21: Ø Reward = 499.17
Iteration 22: Ø Reward = 498.43
Iteration 23: Ø Reward = 499.92
Iteration 24: Ø Reward = 500.00
Iteration 25: Ø Reward = 499.14
Iteration 26: Ø Reward = 496.96
Iteration 27: Ø Reward = 499.26
Iteration 28: Ø Reward = 497.36
Iteration 29: Ø Reward = 499.59
Iteration 30: Ø Reward = 500.00


('cem_cartpole_plot.png', 500.0)

In [5]:
import pandas as pd

# Build the iteration index from the rewards actually recorded rather than
# from n_iterations: if training was interrupted or n_iterations was edited
# afterwards, the two columns would differ in length and pd.DataFrame would
# raise a ValueError.
df = pd.DataFrame({
    "Iteration": list(range(1, len(mean_rewards) + 1)),
    "Average Reward": mean_rewards
})

print(df.tail(5))  # show the last 5 iterations


    Iteration  Average Reward
25         26          496.96
26         27          499.26
27         28          497.36
28         29          499.59
29         30          500.00
