In [1]:
import torch
print(torch.cuda.is_available())  # should be True

True


In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
import matplotlib.pyplot as plt
import pandas as pd

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Environment; the network input/output sizes are read from its spaces
env = gym.make("CartPole-v1")
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

# Hyperparameters
N = 100              # number of policy networks sampled per round
elite_frac = 0.2     # fraction of top performers to keep
n_elite = int(N * elite_frac)  # number of top-ranked policies selected per round
n_iterations = 30  # number of rounds run by the training loop
max_episode_length = 500  # upper bound on environment steps per episode
hidden_size = 32     # number of neurons in the hidden layer
lr = 1e-2  # learning rate handed to the Adam optimizer

# Policy network
class PolicyNet(nn.Module):
    """Two-layer MLP that maps an observation to one logit per action.

    Args:
        in_features: Length of the observation vector. Defaults to the
            module-level ``obs_size``.
        hidden_features: Width of the hidden layer. Defaults to the
            module-level ``hidden_size``.
        out_features: Number of logits produced. Defaults to the
            module-level ``n_actions``.
    """

    def __init__(self, in_features=None, hidden_features=None, out_features=None):
        super().__init__()
        # Fall back to the module-level configuration only for sizes that
        # were not given, so ``PolicyNet()`` keeps working unchanged.
        if in_features is None:
            in_features = obs_size
        if hidden_features is None:
            hidden_features = hidden_size
        if out_features is None:
            out_features = n_actions

        # The attribute name and layer order are kept so state-dict keys
        # ("model.0.*", "model.2.*") stay the same.
        self.model = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Return the raw (unnormalised) action logits for ``x``."""
        return self.model(x)

# Evaluate a policy
def evaluate_policy(policy):
    """Play one greedy episode with ``policy`` and return the summed reward.

    The policy is put into eval mode. At every step the action with the
    largest logit is taken. The episode stops on termination, truncation,
    or after ``max_episode_length`` steps, whichever comes first.
    """
    policy.eval()

    # reset() may hand back either the observation or an (obs, info) tuple.
    initial = env.reset()
    obs = initial[0] if isinstance(initial, tuple) else initial

    total_reward = 0
    for _step in range(max_episode_length):
        with torch.no_grad():
            state = torch.as_tensor(obs, dtype=torch.float32, device=device)
            action = policy(state).argmax().item()
        obs, reward, terminated, truncated, _info = env.step(action)
        total_reward += reward
        episode_over = terminated or truncated
        if episode_over:
            break
    return total_reward

# Sample a population of perturbed copies of the base policy
def sample_population(base_policy, std=0.1, size=None):
    """Return independent, noise-perturbed copies of ``base_policy``.

    Args:
        base_policy: The ``nn.Module`` to clone. It is not modified.
        std: Standard deviation of the Gaussian noise added to every
            parameter of each copy.
        size: Number of copies to create. Defaults to the module-level ``N``.

    Returns:
        A list of ``size`` modules with the same architecture as
        ``base_policy`` and parameters equal to the base parameters plus noise.
    """
    import copy  # local import keeps this block self-contained

    if size is None:
        size = N

    population = []
    for _ in range(size):
        # deepcopy clones architecture and weights in one step, so this
        # works for any module and skips a throwaway random initialisation.
        candidate = copy.deepcopy(base_policy)
        with torch.no_grad():
            for param in candidate.parameters():
                param.add_(torch.randn_like(param) * std)
        population.append(candidate)
    return population

# Training loop
base_policy = PolicyNet().to(device)
mean_rewards = []

# Build the optimizer and loss once: re-creating Adam inside the loop would
# throw away its running moment estimates on every iteration.
optimizer = optim.Adam(base_policy.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

for iteration in range(n_iterations):
    # Perturb the current base policy and score each candidate on one episode.
    population = sample_population(base_policy, std=0.1)
    rewards = [evaluate_policy(p) for p in population]

    # Select the top performers (argsort is ascending, so take the tail).
    elite_indices = np.argsort(rewards)[-n_elite:]
    elite_policies = [population[i] for i in elite_indices]

    # Roll out every elite policy once more and record the
    # (observation, greedy action) pairs as supervised training data.
    obs_list, act_list = [], []
    for policy in elite_policies:
        obs = env.reset()
        if isinstance(obs, tuple):
            obs = obs[0]
        for _ in range(max_episode_length):
            obs_tensor = torch.as_tensor(obs, dtype=torch.float32, device=device)
            with torch.no_grad():
                logits = policy(obs_tensor)
            action = torch.argmax(logits).item()
            obs_list.append(obs)
            act_list.append(action)
            obs, _, terminated, truncated, _ = env.step(action)
            if terminated or truncated:
                break

    # One supervised step: fit the base policy to the elite actions.
    base_policy.train()
    # Stack into a single ndarray first; building a tensor straight from a
    # list of ndarrays is slow and makes PyTorch emit a warning.
    obs_batch = torch.as_tensor(np.asarray(obs_list, dtype=np.float32), device=device)
    act_batch = torch.as_tensor(act_list, dtype=torch.long, device=device)

    logits = base_policy(obs_batch)
    loss = loss_fn(logits, act_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Track the mean reward of the sampled population for this round.
    avg_reward = np.mean(rewards)
    mean_rewards.append(avg_reward)
    print(f"Iteration {iteration+1}: Ø Reward = {avg_reward:.2f}")

# Plot the mean population reward per iteration and save it as a PNG
plot_path = "cem_nn_cartpole_plot.png"
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(mean_rewards)
ax.set_xlabel("Iteration")
ax.set_ylabel("Average Reward")
ax.set_title("Cross Entropy Method mit Neuronalen Netz (CartPole)")
ax.grid(True)
fig.savefig(plot_path)
# Close the figure so it is released once written to disk.
plt.close(fig)

# Test the final policy
final_policy = base_policy.eval()
# Reuse evaluate_policy rather than repeating its rollout loop here: it runs
# one greedy episode capped at max_episode_length steps and returns the
# summed reward.
total_reward = evaluate_policy(final_policy)
        
import pandas as pd

# Tabulate the mean reward per iteration (iterations are numbered from 1)
reward_table = {
    "Iteration": [i + 1 for i in range(n_iterations)],
    "Average Reward": mean_rewards,
}
df = pd.DataFrame(reward_table)

print(df.tail())  # shows the last 5 rows


# Notebook display value: plot file, reward of the final test episode, device
plot_path, total_reward, device.type


Iteration 1: Ø Reward = 12.98
Iteration 2: Ø Reward = 15.92
Iteration 3: Ø Reward = 26.77
Iteration 4: Ø Reward = 30.53
Iteration 5: Ø Reward = 37.63
Iteration 6: Ø Reward = 40.87
Iteration 7: Ø Reward = 43.32
Iteration 8: Ø Reward = 48.06
Iteration 9: Ø Reward = 59.81
Iteration 10: Ø Reward = 50.81
Iteration 11: Ø Reward = 59.40
Iteration 12: Ø Reward = 57.00
Iteration 13: Ø Reward = 62.67
Iteration 14: Ø Reward = 57.97
Iteration 15: Ø Reward = 62.27
Iteration 16: Ø Reward = 64.21
Iteration 17: Ø Reward = 70.72
Iteration 18: Ø Reward = 75.98
Iteration 19: Ø Reward = 78.98
Iteration 20: Ø Reward = 76.78
Iteration 21: Ø Reward = 70.79
Iteration 22: Ø Reward = 81.61
Iteration 23: Ø Reward = 77.35
Iteration 24: Ø Reward = 81.58
Iteration 25: Ø Reward = 90.07
Iteration 26: Ø Reward = 88.70
Iteration 27: Ø Reward = 89.69
Iteration 28: Ø Reward = 89.79
Iteration 29: Ø Reward = 93.07
Iteration 30: Ø Reward = 92.19
    Iteration  Average Reward
25         26           88.70
26         27      

('cem_nn_cartpole_plot.png', 135.0, 'cuda')