<a href="https://colab.research.google.com/github/manikanta-eng/Reinforcement-learning/blob/main/rml_lab_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install gymnasium
import gymnasium as gym # Changed to gymnasium
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

# Neural Network Policy
class PolicyNet(nn.Module):
    """Small MLP policy that maps an observation to one logit per action.

    The network is ``Linear(obs_dim, 128) -> ReLU -> Linear(128, act_dim)``.

    Args:
        obs_dim: Length of the observation vector.
        act_dim: Number of discrete actions (length of the logit vector).
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 128),
            nn.ReLU(),
            nn.Linear(128, act_dim),
        )

    def forward(self, x):
        """Return the raw (unnormalised) action logits for ``x``."""
        return self.net(x)

    def act(self, obs):
        """Return the greedy (arg-max) action for one observation.

        Args:
            obs: A single observation. May be a tensor or anything
                ``torch.as_tensor`` accepts (NumPy array, list of floats).
                A batch of observations is not supported because the result
                is converted with ``.item()``.

        Returns:
            The index of the largest logit as a Python ``int``.
        """
        # Match the model's own dtype/device so raw environment observations
        # (typically NumPy arrays) can be passed in directly.
        ref_param = next(self.parameters())
        obs = torch.as_tensor(obs, dtype=ref_param.dtype, device=ref_param.device)
        # Action selection is pure inference; skip building an autograd graph.
        with torch.no_grad():
            logits = self.forward(obs)
        return torch.argmax(logits, dim=-1).item()

# Generate Expert Demonstrations (using a simple heuristic)
def generate_expert_data(env, num_episodes=10):
    """Roll out a hand-written heuristic and record what it saw and did.

    The heuristic looks only at ``obs[2]`` (the pole angle in CartPole) and
    picks action 0 when it is negative, otherwise action 1, i.e. it pushes
    toward the side the pole is falling to.

    Args:
        env: An environment following the Gymnasium API: ``reset()`` returns
            ``(obs, info)`` and ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        num_episodes: Number of complete episodes to roll out.

    Returns:
        A tuple ``(observations, actions)`` of NumPy arrays with one entry per
        environment step taken. The observation that ends an episode is not
        recorded because no action is chosen for it.
    """
    data_obs, data_act = [], []
    for _ in range(num_episodes):
        obs, _info = env.reset()
        done = False
        while not done:
            # Only the angle drives the decision; nothing else is read from
            # the observation.
            action = 0 if obs[2] < 0 else 1
            data_obs.append(obs)
            data_act.append(action)
            obs, _reward, terminated, truncated, _info = env.step(action)
            # An episode ends on failure (terminated) or time limit (truncated).
            done = terminated or truncated
    return np.array(data_obs), np.array(data_act)

# Train Behavioral Cloning Policy
def train_behavioral_cloning(num_expert_episodes=10, epochs=100, lr=1e-3,
                             eval_episodes=5):
    """Train a policy on CartPole-v1 by imitating a heuristic expert.

    Steps: collect expert (observation, action) pairs, fit ``PolicyNet`` to
    them with full-batch cross-entropy, then run the greedy policy in the
    environment and print its average episode reward.

    Args:
        num_expert_episodes: Episodes of expert data to collect.
        epochs: Number of full-batch gradient steps.
        lr: Adam learning rate.
        eval_episodes: Episodes used to estimate the trained policy's reward.

    Returns:
        None. Progress and the final average reward are printed.
    """
    env = gym.make("CartPole-v1")
    try:
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.n

        expert_obs, expert_act = generate_expert_data(env, num_expert_episodes)
        policy = PolicyNet(obs_dim, act_dim)
        optimizer = optim.Adam(policy.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()

        # CrossEntropyLoss expects float inputs and integer (long) class labels.
        X = torch.tensor(expert_obs, dtype=torch.float32)
        y = torch.tensor(expert_act, dtype=torch.long)

        # Full-batch supervised training on the expert's decisions.
        for epoch in range(epochs):
            logits = policy(X)
            loss = loss_fn(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if epoch % 10 == 0:
                print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

        # Evaluation is inference only, so gradients are not tracked.
        total_rewards = []
        with torch.no_grad():
            for _ in range(eval_episodes):
                obs, _info = env.reset()
                done, ep_reward = False, 0
                while not done:
                    action = policy.act(torch.tensor(obs, dtype=torch.float32))
                    obs, reward, terminated, truncated, _info = env.step(action)
                    done = terminated or truncated
                    ep_reward += reward
                total_rewards.append(ep_reward)
    finally:
        # Release the environment's resources even if training fails.
        env.close()

    print(f"Average Reward (Behavioral Cloning): {np.mean(total_rewards)}")

# Script entry point: run the train-and-evaluate pipeline only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train_behavioral_cloning()

Epoch 0, Loss: 0.7202
Epoch 10, Loss: 0.6901
Epoch 20, Loss: 0.6724
Epoch 30, Loss: 0.6532
Epoch 40, Loss: 0.6332
Epoch 50, Loss: 0.6117
Epoch 60, Loss: 0.5883
Epoch 70, Loss: 0.5630
Epoch 80, Loss: 0.5357
Epoch 90, Loss: 0.5066
Average Reward (Behavioral Cloning): 103.0
