In [None]:

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os

# TensorBoard support (optional dependency).
# When the import fails, TENSORBOARD_AVAILABLE is set to False and
# SummaryWriter to None; later cells check the flag before creating a writer,
# so the notebook still runs, just without scalar logging.
try:
    from torch.utils.tensorboard import SummaryWriter
    TENSORBOARD_AVAILABLE = True
except ImportError:
    TENSORBOARD_AVAILABLE = False
    SummaryWriter = None
    print("TensorBoard not available. Scalar logging will be disabled.")

# Create environment
env = gym.make('CartPole-v0')
# Use the raw environment, as the original TF version of this code did.
# NOTE(review): `.unwrapped` presumably also drops gym's episode step limit,
# so episodes only end when the environment itself terminates -- confirm
# that this is intended.
env = env.unwrapped

# Seeding for reproducibility (handles both old & new gym APIs)
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)
try:
    env.reset(seed=seed)
except TypeError:
    # Older gym versions:
    env.seed(seed)
# The action space keeps its own random generator, which the calls above do
# not seed. Without this, env.action_space.sample() (used by the random-agent
# baseline) gives different results on every run.
if hasattr(env.action_space, "seed"):
    env.action_space.seed(seed)

# Problem dimensions read from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Hyperparameters
max_episodes_pg = 300           # vanilla policy gradient
max_episodes_pg_baseline = 300  # policy gradient with baseline
learning_rate = 0.01
gamma = 0.95  # Discount rate

# Directory that the training cells save model checkpoints into.
os.makedirs("./models", exist_ok=True)

print("State size:", state_size)
print("Action size:", action_size)

In [None]:
def reset_env(env):
    """
    Reset `env` and return only the initial observation.

    Supports both Gym reset conventions:
    - legacy:  env.reset() -> obs
    - current: env.reset() -> (obs, info)   (the info part is discarded)
    """
    outcome = env.reset()
    if not isinstance(outcome, tuple):
        # Legacy API: the return value already is the observation.
        return outcome
    observation, _info = outcome
    return observation

def step_env(env, action):
    """
    Advance `env` by one step and return a uniform 4-tuple
    (next_state, reward, done, info).

    Supports both Gym step conventions:
    - legacy:  (obs, reward, done, info)
    - current: (obs, reward, terminated, truncated, info), where
      `done` is reported as `terminated or truncated`.
    """
    outcome = env.step(action)
    if len(outcome) != 5:
        # Legacy 4-tuple: pass it through unchanged.
        observation, reward, done, info = outcome
        return observation, reward, done, info

    observation, reward, terminated, truncated, info = outcome
    return observation, reward, (terminated or truncated), info

def discount_rewards(episode_rewards, gamma=gamma):
    """
    Return the discounted return for every step of one episode.

    The value at step t is r_t + gamma * (value at step t+1), accumulated
    backwards from the last step. The result is a float32 array with one
    entry per reward and is NOT normalized; the baseline variant uses it as
    the regression target for the value network.
    """
    returns = np.zeros_like(episode_rewards, dtype=np.float32)
    running = 0.0
    step = len(episode_rewards)
    # Walk from the last step to the first so each return reuses the next one.
    while step > 0:
        step -= 1
        running = running * gamma + episode_rewards[step]
        returns[step] = running
    return returns

def discount_and_normalize_rewards(episode_rewards, gamma=gamma):
    """
    Return the episode's discounted returns, standardized to zero mean and
    (approximately) unit standard deviation.

    This is the weighting used by the vanilla policy gradient (REINFORCE)
    update.
    """
    returns = discount_rewards(episode_rewards, gamma)
    centered = returns - np.mean(returns)
    # The small epsilon keeps the division finite when all returns are equal.
    scale = np.std(returns) + 1e-8
    return centered / scale



In [None]:
class PolicyNetwork(nn.Module):
    """
    Three-layer policy network mirroring the original TF structure.

    Layers:
        fc1: state_size  -> 10           (ReLU)
        fc2: 10          -> action_size  (ReLU)
        fc3: action_size -> action_size  (no activation; raw logits)
    """
    def __init__(self, state_size, action_size):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 10)
        self.fc2 = nn.Linear(10, action_size)
        self.fc3 = nn.Linear(action_size, action_size)

        # Xavier (Glorot) uniform weights, analogous to
        # tf.contrib.layers.xavier_initializer(); biases keep nn.Linear's default.
        for layer in (self.fc1, self.fc2, self.fc3):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return unnormalized action logits for a batch of states `x`."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(x))))
        return self.fc3(hidden)


class ValueNetwork(nn.Module):
    """
    Small state-value estimator V(s), used as a learned baseline.

    Layers:
        fc1: state_size -> 16 (ReLU)
        fc2: 16 -> 1          (no activation)
    """
    def __init__(self, state_size):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 16)
        self.fc2 = nn.Linear(16, 1)

        # Xavier uniform weights; biases keep nn.Linear's default init.
        for layer in (self.fc1, self.fc2):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return one value per input state, with the trailing size-1 axis removed."""
        hidden = F.relu(self.fc1(x))
        # fc2 yields [..., 1]; squeeze the last axis to get [...].
        return self.fc2(hidden).squeeze(-1)



In [None]:
def run_random_agent(env, num_episodes=10, render=False):
    """
    Play `num_episodes` episodes with uniformly sampled actions.

    Prints a banner and the score for every episode plus the overall average,
    and returns the list of per-episode total rewards. Serves as the
    no-learning reference point for the trained agents.
    """
    rewards = []
    for episode in range(num_episodes):
        reset_env(env)
        score = 0

        print("****************************************************")
        print("Random agent - EPISODE", episode)

        episode_over = False
        while not episode_over:
            if render:
                env.render()

            # Uniform random action; the observation is not needed.
            _, reward, episode_over, _ = step_env(env, env.action_space.sample())
            score += reward

        rewards.append(score)
        print("Score:", score)

    print("Random agent average reward over", num_episodes, "episodes:", np.mean(rewards))
    return rewards

# Run random agent baseline: 10 episodes of uniformly sampled actions.
# `random_rewards` is read again by the comparison summary in the last cell.
random_rewards = run_random_agent(env, num_episodes=10, render=False)



In [None]:
# Vanilla policy gradient (REINFORCE) setup: policy network, per-sample loss,
# optimizer, checkpoint path and (optional) TensorBoard writer.
MODEL_PATH_PG = "./models/cartpole_pg_pytorch.pth"

policy_pg = PolicyNetwork(state_size, action_size)
# reduction='none' keeps one -log pi(a|s) value per step so each can be
# weighted by its own return.
criterion_pg = nn.CrossEntropyLoss(reduction='none')
optimizer_pg = optim.Adam(policy_pg.parameters(), lr=learning_rate)

# Scalar logging is skipped (writer is None) when TensorBoard is unavailable.
writer_pg = (
    SummaryWriter(log_dir="./runs/pg_cartpole_pytorch")
    if TENSORBOARD_AVAILABLE
    else None
)

def train_policy_gradient(env, num_episodes=max_episodes_pg, render=False):
    """
    Train the module-level `policy_pg` network with vanilla REINFORCE.

    Each episode is rolled out with the current stochastic policy; afterwards
    one Adam step is taken on mean(-log pi(a_t|s_t) * G_t), where G_t are the
    discounted, normalized returns of that episode.

    Uses the module-level objects policy_pg, optimizer_pg, criterion_pg,
    writer_pg, MODEL_PATH_PG, action_size and gamma.

    Args:
        env: Gym environment (old or new step/reset API).
        num_episodes: number of training episodes to run.
        render: if True, call env.render() before every step.

    Returns:
        list with the undiscounted total reward of every episode, in order.
    """
    all_rewards = []

    for episode in range(num_episodes):
        state = reset_env(env)
        episode_states, episode_actions, episode_rewards = [], [], []
        done = False

        while not done:
            if render:
                env.render()

            # Sampling an action needs no gradients: the log-probabilities
            # used for the update are recomputed in one batch after the
            # episode, so skip building an autograd graph on every step.
            with torch.no_grad():
                state_tensor = torch.from_numpy(state).float().unsqueeze(0)
                logits = policy_pg(state_tensor)
                probs = F.softmax(logits, dim=1).numpy().ravel()
            action = np.random.choice(action_size, p=probs)

            next_state, reward, done, _ = step_env(env, action)

            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)

            state = next_state

        # Episode summary (mean/max are over all episodes so far)
        episode_reward_sum = np.sum(episode_rewards)
        all_rewards.append(episode_reward_sum)
        mean_reward = np.mean(all_rewards)
        max_reward = np.max(all_rewards)

        print("==========================================")
        print("Vanilla PG - Episode:", episode)
        print("Reward:", episode_reward_sum)
        print("Mean reward:", mean_reward)
        print("Max reward so far:", max_reward)

        # Discount and normalize rewards
        discounted_rewards = discount_and_normalize_rewards(episode_rewards, gamma)

        # Convert to tensors. The states are stacked into a single ndarray
        # first: building a tensor directly from a list of ndarrays is the
        # slow path and makes PyTorch emit a UserWarning.
        states_tensor = torch.as_tensor(np.asarray(episode_states, dtype=np.float32))
        actions_tensor = torch.tensor(episode_actions, dtype=torch.long)
        rewards_tensor = torch.tensor(discounted_rewards, dtype=torch.float32)

        # Forward pass
        logits = policy_pg(states_tensor)
        neg_log_prob = criterion_pg(logits, actions_tensor)  # shape [T]

        # Policy gradient loss
        loss = torch.mean(neg_log_prob * rewards_tensor)

        # Backprop and update
        optimizer_pg.zero_grad()
        loss.backward()
        optimizer_pg.step()

        # TensorBoard logging
        if writer_pg is not None:
            writer_pg.add_scalar("Loss", loss.item(), episode)
            writer_pg.add_scalar("Reward_mean", mean_reward, episode)

        # Save model periodically (this also fires at episode 0)
        if episode % 100 == 0:
            torch.save(policy_pg.state_dict(), MODEL_PATH_PG)
            print("Saved vanilla PG model at episode", episode)

    # Final save
    torch.save(policy_pg.state_dict(), MODEL_PATH_PG)
    print("Final vanilla PG model saved.")
    if writer_pg is not None:
        writer_pg.flush()

    return all_rewards

# Train vanilla policy gradient; `pg_rewards` receives the total reward of
# every training episode.
pg_rewards = train_policy_gradient(env, num_episodes=max_episodes_pg, render=False)


In [None]:
def evaluate_policy(env, policy, num_episodes=10, render=False, title="Policy"):
    """
    Run `policy` for `num_episodes` episodes without gradient tracking and
    report the scores.

    Actions are sampled from the softmax of the policy's logits (stochastic
    evaluation, not argmax). The number of actions is taken from the policy's
    own output, so the function works for any policy/environment pair rather
    than depending on the module-level `action_size`.

    The policy is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, even if an error occurs.

    Args:
        env: Gym environment (old or new step/reset API).
        policy: nn.Module mapping a [1, state_size] tensor to [1, n_actions] logits.
        num_episodes: number of evaluation episodes.
        render: if True, call env.render() before every step.
        title: label used in the printed output.

    Returns:
        list with the total reward of every evaluation episode.
    """
    rewards = []
    was_training = policy.training
    policy.eval()

    try:
        with torch.no_grad():
            for episode in range(num_episodes):
                state = reset_env(env)
                done = False
                total_reward = 0

                print("****************************************************")
                print(f"{title} - EPISODE", episode)

                while not done:
                    if render:
                        env.render()

                    state_tensor = torch.from_numpy(state).float().unsqueeze(0)
                    logits = policy(state_tensor)
                    probs = F.softmax(logits, dim=1).numpy().ravel()
                    # One probability per action, so len(probs) is the action count.
                    action = np.random.choice(len(probs), p=probs)

                    next_state, reward, done, _ = step_env(env, action)
                    total_reward += reward
                    state = next_state

                rewards.append(total_reward)
                print("Score:", total_reward)
    finally:
        # Do not leave the caller's module silently stuck in eval mode.
        policy.train(was_training)

    avg_reward = np.mean(rewards)
    print(f"{title} average score over {num_episodes} episodes:", avg_reward)
    return rewards

# Load & evaluate vanilla PG model.
# A fresh PolicyNetwork is built and filled with the weights stored at
# MODEL_PATH_PG (mapped onto the CPU), so the evaluation uses the checkpoint
# on disk rather than the in-memory `policy_pg` object.
loaded_policy_pg = PolicyNetwork(state_size, action_size)
loaded_policy_pg.load_state_dict(torch.load(MODEL_PATH_PG, map_location=torch.device('cpu')))

# `pg_eval_rewards` is read again by the comparison summary in the last cell.
pg_eval_rewards = evaluate_policy(env, loaded_policy_pg,
                                  num_episodes=10,
                                  render=False,
                                  title="Vanilla PG")


In [None]:
# Policy gradient with baseline (learned state-value function): networks,
# joint optimizer, checkpoint path and (optional) TensorBoard writer.
MODEL_PATH_PG_BASELINE = "./models/cartpole_pg_baseline_pytorch.pth"

policy_pg_baseline = PolicyNetwork(state_size, action_size)
value_baseline = ValueNetwork(state_size)

# Single optimizer over both networks: policy parameters first, then value
# parameters, all with the same learning rate.
joint_parameters = [*policy_pg_baseline.parameters(), *value_baseline.parameters()]
optimizer_pg_baseline = optim.Adam(joint_parameters, lr=learning_rate)

# Scalar logging is skipped (writer is None) when TensorBoard is unavailable.
writer_pg_baseline = (
    SummaryWriter(log_dir="./runs/pg_baseline_cartpole_pytorch")
    if TENSORBOARD_AVAILABLE
    else None
)

def train_policy_gradient_with_baseline(env,
                                        num_episodes=max_episodes_pg_baseline,
                                        render=False,
                                        value_loss_coef=0.5):
    """
    Train `policy_pg_baseline` with REINFORCE using a learned value baseline.

    Each episode is rolled out with the current stochastic policy. Afterwards
    the advantage A_t = G_t - V(s_t) is computed from the (unnormalized)
    discounted returns and the value network's predictions, standardized, and
    one joint Adam step is taken on

        mean(-log pi(a_t|s_t) * A_t) + value_loss_coef * MSE(V(s_t), G_t).

    Uses the module-level objects policy_pg_baseline, value_baseline,
    optimizer_pg_baseline, writer_pg_baseline, MODEL_PATH_PG_BASELINE,
    action_size and gamma. Only the policy network is saved to disk.

    Args:
        env: Gym environment (old or new step/reset API).
        num_episodes: number of training episodes to run.
        render: if True, call env.render() before every step.
        value_loss_coef: weight of the value-function loss in the total loss.

    Returns:
        list with the undiscounted total reward of every episode, in order.
    """
    all_rewards = []

    for episode in range(num_episodes):
        state = reset_env(env)
        episode_states, episode_actions, episode_rewards = [], [], []
        done = False

        while not done:
            if render:
                env.render()

            # Sampling an action needs no gradients: log-probabilities and
            # values are recomputed in one batch after the episode.
            with torch.no_grad():
                state_tensor = torch.from_numpy(state).float().unsqueeze(0)
                logits = policy_pg_baseline(state_tensor)
                probs = F.softmax(logits, dim=1).numpy().ravel()
            action = np.random.choice(action_size, p=probs)

            next_state, reward, done, _ = step_env(env, action)

            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)

            state = next_state

        # Episode summary (mean/max are over all episodes so far)
        episode_reward_sum = np.sum(episode_rewards)
        all_rewards.append(episode_reward_sum)
        mean_reward = np.mean(all_rewards)
        max_reward = np.max(all_rewards)

        print("==========================================")
        print("PG + Baseline - Episode:", episode)
        print("Reward:", episode_reward_sum)
        print("Mean reward:", mean_reward)
        print("Max reward so far:", max_reward)

        # Discounted returns (no normalization) for value network
        discounted_returns = discount_rewards(episode_rewards, gamma)

        # Tensors. The states are stacked into a single ndarray first:
        # building a tensor directly from a list of ndarrays is the slow path
        # and makes PyTorch emit a UserWarning.
        states_tensor = torch.as_tensor(np.asarray(episode_states, dtype=np.float32))
        actions_tensor = torch.tensor(episode_actions, dtype=torch.long)
        returns_tensor = torch.tensor(discounted_returns, dtype=torch.float32)

        # Forward
        logits = policy_pg_baseline(states_tensor)
        values = value_baseline(states_tensor)  # shape [T]

        # Per-step -log π(a|s). Computed with the functional API so this
        # function does not rely on the loss object created for vanilla PG.
        neg_log_prob = F.cross_entropy(logits, actions_tensor, reduction='none')

        # Advantage = G_t - V(s_t); treat V(s) as baseline (no gradient in policy loss)
        advantages = returns_tensor - values.detach()
        # Population std (unbiased=False), matching the np.std normalization
        # of the vanilla variant. The default unbiased estimator is NaN for a
        # one-step episode, which would turn the loss and weights into NaN.
        advantages = (advantages - advantages.mean()) / (advantages.std(unbiased=False) + 1e-8)

        # Policy loss (REINFORCE with baseline)
        policy_loss = torch.mean(neg_log_prob * advantages)

        # Value loss (fit V(s) to returns)
        value_loss = F.mse_loss(values, returns_tensor)

        # Total loss
        loss = policy_loss + value_loss_coef * value_loss

        # Optimize both networks
        optimizer_pg_baseline.zero_grad()
        loss.backward()
        optimizer_pg_baseline.step()

        # TensorBoard logging
        if writer_pg_baseline is not None:
            writer_pg_baseline.add_scalar("Loss/policy", policy_loss.item(), episode)
            writer_pg_baseline.add_scalar("Loss/value", value_loss.item(), episode)
            writer_pg_baseline.add_scalar("Reward_mean", mean_reward, episode)

        # Save periodically (this also fires at episode 0)
        if episode % 100 == 0:
            torch.save(policy_pg_baseline.state_dict(), MODEL_PATH_PG_BASELINE)
            print("Saved PG+baseline model at episode", episode)

    # Final save
    torch.save(policy_pg_baseline.state_dict(), MODEL_PATH_PG_BASELINE)
    print("Final PG+baseline model saved.")
    if writer_pg_baseline is not None:
        writer_pg_baseline.flush()

    return all_rewards

# Train policy gradient with baseline; `pg_baseline_rewards` receives the
# total reward of every training episode. `value_loss_coef` is left at its
# default of 0.5.
pg_baseline_rewards = train_policy_gradient_with_baseline(env,
                                                          num_episodes=max_episodes_pg_baseline,
                                                          render=False)



In [None]:
# Load & evaluate PG + baseline model.
# A fresh PolicyNetwork is built and filled with the weights stored at
# MODEL_PATH_PG_BASELINE (mapped onto the CPU); the value network is not
# needed for acting and is not loaded.
loaded_policy_pg_baseline = PolicyNetwork(state_size, action_size)
loaded_policy_pg_baseline.load_state_dict(
    torch.load(MODEL_PATH_PG_BASELINE, map_location=torch.device('cpu'))
)

pg_baseline_eval_rewards = evaluate_policy(env,
                                           loaded_policy_pg_baseline,
                                           num_episodes=10,
                                           render=False,
                                           title="PG + Baseline")

# Optional: comparison summary (if you've run all cells).
# Results produced by other cells are looked up defensively, so skipping one
# of those cells yields a placeholder line instead of a NameError.
print("\n=== Summary over 10 evaluation episodes (if all agents were run) ===")
if 'random_rewards' in globals():
    print("Random agent average reward:      ", np.mean(random_rewards))
else:
    print("Random agent average reward:      (not run in this session)")
if 'pg_eval_rewards' in globals():
    print("Vanilla PG average reward:       ", np.mean(pg_eval_rewards))
else:
    print("Vanilla PG average reward:        (not run in this session)")
# Defined earlier in this same cell, so it is always available here.
print("PG with baseline average reward: ", np.mean(pg_baseline_eval_rewards))

# Release the environment's resources; no further cells use `env`.
env.close()
