# In [None]:
import numpy as np
import pandas as pd

# 1. Simulator with Fixed Randomness (PEGASUS Concept)
# Reference: Stanford CS229 Lecture 20 
def simulator(theta, seed):
    """Return the noisy reward of policy parameter ``theta`` for one scenario.

    The noise is a deterministic function of ``seed``, so two policies
    evaluated with the same seed are exposed to the identical disturbance
    (the PEGASUS fixed-randomness idea).

    Args:
        theta: Policy parameter being evaluated.
        seed: Seed identifying the scenario; the same seed always produces
            the same noise draw.

    Returns:
        ``-(theta - 0.8) ** 2`` plus one Gaussian noise sample (mean 0,
        standard deviation 0.1).
    """
    # A private RandomState produces the same draw that seeding the global
    # generator would, but leaves ``np.random``'s global state untouched, so
    # evaluating a policy no longer resets randomness for unrelated code.
    rng = np.random.RandomState(seed)
    # Simulate a noisy control task (e.g., keeping a vehicle in lane)
    noise = rng.normal(0, 0.1)
    # Reward is a non-linear function of theta, maximal at theta = 0.8
    reward = -(theta - 0.8) ** 2 + noise
    return reward

# 2. Policy Evaluation with Fixed Seeds
def evaluate_policy(theta, num_seeds=5):
    """Average the simulated reward of ``theta`` over a fixed set of seeds.

    The seeds are always ``0, 1, ..., num_seeds - 1``, so every call scores
    its policy on the same scenarios.

    Args:
        theta: Policy parameter to evaluate.
        num_seeds: Number of seeded scenarios to average over.

    Returns:
        The mean of the per-seed rewards returned by ``simulator``.
    """
    scenario_rewards = np.array(
        [simulator(theta, seed=scenario) for scenario in range(num_seeds)]
    )
    return scenario_rewards.mean()

# 3. Simple Policy Search (Gradient Approximation)
# Starting point and hyper-parameters for the search
theta = 0.1      # initial policy parameter
alpha = 0.05     # learning rate (gradient-ascent step size)
epsilon = 1e-3   # perturbation used by the finite-difference estimate

for i in range(100):
    # Forward-difference estimate of d(reward)/d(theta): evaluate the
    # perturbed policy first, then the current one, and divide the change
    # in mean reward by the perturbation size.
    reward_perturbed = evaluate_policy(theta + epsilon)
    reward_current = evaluate_policy(theta)
    grad = (reward_perturbed - reward_current) / epsilon
    # Step uphill along the estimated gradient
    theta = theta + alpha * grad

print(f"Optimal Policy Parameter (PEGASUS Optimized): {theta:.4f}")