# In [None]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import os

# Import the custom quadcopter environment
from quad_copter import QuadcopterEnv

# Gymnasium wrapper for the custom QuadcopterEnv
class GymQuadcopterEnv(gym.Env):
    """Adapt ``QuadcopterEnv`` to the Gymnasium ``Env`` interface.

    The wrapped environment is driven with a single scalar force. Its
    ``reset()`` result is used as the observation, and its ``step(force)``
    result is unpacked as ``(obs, reward, done, info)``.
    """

    def __init__(self):
        super().__init__()
        self.env = QuadcopterEnv()

        # Action space: single continuous action (force) in [-10, 10].
        self.action_space = gym.spaces.Box(
            low=-10.0, high=10.0, shape=(1,), dtype=np.float32
        )

        # Observation space: 4 components; components 0 and 2 are bounded,
        # components 1 and 3 are unbounded.
        # NOTE(review): the bounds (+/-2.4 and pi +/- 0.2) look like
        # position/angle limits -- confirm they match QuadcopterEnv's state.
        high = np.array([2.4, np.inf, np.pi + 0.2, np.inf], dtype=np.float32)
        low = np.array([-2.4, -np.inf, np.pi - 0.2, -np.inf], dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment.

        Args:
            seed: Optional seed. Forwarded to ``gym.Env.reset`` and, as
                before, also applied to NumPy's global RNG.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` where the observation is a float32 array
            and ``info`` is an empty dict.
        """
        # Gymnasium expects subclasses to forward the seed so that
        # ``self.np_random`` is initialised.
        super().reset(seed=seed)
        if seed is not None:
            # NOTE(review): presumably QuadcopterEnv draws from the global
            # NumPy RNG -- confirm; otherwise this line has no effect on it.
            np.random.seed(seed)
        obs = self.env.reset()
        return np.array(obs, dtype=np.float32), {}

    def step(self, action):
        """Apply one action and advance the wrapped environment.

        Args:
            action: Array-like whose first element is the force to apply.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` is always ``False``: this wrapper never reports
            truncation itself.
        """
        # Extract the scalar force; flattening tolerates shapes () and (1,).
        force = float(np.asarray(action).reshape(-1)[0])
        obs, reward, done, info = self.env.step(force)
        # Coerce to built-in types so callers never see NumPy scalars.
        return np.array(obs, dtype=np.float32), float(reward), bool(done), False, info

    def render(self):
        """Delegate rendering to the wrapped environment."""
        return self.env.render()

# Reproducibility: fix the global NumPy seed used by this script
SEED = 42
np.random.seed(SEED)

# Directory that receives the Monitor log files (created if missing)
log_dir = "./logs/"
os.makedirs(log_dir, exist_ok=True)

# Factory for Monitor-wrapped environments
def make_env(monitor_name="monitor.csv"):
    """Build a ``GymQuadcopterEnv`` wrapped in a ``Monitor``.

    Args:
        monitor_name: File name, inside ``log_dir``, for the Monitor log.
            Defaults to ``"monitor.csv"`` (the original hard-coded name).
            Pass a different name for environments that must not write to
            the same log file as another environment.

    Returns:
        The Monitor-wrapped environment.
    """
    env = GymQuadcopterEnv()
    return Monitor(env, filename=os.path.join(log_dir, monitor_name))

# Vectorized training environment holding a single Monitor-wrapped env
train_env = DummyVecEnv([make_env])

# Gaussian exploration noise: zero mean, sigma 0.5 per action dimension
n_actions = train_env.action_space.shape[0]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.5),
)

# TD3 hyperparameters, gathered in one place
td3_kwargs = dict(
    learning_rate=0.001,
    buffer_size=100000,
    batch_size=100,
    learning_starts=1000,
    policy_delay=2,
    target_policy_noise=0.2,
    target_noise_clip=0.5,
    tau=0.005,
    gamma=0.99,
)

# Create the TD3 agent
model = TD3(
    "MlpPolicy",
    train_env,
    action_noise=action_noise,
    verbose=1,
    seed=SEED,
    **td3_kwargs,
)

# Train, then persist the trained agent to disk
print("Starting training...")
model.learn(total_timesteps=50_000)
model.save("td3_inverted_pendulum")
print("Training completed and model saved!")

train_env.close()

# Evaluation
print("\nEvaluating trained agent...")


def _make_eval_env():
    """Build an evaluation env that logs to its own Monitor file."""
    return Monitor(
        GymQuadcopterEnv(),
        filename=os.path.join(log_dir, "eval_monitor.csv"),
    )


# Do not reuse make_env here: it points Monitor at the training log
# (monitor.csv), and Monitor opens that file for writing, so the training
# history would be overwritten by evaluation episodes.
eval_env = DummyVecEnv([_make_eval_env])

# Use stable-baselines3's built-in evaluation function
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
    render=False,
)

print(f"Mean evaluation reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Additional detailed evaluation: roll out a few deterministic episodes by
# hand and record each episode's total reward and length.
N_DETAIL_EPISODES = 5
MAX_STEPS_PER_EPISODE = 200  # hard cap on the length of one rollout

episode_rewards = []
episode_lengths = []

for episode in range(N_DETAIL_EPISODES):
    # One reset per episode; the extra reset before the loop was redundant.
    obs = eval_env.reset()
    total_reward = 0.0
    steps = 0

    for step in range(MAX_STEPS_PER_EPISODE):
        action, _ = model.predict(obs, deterministic=True)
        # The vectorized env returns batched results; index 0 is the single
        # environment it holds.
        obs, reward, done, info = eval_env.step(action)
        total_reward += float(reward[0])
        steps += 1

        if done[0]:
            break

    episode_rewards.append(total_reward)
    episode_lengths.append(steps)
    print(f"Episode {episode + 1}: Reward = {total_reward:.2f}, Steps = {steps}")

eval_env.close()

# Summary of the manual evaluation rollouts
rewards_arr = np.asarray(episode_rewards)
lengths_arr = np.asarray(episode_lengths)

print("\nEvaluation Summary:")
print(f"Average Reward: {rewards_arr.mean():.2f}")
print(f"Std Reward: {rewards_arr.std():.2f}")
print(f"Min Reward: {rewards_arr.min():.2f}")
print(f"Max Reward: {rewards_arr.max():.2f}")
print(f"Average Episode Length: {lengths_arr.mean():.1f}")

# Best-effort report from the Monitor log; any failure is reported, not raised
try:
    import pandas as pd

    monitor_path = os.path.join(log_dir, "monitor.csv")
    # skiprows=1 skips the first line of the file.
    # NOTE(review): presumably that line is Monitor's metadata header -- confirm.
    monitor_data = pd.read_csv(monitor_path, skiprows=1)
    if len(monitor_data) > 0:
        print("\nTraining Statistics:")
        print(f"Training Episodes: {len(monitor_data)}")
        episode_returns = monitor_data['r']
        print(f"Average Training Reward: {episode_returns.mean():.2f}")
        print(f"Final Training Reward: {episode_returns.iloc[-1]:.2f}")
        print(f"Best Training Reward: {episode_returns.max():.2f}")
except Exception as err:
    print(f"Could not load training statistics: {err}")