## 1. Import Required Libraries

In [1]:
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

# Import custom modules
from customization import *
from learning import Driver

# Report the installed PyTorch build and whether it can see a CUDA device.
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

  from pkg_resources import resource_stream, resource_exists


PyTorch version: 2.6.0+cu124
CUDA available: False


## 2. Configuration

In [3]:
# ---------------------------------------------------------------------------
# Training schedule
# ---------------------------------------------------------------------------
# NOTE(review): in the cells visible in this notebook only NUM_ENVS,
# TOTAL_TIMESTEPS and the directory names are referenced later, and none of
# them are passed to Driver(...). Presumably Driver reads its own settings
# elsewhere -- confirm that those agree with the values echoed here.
NUM_ENVS = 2                 # parallel training environments
TOTAL_TIMESTEPS = 100_000    # total training steps
SAVE_FREQ = 10_000           # checkpoint interval, in steps
EVAL_FREQ = SAVE_FREQ        # evaluation interval; tied to the checkpoint interval
N_EVAL_EPISODES = 5          # episodes per evaluation

# ---------------------------------------------------------------------------
# Output locations (each path ends with a trailing slash)
# ---------------------------------------------------------------------------
CHECKPOINT_DIR = "./modified_models/checkpoints/"
LOG_DIR = "./logs/modified/"
BEST_MODEL_DIR = "./modified_models/best_model/"

# Echo the main settings so they are recorded alongside the run's output.
print(
    "Configuration:",
    f"  Training environments: {NUM_ENVS}",
    f"  Total timesteps: {TOTAL_TIMESTEPS:,}",
    f"  Save frequency: {SAVE_FREQ:,}",
    f"  Eval frequency: {EVAL_FREQ:,}",
    sep="\n",
)

Configuration:
  Training environments: 2
  Total timesteps: 100,000
  Save frequency: 10,000
  Eval frequency: 10,000


## 3. Create Vectorized Custom Environments

In [3]:
# Build the vectorized training environments with the project's factory
# (make_vec_envs comes from the star-import of `customization`).
train_env = make_vec_envs(num_envs=NUM_ENVS)

# Summarize what was created: env count plus observation and action spaces.
print(
    f"Created {NUM_ENVS} training environments",
    f"Observation space: {train_env.observation_space}",
    f"Action space: {train_env.action_space}",
    sep="\n",
)

Created 2 training environments
Observation space: Box(0, 255, (84, 96, 3), uint8)
Action space: Box([-1.  0.  0.], 1.0, (3,), float32)


## 4. Create Evaluation Environment

In [5]:
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage

# Evaluation environment.
# It is built with the same factory as the training environments, but the
# wrappers that the Driver is said to apply internally during training
# (transpose + frame stack) have to be applied by hand here.
#
#   inner: VecTransposeImage -- (H, W, C) -> (C, H, W), i.e. channels first,
#          the PyTorch/SB3 image convention.
#   outer: VecFrameStack(n_stack=4) -- the "expected (12, 84, 96)" error seen
#          without it corresponds to 4 frames of 3 channels (4 * 3 = 12).
eval_env = VecFrameStack(
    VecTransposeImage(make_vec_envs(num_envs=1)),
    n_stack=4,
)

print("Created evaluation environment")
print(f"Eval Observation space: {eval_env.observation_space}")
# Expected output: Box(0, 255, (12, 84, 96), uint8)

Created evaluation environment
Eval Observation space: Box(0, 255, (12, 84, 96), uint8)


## 5. Initialize Driver with PPO

In [6]:
# Instantiate the Driver, which wraps the PPO model together with the
# training logic. It receives the raw training VecEnv and the already-wrapped
# evaluation VecEnv.
# NOTE(review): none of the constants from the Configuration cell
# (TOTAL_TIMESTEPS, SAVE_FREQ, CHECKPOINT_DIR, ...) are passed in here;
# presumably Driver picks up its own defaults -- confirm they match.
driver = Driver(vec_env=train_env, eval_env=eval_env)

# Report the device the model lives on and the env count seen by the Driver.
print(
    "Driver initialized successfully!",
    f"Model device: {driver.model.device}",
    f"VecEnv has {driver.vec_env.num_envs} environments (includes FrameStack wrapper)",
    sep="\n",
)

Driver initialized successfully!
Model device: cuda
VecEnv has 2 environments (includes FrameStack wrapper)


## 6. Train the Model from Scratch

In [7]:
# Horizontal rule printed before and after the training run.
rule = "\n" + "=" * 50

# Announce the run: budget, output locations and how to watch it live.
# NOTE(review): driver.train() is called without arguments, so the values
# printed here are informational only -- confirm Driver uses the same ones.
print(
    "Starting training...",
    f"Training for {TOTAL_TIMESTEPS:,} timesteps",
    f"Checkpoints will be saved to: {CHECKPOINT_DIR}",
    f"Best model will be saved to: {BEST_MODEL_DIR}",
    f"TensorBoard logs: {LOG_DIR}",
    "\nTo monitor training, run in a terminal:",
    f"  tensorboard --logdir {LOG_DIR}",
    rule,
    sep="\n",
)

driver.train()

print(rule, "Training completed!", sep="\n")

Starting training...
Training for 100,000 timesteps
Checkpoints will be saved to: ./modified_models/checkpoints/
Best model will be saved to: ./modified_models/best_model/
TensorBoard logs: ./logs/modified/

To monitor training, run in a terminal:
  tensorboard --logdir ./logs/modified/



Output()




Training completed!


## 7. Resume Training from Checkpoint (Optional)

Use this cell if you want to resume training from the latest checkpoint.

In [None]:
# Optional: continue training from the latest checkpoint.
# The number of workers may differ from the original run; pass num_envs=None
# to keep the same number of environments.
# NOTE(review): target_steps is set to TOTAL_TIMESTEPS; presumably this is the
# overall step target rather than a number of additional steps -- confirm
# against Driver.resume_training.
NUM_ENVS_RESUME = 4

driver.resume_training(target_steps=TOTAL_TIMESTEPS, num_envs=NUM_ENVS_RESUME)

## 8. Save Final Model

In [12]:
import os

# Save the final trained model.
# BEST_MODEL_DIR already ends with "/", so concatenating with "/" produced a
# doubled slash ("best_model//ppo_custom_env_final.zip"). os.path.join inserts
# a separator only when one is needed.
final_model_path = os.path.join(BEST_MODEL_DIR, "ppo_custom_env_final.zip")
driver.save(final_model_path)
print(f"Final model saved to: {final_model_path}")

Model saved to: ./modified_models/best_model//ppo_custom_env_final.zip
Final model saved to: ./modified_models/best_model//ppo_custom_env_final.zip




## 9. Close Environments

In [9]:
# Release the training and evaluation environments, in that order.
for _vec_env in (train_env, eval_env):
    _vec_env.close()

print("All environments closed successfully!")

All environments closed successfully!


## 10. Test the Trained Model (Optional)

Visualize the trained agent playing the game.

In [15]:
import os

import cv2
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage

# Requires the Configuration cell (BEST_MODEL_DIR) and the import cell
# (make_vec_envs) to have been run in this kernel.

# 1. Load the model written by the "Save Final Model" cell.
# os.path.join avoids the doubled slash that "/"-concatenation produced,
# because BEST_MODEL_DIR already ends with "/".
best_model_path = os.path.join(BEST_MODEL_DIR, "ppo_custom_env_final.zip")
# If this fails with a file-not-found error, try the alternative location:
# best_model_path = "./modified_models/ppo_custom_env_final.zip"

print(f"Loading model from: {best_model_path}")
model = PPO.load(best_model_path)

# 2. Create the test env and apply the same wrappers used for evaluation:
#    (H, W, C) -> (C, H, W), then stack 4 frames along the channel axis.
test_env = make_vec_envs(num_envs=1)
test_env = VecTransposeImage(test_env)
test_env = VecFrameStack(test_env, n_stack=4)

NUM_TEST_EPISODES = 3

# Set when the user presses 'q'. The keypress is consumed by the waitKey call
# that detects it, so a second waitKey after the episode cannot be relied on
# to see it; a flag is needed to stop the remaining episodes as well.
quit_requested = False

try:
    for episode in range(NUM_TEST_EPISODES):
        obs = test_env.reset()
        done = False
        total_reward = 0
        step_count = 0

        print(f"\nEpisode {episode + 1}/{NUM_TEST_EPISODES}")

        while not done:
            # Predict action
            action, _states = model.predict(obs, deterministic=True)

            # Step environment; a VecEnv returns arrays with one entry per env.
            obs, reward, done_array, info = test_env.step(action)
            done = done_array[0]
            total_reward += reward[0]
            step_count += 1

            # --- VISUALIZATION LOGIC ---
            # obs shape is now (1, 12, 84, 96) -> (Batch, Channels, H, W).

            # 1. Take the first batch entry and transpose back to (H, W, C)
            #    for OpenCV: (12, 84, 96) -> (84, 96, 12).
            agent_view = np.transpose(obs[0], (1, 2, 0))

            # 2. Keep only the last 3 channels (the most recent frame).
            current_frame_rgb = agent_view[:, :, -3:]

            # 3. Convert RGB to BGR, the channel order OpenCV displays.
            frame_bgr = cv2.cvtColor(current_frame_rgb.astype(np.uint8), cv2.COLOR_RGB2BGR)

            # 4. Upscale 5x (96x84 -> 480x420) without smoothing.
            frame_upscaled = cv2.resize(frame_bgr, (480, 420), interpolation=cv2.INTER_NEAREST)

            cv2.imshow("Agent View (Stacked)", frame_upscaled)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                quit_requested = True
                break

        print(f"  Steps: {step_count}")
        print(f"  Total Reward: {total_reward:.2f}")

        if quit_requested:
            break

except KeyboardInterrupt:
    print("\nInterrupted by user")
finally:
    # Always release the environment and the OpenCV window.
    test_env.close()
    cv2.destroyAllWindows()
    print("Testing complete!")

Loading model from: ./modified_models/best_model//ppo_custom_env_final.zip

Episode 1/3

Episode 1/3
  Steps: 1000
  Total Reward: 527.33
  Steps: 1000
  Total Reward: 527.33

Episode 2/3

Episode 2/3
  Steps: 1000
  Total Reward: 527.37
  Steps: 1000
  Total Reward: 527.37

Episode 3/3

Episode 3/3
  Steps: 1000
  Total Reward: 532.88
  Steps: 1000
  Total Reward: 532.88
Testing complete!
Testing complete!


## Next Steps

- Monitor training with TensorBoard: `tensorboard --logdir ./logs/modified/`
- Compare performance with base model (train_base_model.ipynb)
- Tune reward hyperparameters in config.py
- Analyze the effect of optimal line following
- Test different racing tracks

## 11. Play with Best Model

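Load and visualize the model stored in the best-model directory. Note that the cell below loads `ppo_custom_env_final` (the file written in "8. Save Final Model"), not a checkpoint written by EvalCallback. Run the configuration cell first so that `BEST_MODEL_DIR` is defined.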

In [2]:
import os

import cv2
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage

# Requires the Configuration cell (BEST_MODEL_DIR) and the import cell
# (make_vec_envs) to have been run in this kernel; on a fresh kernel this cell
# fails with "NameError: name 'BEST_MODEL_DIR' is not defined".

# Load the model stored in the best-model directory.
# NOTE(review): this is the file written by the "Save Final Model" cell, not a
# checkpoint written by EvalCallback -- confirm which file Driver's evaluation
# callback writes if the true best checkpoint is wanted here.
# os.path.join avoids the doubled slash that "/"-concatenation produced
# (BEST_MODEL_DIR already ends with "/"); the explicit ".zip" matches the
# other cells that reference this file.
best_model_path = os.path.join(BEST_MODEL_DIR, "ppo_custom_env_final.zip")

print(f"Loading best model from: {best_model_path}")
model = PPO.load(best_model_path)

# Create the test environment with the same wrappers used for evaluation:
# (H, W, C) -> (C, H, W), then stack 4 frames along the channel axis.
test_env = make_vec_envs(num_envs=1)
test_env = VecTransposeImage(test_env)
test_env = VecFrameStack(test_env, n_stack=4)

NUM_EPISODES = 5

print(f"\nPlaying {NUM_EPISODES} episodes with best model...")
print("Press 'q' to quit\n")

# Set when the user presses 'q'. The keypress is consumed by the waitKey call
# that detects it, so a flag is needed to also skip the remaining episodes.
quit_requested = False

try:
    for episode in range(NUM_EPISODES):
        obs = test_env.reset()
        done = False
        total_reward = 0
        step_count = 0

        # No newline: the per-episode summary is appended to this line.
        print(f"Episode {episode + 1}/{NUM_EPISODES}", end=" ", flush=True)

        while not done:
            # Predict action
            action, _states = model.predict(obs, deterministic=True)

            # Step environment; a VecEnv returns arrays with one entry per env.
            obs, reward, done_array, info = test_env.step(action)
            done = done_array[0]
            total_reward += reward[0]
            step_count += 1

            # Visualize: obs shape is (1, 12, 84, 96) -> (Batch, Channels, H, W).
            # Transpose to (H, W, C) and take the last 3 channels (current frame).
            agent_view = np.transpose(obs[0], (1, 2, 0))[:, :, -3:]

            # Convert RGB to BGR for OpenCV and upscale 5x without smoothing.
            frame_bgr = cv2.cvtColor(agent_view.astype(np.uint8), cv2.COLOR_RGB2BGR)
            frame_upscaled = cv2.resize(frame_bgr, (480, 420), interpolation=cv2.INTER_NEAREST)

            cv2.imshow("Best Model Playing", frame_upscaled)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                quit_requested = True
                break

        print(f"- Steps: {step_count:4d} | Reward: {total_reward:7.2f}")

        if quit_requested:
            break

except KeyboardInterrupt:
    print("\nInterrupted by user")
finally:
    # Always release the environment and the OpenCV window.
    test_env.close()
    cv2.destroyAllWindows()
    print("\nPlayback complete!")

NameError: name 'BEST_MODEL_DIR' is not defined

In [6]:
import cv2
import numpy as np
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage
from gymnasium.wrappers import RecordVideo

# Requires the import cell (gym, CustomEnvironment) and the Configuration cell
# (TOTAL_TIMESTEPS, BEST_MODEL_DIR) to have been run in this kernel.

# Create the video directory, named after the training budget.
video_subfolder = f"videos/{TOTAL_TIMESTEPS}"
os.makedirs(video_subfolder, exist_ok=True)

print(f"Videos will be saved to: {video_subfolder}")

# Load the model written by the "Save Final Model" cell.
# os.path.join avoids the doubled slash that "/"-concatenation produced,
# because BEST_MODEL_DIR already ends with "/".
best_model_path = os.path.join(BEST_MODEL_DIR, "ppo_custom_env_final.zip")
print(f"Loading model from: {best_model_path}")
model = PPO.load(best_model_path)

# Number of episodes to record
NUM_RECORD_EPISODES = 3

# Record episodes: one fresh environment (and one video file prefix) each.
for episode_idx in range(NUM_RECORD_EPISODES):
    print(f"\nRecording episode {episode_idx + 1}/{NUM_RECORD_EPISODES}...")

    # Base environment must render to rgb_array so RecordVideo can grab frames.
    env = gym.make("CarRacing-v3", render_mode="rgb_array")
    env = CustomEnvironment(env)

    # Add RecordVideo wrapper; the trigger returns True for every episode id,
    # so the single episode played in this env is always recorded.
    env = RecordVideo(
        env,
        video_folder=video_subfolder,
        name_prefix=f"episode_{episode_idx + 1}",
        episode_trigger=lambda episode_id: True
    )

    total_reward = 0
    step_count = 0

    # try/finally guarantees env.close() runs even if the rollout raises;
    # otherwise the environment leaks and the video may not be finalized.
    try:
        # Reset with a per-episode seed so recordings are reproducible.
        obs, info = env.reset(seed=1000 + episode_idx)

        # The model was trained on transposed, frame-stacked observations, so
        # the same layout is rebuilt by hand for this non-vectorized env.
        # 1. Transpose (H, W, C) -> (C, H, W)
        obs = np.transpose(obs, (2, 0, 1))

        # 2. Initial stack: repeat the first frame 4 times -> (12, 84, 96).
        # NOTE(review): SB3's VecFrameStack pads the initial stack with zeros
        # rather than copies of the first frame, so the first 3 steps see
        # slightly different input than in the VecEnv-based cells -- confirm
        # whether this matters for the recordings.
        stacked_obs = np.concatenate([obs] * 4, axis=0)

        done = False

        while not done:
            # Predict action; the model expects a (12, 84, 96) observation.
            action, _states = model.predict(stacked_obs, deterministic=True)

            # Step environment
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            # Update frame stack
            # 1. Transpose new observation
            obs = np.transpose(obs, (2, 0, 1))

            # 2. Shift stack: drop the oldest frame (first 3 channels) and
            #    append the new frame as the last 3 channels.
            stacked_obs = np.concatenate([stacked_obs[3:], obs], axis=0)

            total_reward += reward
            step_count += 1
    finally:
        env.close()

    print(f"  Steps: {step_count:4d} | Reward: {total_reward:7.2f}")

print(f"\n✓ All videos saved to: {video_subfolder}")
print(f"Total episodes recorded: {NUM_RECORD_EPISODES}")

Videos will be saved to: videos/100000
Loading model from: ./modified_models/best_model//ppo_custom_env_final.zip

Recording episode 1/3...

Recording episode 1/3...


  logger.warn("Unable to save last video! Did you call close()?")


  Steps: 1000 | Reward:  526.18

Recording episode 2/3...
  Steps: 1000 | Reward:  527.91

Recording episode 3/3...
  Steps: 1000 | Reward:  527.91

Recording episode 3/3...
  Steps: 1000 | Reward:  531.06

✓ All videos saved to: videos/100000
Total episodes recorded: 3
  Steps: 1000 | Reward:  531.06

✓ All videos saved to: videos/100000
Total episodes recorded: 3


## 12. Record Videos of Model Performance

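Record videos of the trained model playing and save them under `videos/<TOTAL_TIMESTEPS>/` (for example `videos/100000/`). The recording cell is the one immediately above this heading.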