# Tutorial 16: Reinforcement Learning

This tutorial introduces reinforcement learning with PyTorch, covering fundamental algorithms and modern deep RL techniques.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, namedtuple
import random
import gym
from typing import Tuple, List, Optional
import math
from IPython import display

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## 1. Basic RL Environment

Let's start by creating a simple grid world environment to understand the basics of RL.

In [None]:
class GridWorld:
    """Square grid world: the agent walks from the top-left to the bottom-right cell.

    Observations are the flattened grid (length ``size * size``) with 1 marking
    the agent and 2 marking the goal. Actions are 0=up, 1=right, 2=down, 3=left.
    """

    def __init__(self, size=5):
        self.size = size
        self.reset()

    def reset(self):
        """Move the agent back to (0, 0), clear the done flag, return the observation."""
        last = self.size - 1
        self.agent_pos = [0, 0]
        self.goal_pos = [last, last]
        self.done = False
        return self._get_state()

    def _get_state(self):
        """Return the flattened grid encoding of the agent and goal positions."""
        board = np.zeros((self.size, self.size))
        board[self.agent_pos[0], self.agent_pos[1]] = 1
        # Written second, so the goal marker wins when both share a cell
        board[self.goal_pos[0], self.goal_pos[1]] = 2
        return board.flatten()

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``."""
        # A finished episode is frozen: no movement and no further reward
        if self.done:
            return self._get_state(), 0, True

        # (row, column) offsets indexed by action: 0=up, 1=right, 2=down, 3=left
        d_row, d_col = ((-1, 0), (0, 1), (1, 0), (0, -1))[action]
        row = self.agent_pos[0] + d_row
        col = self.agent_pos[1] + d_col

        # A move that would leave the grid is ignored (the agent stays put)
        if 0 <= row < self.size and 0 <= col < self.size:
            self.agent_pos = [row, col]

        if self.agent_pos == self.goal_pos:
            self.done = True
            return self._get_state(), 10, True

        # Small per-step penalty for every non-terminal move
        return self._get_state(), -0.1, self.done

    def render(self):
        """Draw the current grid with matplotlib."""
        cells = self._get_state().reshape(self.size, self.size)
        ticks = range(self.size)

        plt.figure(figsize=(5, 5))
        plt.imshow(cells, cmap='viridis', interpolation='nearest')
        plt.colorbar(label='0: Empty, 1: Agent, 2: Goal')
        plt.title('Grid World Environment')
        plt.xticks(ticks)
        plt.yticks(ticks)
        plt.grid(True, alpha=0.3)
        plt.show()

In [None]:
# Smoke-test the environment
env = GridWorld(size=5)
state = env.reset()
print("Initial state shape:", state.shape)
print("\nInitial grid:")
env.render()

# Drive the agent with five uniformly random actions
action_names = ['Up', 'Right', 'Down', 'Left']
print("\nTaking random actions:")
for i in range(5):
    action = np.random.randint(4)
    next_state, reward, done = env.step(action)
    print(f"Action: {action_names[action]}, Reward: {reward:.1f}, Done: {done}")

print("\nFinal grid:")
env.render()

## 2. Deep Q-Network (DQN)

DQN uses a neural network to approximate Q-values for each action in each state.

In [None]:
class DQN(nn.Module):
    """Three-layer MLP that maps a state vector to one Q-value per action."""

    def __init__(self, state_size, action_size, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the raw (unactivated) Q-value estimates for ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # No activation on the output layer: Q-values are unbounded
        return self.fc3(hidden)

# One environment transition, as stored in the replay buffer
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward', 'done'])


class ReplayBuffer:
    """Bounded FIFO store of transitions; the oldest entries are evicted first."""

    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        """Append one transition built from ``args`` (in Transition field order)."""
        transition = Transition(*args)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn at random without replacement."""
        return random.sample(self.buffer, k=batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [None]:
class DQNAgent:
    """DQN agent with an experience replay buffer and a separate target network.

    Args:
        state_size: Length of the flat state vector fed to the Q-network.
        action_size: Number of discrete actions.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Factor applied to ``epsilon`` after each replay update.
        epsilon_min: Decay stops once ``epsilon`` is no longer above this value.
    """

    def __init__(self, state_size, action_size, lr=1e-3, gamma=0.99,
                 epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Online network (trained) and target network (periodically synced copy)
        self.q_network = DQN(state_size, action_size).to(device)
        self.target_network = DQN(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

        # Experience replay
        self.memory = ReplayBuffer()

        # Start with both networks holding identical weights
        self.update_target_network()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def act(self, state, training=True):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Flat state vector.
            training: When False, exploration is disabled and the greedy
                action is always returned.

        Returns:
            The chosen action index as a Python int.
        """
        if training and random.random() < self.epsilon:
            return random.randint(0, self.action_size - 1)

        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = self.q_network(state_tensor)
        return q_values.argmax().item()

    def remember(self, state, action, next_state, reward, done):
        """Store one transition in the replay buffer."""
        self.memory.push(state, action, next_state, reward, done)

    @staticmethod
    def _to_tensor(values, dtype):
        """Stack a sequence of per-transition values into one tensor on ``device``."""
        # Going through a single ndarray avoids the slow path PyTorch takes
        # when it is handed a Python sequence of ndarrays.
        return torch.as_tensor(np.asarray(values), dtype=dtype, device=device)

    def replay(self, batch_size=32):
        """Run one gradient step on a random minibatch from the replay buffer.

        Args:
            batch_size: Number of transitions to sample.

        Returns:
            The loss as a float, or ``None`` when the buffer holds fewer than
            ``batch_size`` transitions and no update was made.
        """
        if len(self.memory) < batch_size:
            return None

        batch = Transition(*zip(*self.memory.sample(batch_size)))

        state_batch = self._to_tensor(batch.state, torch.float32)
        action_batch = self._to_tensor(batch.action, torch.int64)
        reward_batch = self._to_tensor(batch.reward, torch.float32)
        next_state_batch = self._to_tensor(batch.next_state, torch.float32)
        done_batch = self._to_tensor(batch.done, torch.float32)

        # Q(s, a) for the actions actually taken. squeeze(1) removes only the
        # gather dimension, so a batch of one keeps shape (1,) instead of
        # collapsing to a 0-dim tensor that mismatches the target's shape.
        current_q_values = self.q_network(state_batch).gather(
            1, action_batch.unsqueeze(1)
        ).squeeze(1)

        # Bootstrapped target from the target network; no gradient flows into it
        with torch.no_grad():
            next_q_values = self.target_network(next_state_batch).max(1)[0]
            # (1 - done) removes the bootstrap term on terminal transitions
            target_q_values = reward_batch + self.gamma * next_q_values * (1 - done_batch)

        loss = F.mse_loss(current_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Exploration is annealed once per gradient step
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        return loss.item()

In [None]:
# Train DQN on GridWorld
env = GridWorld(size=5)
state_size = env.size * env.size
action_size = 4

agent = DQNAgent(state_size, action_size)

# Training loop
episodes = 300
scores = []    # total reward per episode
losses = []    # loss per gradient step
epsilons = []  # exploration rate at the end of each episode

print("Training DQN...")
for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    steps = 0

    while True:
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        agent.remember(state, action, next_state, reward, done)

        if len(agent.memory) > 32:
            loss = agent.replay()
            # replay() returns None when it skips the update; compare against
            # None so that a genuine loss of 0.0 is still recorded.
            if loss is not None:
                losses.append(loss)

        state = next_state
        total_reward += reward
        steps += 1

        # Cut the episode off after 101 steps if the goal has not been reached
        if done or steps > 100:
            break

    scores.append(total_reward)
    epsilons.append(agent.epsilon)

    # Sync the target network every 10 episodes
    if episode % 10 == 0:
        agent.update_target_network()

    if episode % 50 == 0:
        # A [-50:] slice already returns the whole list when it is shorter than 50
        avg_score = np.mean(scores[-50:])
        print(f"Episode {episode}, Average Score: {avg_score:.2f}, Epsilon: {agent.epsilon:.3f}")

In [None]:
# Visualize DQN training
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Plot scores, with a moving average once enough episodes exist
window = 20
axes[0, 0].plot(scores, alpha=0.6)
if len(scores) >= window:
    smoothed = np.convolve(scores, np.ones(window) / window, mode='valid')
    # Each 'valid' average covers the `window` episodes ending at that point,
    # so the first one belongs at episode index window - 1.
    axes[0, 0].plot(np.arange(window - 1, len(scores)), smoothed, linewidth=2)
axes[0, 0].set_title('DQN Training Scores')
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Score')
axes[0, 0].grid(True, alpha=0.3)

# Plot losses
if losses:
    axes[0, 1].plot(losses, alpha=0.6)
    axes[0, 1].set_title('DQN Training Loss')
    axes[0, 1].set_xlabel('Training Step')
    axes[0, 1].set_ylabel('Loss')
    axes[0, 1].set_yscale('log')
    axes[0, 1].grid(True, alpha=0.3)

# Plot epsilon decay
axes[1, 0].plot(epsilons)
axes[1, 0].set_title('Epsilon Decay')
axes[1, 0].set_xlabel('Episode')
axes[1, 0].set_ylabel('Epsilon')
axes[1, 0].grid(True, alpha=0.3)

# Test the trained agent with exploration switched off
test_env = GridWorld(size=5)
state = test_env.reset()
path = [test_env.agent_pos.copy()]

# Step budget scales with the grid (20 steps for the 5x5 grid)
for _ in range(4 * test_env.size):
    action = agent.act(state, training=False)
    state, reward, done = test_env.step(action)
    path.append(test_env.agent_pos.copy())
    if done:
        break

# Visualize learned path: each visited cell holds the (1-based) step at which
# it was last visited. Dimensions and goal come from the environment itself.
grid = np.zeros((test_env.size, test_env.size))
for i, pos in enumerate(path):
    grid[pos[0], pos[1]] = i + 1
goal_row, goal_col = test_env.goal_pos
grid[goal_row, goal_col] = len(path) + 1  # Goal

im = axes[1, 1].imshow(grid, cmap='viridis')
axes[1, 1].set_title('Learned Path')
axes[1, 1].set_xlabel('X')
axes[1, 1].set_ylabel('Y')
plt.colorbar(im, ax=axes[1, 1], label='Step')

plt.tight_layout()
plt.show()

## 3. Policy Gradient - REINFORCE

REINFORCE learns a policy directly: it samples complete episodes and adjusts the policy parameters by gradient ascent on the expected discounted return.

In [None]:
class PolicyNetwork(nn.Module):
    """MLP policy for REINFORCE: maps a state to a probability for each action."""

    def __init__(self, state_size, action_size, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        logits = self.fc3(hidden)
        return F.softmax(logits, dim=-1)

class REINFORCEAgent:
    """REINFORCE policy-gradient agent that learns from complete episodes.

    Usage per episode: call ``act`` for every step, ``store_reward`` with the
    reward received for that step, then ``train`` once the episode has ended.

    Args:
        state_size: Length of the flat state vector fed to the policy.
        action_size: Number of discrete actions.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor used when accumulating returns.
    """

    def __init__(self, state_size, action_size, lr=1e-3, gamma=0.99):
        self.gamma = gamma
        self.policy = PolicyNetwork(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        # Per-episode buffers, cleared by train()
        self.saved_log_probs = []
        self.rewards = []

    def act(self, state):
        """Sample an action from the policy and record its log-probability.

        Returns:
            The sampled action index as a Python int.
        """
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
        probs = self.policy(state_tensor)
        m = torch.distributions.Categorical(probs)
        action = m.sample()
        # Kept with its graph attached so train() can backpropagate through it
        self.saved_log_probs.append(m.log_prob(action))
        return action.item()

    def store_reward(self, reward):
        """Record the reward received for the most recent action."""
        self.rewards.append(reward)

    def train(self):
        """Update the policy from the episode collected since the last call.

        Returns:
            The summed policy loss as a float, or ``0.0`` when no rewards were
            stored (in which case no update is made).
        """
        if not self.rewards:
            # Nothing to learn from; drop any orphaned log-probs and skip
            self.saved_log_probs = []
            return 0.0

        # Discounted returns, accumulated back-to-front and then flipped
        # (appending + one reverse avoids repeated O(n) front inserts).
        returns = []
        running_return = 0.0
        for r in reversed(self.rewards):
            running_return = r + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32, device=device)
        # Standardize only when there are at least two returns: the sample
        # standard deviation of a single value is NaN and would corrupt the
        # policy weights.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        # Weight each step's negative log-probability by its return
        policy_loss = torch.cat(
            [-log_prob * ret for log_prob, ret in zip(self.saved_log_probs, returns)]
        ).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        # Clear episode data
        self.saved_log_probs = []
        self.rewards = []

        return policy_loss.item()

In [None]:
# Train REINFORCE
env = GridWorld(size=5)
reinforce_agent = REINFORCEAgent(state_size, action_size)
scores = []
losses = []

print("Training REINFORCE...")
for episode in range(300):
    state = env.reset()
    total_reward = 0

    while True:
        action = reinforce_agent.act(state)
        next_state, reward, done = env.step(action)
        reinforce_agent.store_reward(reward)
        total_reward += reward
        state = next_state

        if done:
            break

    # train() empties the agent's reward buffer, so the episode score is
    # tallied in the loop above rather than read from the agent afterwards.
    loss = reinforce_agent.train()
    losses.append(loss)
    scores.append(total_reward)

    if episode % 50 == 0:
        # A [-50:] slice already returns the whole list when it is shorter than 50
        avg_score = np.mean(scores[-50:])
        print(f"Episode {episode}, Average Score: {avg_score:.2f}")

## 4. Actor-Critic (A2C)

Actor-Critic methods combine the benefits of value-based and policy-based approaches.

In [None]:
class ActorCritic(nn.Module):
    """Shared two-layer trunk feeding a policy (actor) head and a value (critic) head."""

    def __init__(self, state_size, action_size, hidden_size=128):
        super().__init__()
        # Trunk shared by both heads
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)

        # Policy head: one logit per action
        self.actor = nn.Linear(hidden_size, action_size)

        # Value head: a single scalar per state
        self.critic = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return ``(action_probs, state_value)`` for ``x``."""
        features = F.relu(self.fc2(F.relu(self.fc1(x))))
        policy = F.softmax(self.actor(features), dim=-1)
        value = self.critic(features)
        return policy, value

class A2CAgent:
    """One-step advantage actor-critic agent that updates after every transition."""

    def __init__(self, state_size, action_size, lr=1e-3, gamma=0.99):
        self.gamma = gamma
        self.actor_critic = ActorCritic(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=lr)

    def act(self, state):
        """Sample an action index from the current policy without tracking gradients."""
        obs = torch.FloatTensor(state).unsqueeze(0).to(device)

        with torch.no_grad():
            probs = self.actor_critic(obs)[0]

        return torch.distributions.Categorical(probs).sample().item()

    def train_step(self, state, action, reward, next_state, done):
        """Update actor and critic from a single transition and return the loss as a float."""
        obs = torch.FloatTensor(state).unsqueeze(0).to(device)
        next_obs = torch.FloatTensor(next_state).unsqueeze(0).to(device)

        # Policy and value estimate for the state the action was taken in
        probs, value = self.actor_critic(obs)

        # One-step TD target; (1 - done) removes the bootstrap term on terminal steps
        with torch.no_grad():
            bootstrap = self.actor_critic(next_obs)[1]
            td_target = reward + self.gamma * bootstrap * (1 - done)

        # The advantage is treated as a constant in the actor's gradient
        advantage = (td_target - value).detach()

        taken = torch.tensor(action).to(device)
        log_prob = torch.distributions.Categorical(probs).log_prob(taken)
        actor_loss = -log_prob * advantage

        # The critic regresses its estimate onto the TD target
        critic_loss = F.mse_loss(value, td_target.detach())

        loss = actor_loss + critic_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

In [None]:
# Train A2C
env = GridWorld(size=5)
a2c_agent = A2CAgent(state_size, action_size)
scores = []
losses = []

print("Training A2C...")
for episode in range(300):
    state = env.reset()
    total_reward = 0
    episode_losses = []
    done = False

    # The agent learns online: one update per environment step
    while not done:
        action = a2c_agent.act(state)
        next_state, reward, done = env.step(action)

        loss = a2c_agent.train_step(state, action, reward, next_state, done)
        episode_losses.append(loss)

        state = next_state
        total_reward += reward

    scores.append(total_reward)
    losses.extend(episode_losses)

    if episode % 50 == 0:
        # Mean over the most recent 50 episodes (or all of them, if fewer)
        avg_score = np.mean(scores[-50:])
        print(f"Episode {episode}, Average Score: {avg_score:.2f}")

## 5. Comparison of Algorithms

Let's compare the performance of different RL algorithms.

In [None]:
# Train all algorithms and compare
algorithms = ['DQN', 'REINFORCE', 'A2C']
all_scores = {}  # algorithm name -> list of per-episode total rewards

# Train each algorithm from scratch on a fresh environment
for algo in algorithms:
    env = GridWorld(size=5)
    scores = []

    if algo == 'DQN':
        agent = DQNAgent(state_size, action_size)
    elif algo == 'REINFORCE':
        agent = REINFORCEAgent(state_size, action_size)
    else:  # A2C
        agent = A2CAgent(state_size, action_size)

    print(f"\nTraining {algo}...")
    for episode in range(200):
        state = env.reset()
        total_reward = 0
        done = False

        # The rollout is shared by all three agents; only the per-step
        # bookkeeping differs between them.
        while not done:
            action = agent.act(state)
            next_state, reward, done = env.step(action)

            if algo == 'REINFORCE':
                agent.store_reward(reward)
            elif algo == 'DQN':
                agent.remember(state, action, next_state, reward, done)
                if len(agent.memory) > 32:
                    agent.replay()
            else:  # A2C
                agent.train_step(state, action, reward, next_state, done)

            state = next_state
            total_reward += reward

        if algo == 'REINFORCE':
            # REINFORCE learns once per episode, from the full trajectory
            agent.train()
        elif algo == 'DQN' and episode % 10 == 0:
            # Sync the target network on the same schedule as the standalone
            # DQN run; without this DQN would bootstrap from its initial
            # random weights for the whole comparison.
            agent.update_target_network()

        scores.append(total_reward)

    all_scores[algo] = scores

In [None]:
# Visualize comparison
plt.figure(figsize=(12, 6))

# 20-episode moving average of each algorithm's scores
kernel = np.ones(20) / 20
for algo, scores in all_scores.items():
    smoothed = np.convolve(scores, kernel, mode='valid')
    plt.plot(smoothed, label=algo, linewidth=2)

plt.xlabel('Episode')
plt.ylabel('Average Score (20 episode window)')
plt.title('RL Algorithm Comparison on GridWorld')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Mean and standard deviation over each algorithm's last 50 episodes
print("\nFinal Performance (last 50 episodes):")
for algo, scores in all_scores.items():
    recent = scores[-50:]
    final_avg = np.mean(recent)
    final_std = np.std(recent)
    print(f"{algo}: {final_avg:.2f} ± {final_std:.2f}")

## 6. Advanced Concepts: Continuous Control

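Methods that pick the best of a finite set of actions, such as DQN, do not apply directly to continuous action spaces; these call for approaches such as DDPG or PPO. This section defines a toy continuous environment and the actor and critic networks that DDPG uses, but does not train them.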

In [None]:
# Continuous control environment simulation
class ContinuousControlEnv:
    """Toy 2-D point environment in which reward is higher the closer the point is to the origin."""

    def __init__(self):
        self.state = None
        self.reset()

    def reset(self):
        """Draw a new 2-D start position uniformly from [-1, 1) and return a copy of it."""
        self.state = np.random.uniform(-1, 1, size=2)
        return self.state.copy()

    def step(self, action):
        """Move the point by ``0.1 * action`` and return ``(next_state, reward, done)``.

        Args:
            action: Array-like displacement (ndarray, list or tuple) that
                broadcasts against the 2-D state.

        Returns:
            A copy of the new state, the reward (negative Euclidean distance
            from the origin) and a bool that is True once that distance
            exceeds 2.0.
        """
        # np.asarray lets plain lists and tuples through; multiplying a list
        # by 0.1 directly would raise TypeError.
        self.state += np.asarray(action, dtype=float) * 0.1

        # Compute the distance once and derive both reward and termination from it
        distance = np.linalg.norm(self.state)
        reward = -distance

        # Plain bool, matching what GridWorld.step returns
        done = bool(distance > 2.0)

        return self.state.copy(), reward, done

# DDPG Networks
class Actor(nn.Module):
    """Deterministic policy network: maps a state to an action bounded by ``max_action``."""

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, action_dim)
        self.max_action = max_action

    def forward(self, x):
        """Return an action with each component in ``[-max_action, max_action]``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # tanh squashes to [-1, 1]; scaling maps that onto the action range
        squashed = torch.tanh(self.fc3(hidden))
        return squashed * self.max_action

class Critic(nn.Module):
    """Q-network for continuous actions: scores a (state, action) pair with one scalar."""

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, state, action):
        """Return the Q-value estimate for each (state, action) row."""
        # State and action are joined along the feature dimension (dim 1)
        hidden = torch.cat([state, action], dim=1)
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

In [None]:
# Visualize continuous control task
env = ContinuousControlEnv()
states = []
actions = []

# Roll out a uniformly random policy for at most 50 steps
state = env.reset()
for _ in range(50):
    action = np.random.uniform(-1, 1, size=2)
    states.append(state.copy())
    actions.append(action)
    state, reward, done = env.step(action)
    if done:
        break

states = np.array(states)
xs, ys = states[:, 0], states[:, 1]

# Plot the visited positions with start, end and goal markers
plt.figure(figsize=(8, 8))
plt.plot(xs, ys, 'b-', alpha=0.6, label='Trajectory')
plt.scatter(xs[0], ys[0], c='green', s=100, marker='o', label='Start')
plt.scatter(xs[-1], ys[-1], c='red', s=100, marker='x', label='End')
plt.scatter(0, 0, c='gold', s=200, marker='*', label='Goal')
plt.xlim(-2.5, 2.5)
plt.ylim(-2.5, 2.5)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Continuous Control Task')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Summary and Best Practices

Let's summarize the key concepts and provide practical guidance.

In [None]:
# Create a summary visualization
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax_props, ax_support = axes[0, 0], axes[0, 1]
ax_samples, ax_tips = axes[1, 0], axes[1, 1]

# --- Heatmap of hand-assigned algorithm ratings (rows follow `algorithms`) ---
algorithms = ['DQN', 'REINFORCE', 'A2C', 'DDPG', 'PPO']
properties = ['Sample\nEfficiency', 'Stability', 'Continuous\nActions', 'Discrete\nActions']
scores = np.array([
    [4, 3, 1, 5],  # DQN
    [2, 2, 3, 4],  # REINFORCE
    [3, 4, 3, 4],  # A2C
    [4, 3, 5, 1],  # DDPG
    [4, 5, 5, 5],  # PPO
])

im = ax_props.imshow(scores, cmap='YlOrRd', aspect='auto')
ax_props.set_xticks(range(len(properties)))
ax_props.set_xticklabels(properties)
ax_props.set_yticks(range(len(algorithms)))
ax_props.set_yticklabels(algorithms)
ax_props.set_title('Algorithm Properties (1-5 scale)')
plt.colorbar(im, ax=ax_props)

# Write each rating into its cell (imshow puts columns on x and rows on y)
for (i, j), rating in np.ndenumerate(scores):
    ax_props.text(j, i, str(rating), ha='center', va='center')

# --- Grouped bars: support level per action-space type ---
action_types = ['Discrete', 'Continuous']
algo_support = {
    'DQN': [1, 0],
    'REINFORCE': [1, 0.7],
    'A2C': [1, 0.7],
    'DDPG': [0, 1],
    'PPO': [1, 1]
}

x = np.arange(len(algorithms))
width = 0.35

# The two bars of each group sit half a bar-width either side of the tick
for i, action_type in enumerate(action_types):
    values = [algo_support[algo][i] for algo in algorithms]
    ax_support.bar(x + i*width - width/2, values, width, label=action_type)

ax_support.set_xlabel('Algorithm')
ax_support.set_ylabel('Support Level')
ax_support.set_title('Action Space Support')
ax_support.set_xticks(x)
ax_support.set_xticklabels(algorithms)
ax_support.legend()
ax_support.grid(True, alpha=0.3)

# --- Bar chart of the sample-complexity figures listed below ---
sample_complexity = {
    'DQN': 1000,
    'REINFORCE': 5000,
    'A2C': 2000,
    'DDPG': 1500,
    'PPO': 1200
}

ax_samples.bar(algorithms, sample_complexity.values(), color='skyblue')
ax_samples.set_ylabel('Episodes to Convergence')
ax_samples.set_title('Sample Complexity (Lower is Better)')
ax_samples.grid(True, alpha=0.3, axis='y')

# --- Text panel: heading plus one line per practice, spaced 0.1 apart ---
ax_tips.text(0.1, 0.9, 'RL Best Practices:', fontsize=14, fontweight='bold', transform=ax_tips.transAxes)
practices = [
    '1. Start simple: Use DQN for discrete, DDPG for continuous',
    '2. Normalize observations and rewards',
    '3. Use experience replay for off-policy methods',
    '4. Implement proper exploration strategies',
    '5. Monitor multiple metrics during training',
    '6. Use stable baselines for production',
    '7. Consider PPO for robust performance',
    '8. Debug with simple environments first'
]

for i, practice in enumerate(practices):
    ax_tips.text(0.1, 0.8 - i*0.1, practice, fontsize=11, transform=ax_tips.transAxes)

ax_tips.axis('off')

plt.tight_layout()
plt.show()

## Conclusion

In this tutorial, we covered:

1. **Deep Q-Networks (DQN)**: Value-based method for discrete actions
2. **REINFORCE**: Simple policy gradient method
3. **Actor-Critic (A2C)**: Combining value and policy methods
4. **Continuous Control**: A toy continuous environment and DDPG-style actor/critic networks for continuous action spaces
5. **Best Practices**: Practical tips for successful RL

### Key Takeaways:
- RL is about learning from interaction
- Different algorithms suit different problems
- Exploration vs exploitation is crucial
- Start simple and scale up gradually
- Modern methods like PPO offer good stability

Reinforcement learning is a powerful paradigm for solving sequential decision-making problems!