In [None]:
# Application 5-1: Train a Robot to Cross a Room
# rl_grid_simulation.ipynb
# Cell 1: Setup and Initialization

import random
from IPython.display import display, HTML

# --- ENVIRONMENT PARAMETERS (Task 1: enlarge the state space via GRID_SIZE) ---
GRID_SIZE = 5                        # the room is a GRID_SIZE x GRID_SIZE square
START_POS = (0, 0)                   # (row, col) where every episode begins
GOAL_POS = (GRID_SIZE - 1,) * 2      # last row, last column -> (4, 4) when GRID_SIZE is 5
MAX_STEPS = 30                       # per-episode step budget, kept small for the demo

# --- ACTIONS (Task 3: restrict the action space by removing entries) ---
# Each action maps to (dr, dc): the change applied to (row, column).
ACTIONS = dict(
    UP=(-1, 0),
    DOWN=(1, 0),
    LEFT=(0, -1),
    RIGHT=(0, 1),
)
ACTION_KEYS = [*ACTIONS]             # action names in insertion order

# --- REWARDS (Task 2: change the reward function, e.g. a much larger penalty) ---
REWARDS = dict(
    GOAL=+10,        # stepping onto GOAL_POS
    MOVE=-1,         # any ordinary in-bounds step
    WALL_HIT=-5,     # attempting to leave the grid
)

# One call, newline-separated, so the summary prints exactly four lines.
print(
    "RL Environment Setup Complete:",
    f"Grid Size: {GRID_SIZE}x{GRID_SIZE}",
    f"Start: {START_POS}, Goal: {GOAL_POS}",
    f"Rewards: Goal={REWARDS['GOAL']}, Move={REWARDS['MOVE']}, Wall Hit={REWARDS['WALL_HIT']}",
    sep="\n",
)

RL Environment Setup Complete:
Grid Size: 5x5
Start: (0, 0), Goal: (4, 4)
Rewards: Goal=10, Move=-1, Wall Hit=-5


In [15]:
# Cell 2: Helper Functions (CORRECTED)

def draw_grid(robot_pos, history_log):
    """
    Render the grid as an HTML table and show it in the notebook.

    Args:
        robot_pos: (row, col) cell where the robot is drawn as 'R' in red.
        history_log: collection of previously occupied (row, col) cells;
            any of them that is not the robot's cell gets a light-yellow
            background.

    The start cell is labelled 'S' (blue) and the goal cell 'G' (green).
    The trail colour replaces the background of a visited cell but keeps
    its letter; the robot replaces both letter and colour.
    """
    base_style = ('width:40px; height:40px; text-align:center; font-weight:bold; '
                  'border:1px solid #ccc; font-size: 16px;')

    # Collect fragments and join once at the end.
    parts = ['<table style="border-collapse: collapse; border: 3px solid #333;">']

    for row in range(GRID_SIZE):
        parts.append('<tr>')
        for col in range(GRID_SIZE):
            cell = (row, col)
            cell_style = base_style

            if cell == robot_pos:
                # The robot wins over start/goal/trail decoration.
                symbol, fill = 'R', '#e74c3c'  # Red
                cell_style += 'transform: scale(1.1);'  # Highlight robot
            else:
                # Goal is tested first so it still wins if start and goal coincide.
                if cell == GOAL_POS:
                    symbol, fill = 'G', '#2ecc71'  # Green
                elif cell == START_POS:
                    symbol, fill = 'S', '#3498db'  # Blue
                else:
                    symbol, fill = '', '#f0f0f0'

                # Visited cells keep their letter but take the trail colour.
                if cell in history_log:
                    fill = '#f1c40f50'  # Light Yellow trail

            parts.append(f'<td style="{cell_style} background-color:{fill};">{symbol}</td>')
        parts.append('</tr>')

    parts.append('</table>')
    display(HTML(''.join(parts)))


def simulate_move(current_pos, action):
    """
    Apply one action to a position and report what happened.

    Args:
        current_pos: (row, col) the robot occupies before moving.
        action: a key of ACTIONS naming the direction to try.

    Returns:
        A (next_pos, reward, message) triple:
        - leaving the grid: (current_pos, REWARDS['WALL_HIT'], "Hit Wall"),
          i.e. the robot stays where it was;
        - landing on GOAL_POS: (target, REWARDS['GOAL'], "Reached Goal!");
        - any other in-bounds step: (target, REWARDS['MOVE'], "Moved").
    """
    row_step, col_step = ACTIONS[action]
    target = (current_pos[0] + row_step, current_pos[1] + col_step)

    # Both coordinates must fall in [0, GRID_SIZE); otherwise the move is blocked.
    stays_inside = all(0 <= coord < GRID_SIZE for coord in target)
    if not stays_inside:
        return current_pos, REWARDS['WALL_HIT'], "Hit Wall"

    # An in-bounds move is either the winning step or an ordinary one.
    if target == GOAL_POS:
        reward_key, message = 'GOAL', "Reached Goal!"
    else:
        reward_key, message = 'MOVE', "Moved"
    return target, REWARDS[reward_key], message

In [16]:
# Cell 3: Run the Simulation and Generate Synthetic Data

def run_rl_simulation():
    """
    Play a single episode in which every action is picked uniformly at random.

    The robot starts at START_POS and keeps stepping until it stands on
    GOAL_POS or has used MAX_STEPS steps. Each step is printed, drawn with
    draw_grid, and appended to the experience log.

    Returns:
        A list with one tuple per step, in order:
        (step number, position before, action, position after, reward, outcome message).
    """
    print("--- STARTING EPISODE (RANDOM EXPLORATION) ---")

    position = START_POS
    experience_log = []   # the synthetic data collected during the episode
    visited = []          # positions already left behind, used only for drawing
    steps_taken = 0

    # Stop as soon as the goal is reached or the step budget is spent.
    while not (position == GOAL_POS or steps_taken >= MAX_STEPS):
        steps_taken += 1

        # Naive policy: no learning, just a random pick among the action names.
        chosen = random.choice(ACTION_KEYS)
        landed, reward, outcome_msg = simulate_move(position, chosen)

        # Record the transition before anything else touches the state.
        experience_log.append(
            (steps_taken, position, chosen, landed, reward, outcome_msg)
        )

        # Show the step: the cell being left joins the trail, the robot is
        # drawn where it ended up.
        print(f"\nStep {steps_taken}: Action={chosen}, Reward={reward}")
        visited.append(position)
        draw_grid(landed, visited)

        position = landed

    print("\n--- EPISODE FINISHED ---")
    verdict = (
        f"SUCCESS! Robot reached the goal in {steps_taken} steps."
        if position == GOAL_POS
        else f"FAILED. Robot ran out of steps ({MAX_STEPS} max)."
    )
    print(verdict)

    return experience_log

# Run one random-exploration episode and keep its experience log: a list of
# (step, current_pos, action, next_pos, reward, outcome) tuples, one per step.
synthetic_experience = run_rl_simulation()

--- STARTING EPISODE (RANDOM EXPLORATION) ---

Step 1: Action=LEFT, Reward=-5


0,1,2,3,4
R,,,,
,,,,
,,,,
,,,,
,,,,G



Step 2: Action=UP, Reward=-5


0,1,2,3,4
R,,,,
,,,,
,,,,
,,,,
,,,,G



Step 3: Action=LEFT, Reward=-5


0,1,2,3,4
R,,,,
,,,,
,,,,
,,,,
,,,,G



Step 4: Action=RIGHT, Reward=-1


0,1,2,3,4
S,R,,,
,,,,
,,,,
,,,,
,,,,G



Step 5: Action=LEFT, Reward=-1


0,1,2,3,4
R,,,,
,,,,
,,,,
,,,,
,,,,G



Step 6: Action=LEFT, Reward=-5


0,1,2,3,4
R,,,,
,,,,
,,,,
,,,,
,,,,G



Step 7: Action=LEFT, Reward=-5


0,1,2,3,4
R,,,,
,,,,
,,,,
,,,,
,,,,G



Step 8: Action=RIGHT, Reward=-1


0,1,2,3,4
S,R,,,
,,,,
,,,,
,,,,
,,,,G



Step 9: Action=UP, Reward=-5


0,1,2,3,4
S,R,,,
,,,,
,,,,
,,,,
,,,,G



Step 10: Action=RIGHT, Reward=-1


0,1,2,3,4
S,,R,,
,,,,
,,,,
,,,,
,,,,G



Step 11: Action=RIGHT, Reward=-1


0,1,2,3,4
S,,,R,
,,,,
,,,,
,,,,
,,,,G



Step 12: Action=UP, Reward=-5


0,1,2,3,4
S,,,R,
,,,,
,,,,
,,,,
,,,,G



Step 13: Action=LEFT, Reward=-1


0,1,2,3,4
S,,R,,
,,,,
,,,,
,,,,
,,,,G



Step 14: Action=UP, Reward=-5


0,1,2,3,4
S,,R,,
,,,,
,,,,
,,,,
,,,,G



Step 15: Action=LEFT, Reward=-1


0,1,2,3,4
S,R,,,
,,,,
,,,,
,,,,
,,,,G



Step 16: Action=DOWN, Reward=-1


0,1,2,3,4
S,,,,
,R,,,
,,,,
,,,,
,,,,G



Step 17: Action=LEFT, Reward=-1


0,1,2,3,4
S,,,,
R,,,,
,,,,
,,,,
,,,,G



Step 18: Action=LEFT, Reward=-5


0,1,2,3,4
S,,,,
R,,,,
,,,,
,,,,
,,,,G



Step 19: Action=RIGHT, Reward=-1


0,1,2,3,4
S,,,,
,R,,,
,,,,
,,,,
,,,,G



Step 20: Action=DOWN, Reward=-1


0,1,2,3,4
S,,,,
,,,,
,R,,,
,,,,
,,,,G



Step 21: Action=UP, Reward=-1


0,1,2,3,4
S,,,,
,R,,,
,,,,
,,,,
,,,,G



Step 22: Action=DOWN, Reward=-1


0,1,2,3,4
S,,,,
,,,,
,R,,,
,,,,
,,,,G



Step 23: Action=UP, Reward=-1


0,1,2,3,4
S,,,,
,R,,,
,,,,
,,,,
,,,,G



Step 24: Action=DOWN, Reward=-1


0,1,2,3,4
S,,,,
,,,,
,R,,,
,,,,
,,,,G



Step 25: Action=UP, Reward=-1


0,1,2,3,4
S,,,,
,R,,,
,,,,
,,,,
,,,,G



Step 26: Action=DOWN, Reward=-1


0,1,2,3,4
S,,,,
,,,,
,R,,,
,,,,
,,,,G



Step 27: Action=RIGHT, Reward=-1


0,1,2,3,4
S,,,,
,,,,
,,R,,
,,,,
,,,,G



Step 28: Action=UP, Reward=-1


0,1,2,3,4
S,,,,
,,R,,
,,,,
,,,,
,,,,G



Step 29: Action=RIGHT, Reward=-1


0,1,2,3,4
S,,,,
,,,R,
,,,,
,,,,
,,,,G



Step 30: Action=DOWN, Reward=-1


0,1,2,3,4
S,,,,
,,,,
,,,R,
,,,,
,,,,G



--- EPISODE FINISHED ---
FAILED. Robot ran out of steps (30 max).


In [17]:
# Cell 4: Display the Generated Synthetic Data Log
# Prints the recorded episode as a pipe-delimited table, one row per step.

print("\n--- SYNTHETIC DATA LOG (EXPERIENCE TABLE) ---")

# Column titles, followed by a '---' divider cell for each column.
header = ["Step", "Action", "Current Pos", "Next Pos", "Reward", "Outcome"]
print("|", " | ".join(header), "|")
print("|" + "|".join(["---"] * len(header)) + "|")

# Minimum left-aligned width of each column, in header order.
column_widths = (4, 6, 11, 9, 6, 11)

# Each record is stored as (step, current pos, action, next pos, reward, outcome);
# the table shows the action before the current position.
for step, c_pos, action, n_pos, reward, outcome in synthetic_experience:
    cells = (step, action, str(c_pos), str(n_pos), reward, outcome)
    padded = (format(value, f"<{width}") for value, width in zip(cells, column_widths))
    print("|", " | ".join(padded), "|")


--- SYNTHETIC DATA LOG (EXPERIENCE TABLE) ---
| Step | Action | Current Pos | Next Pos | Reward | Outcome |
|---|---|---|---|---|---|
| 1    | LEFT   | (0, 0)      | (0, 0)    | -5     | Hit Wall    |
| 2    | UP     | (0, 0)      | (0, 0)    | -5     | Hit Wall    |
| 3    | LEFT   | (0, 0)      | (0, 0)    | -5     | Hit Wall    |
| 4    | RIGHT  | (0, 0)      | (0, 1)    | -1     | Moved       |
| 5    | LEFT   | (0, 1)      | (0, 0)    | -1     | Moved       |
| 6    | LEFT   | (0, 0)      | (0, 0)    | -5     | Hit Wall    |
| 7    | LEFT   | (0, 0)      | (0, 0)    | -5     | Hit Wall    |
| 8    | RIGHT  | (0, 0)      | (0, 1)    | -1     | Moved       |
| 9    | UP     | (0, 1)      | (0, 1)    | -5     | Hit Wall    |
| 10   | RIGHT  | (0, 1)      | (0, 2)    | -1     | Moved       |
| 11   | RIGHT  | (0, 2)      | (0, 3)    | -1     | Moved       |
| 12   | UP     | (0, 3)      | (0, 3)    | -5     | Hit Wall    |
| 13   | LEFT   | (0, 3)      | (0, 2)    | -1     | Moved   