## Implementation Steps: 

1. Setup the Environment: Define the game environment with restricted view.
2. Define the Neural Networks: Implement the PAN and STN using PyTorch.
3. Training the Networks: Train PAN to evaluate points and STN to make movement decisions.
4. Integration: Integrate the trained networks into the game loop for decision-making.

In [27]:
class GameEnvironment:
    def __init__(self, grid_height, grid_width, max_balls, restricted_view):
        self.grid_height = grid_height
        self.grid_width = grid_width
        self.max_balls = max_balls
        self.restricted_view = restricted_view
        self.grid = np.zeros((grid_height, grid_width), dtype=np.uint8)
        self.agent_pos = grid_width // 2  # Start in the middle of the grid

    def reset(self):
        self.grid = np.zeros((self.grid_height, self.grid_width), dtype=np.uint8)
        self.balls = []  # To track balls store them as a list of tuples (row, col, type)
        self.score = 0
        self.caught_balls = 0
        self.grid[-1, self.agent_pos] = 255  # Use the bottom row for the agent
        return self.grid.copy()
    
    def step(self, action):
        # Move agent based on action
        if action == 1 and self.agent_pos > 0:  # Move left
            self.agent_pos -= 1
        elif action == 2 and self.agent_pos < self.grid_width - 1:  # Move right
            self.agent_pos += 1
        # Note: action == 0 means stay, so no movement code is needed for that case

        # Generate new balls at the top of the grid
        num_new_balls = np.random.randint(1, self.max_balls + 1)
        for _ in range(num_new_balls):
            ball_col = np.random.randint(0, self.grid_width)
            ball_type = np.random.choice([1, 2])  # 1 for red, 2 for blue
            self.balls.append([0, ball_col, ball_type])  # Append new ball at top row with random column

        # Move existing balls down one row and prepare for checking catches
        balls_to_remove = []
        for ball in self.balls:
            ball[0] += 1  # Move ball down one row
            if ball[0] >= self.grid_height:  # Ensure ball does not go out of bounds
                balls_to_remove.append(ball)
                continue
            if ball[0] == self.grid_height - 1:  # Ball is exactly at the bottom row
                if ball[1] == self.agent_pos:  # And in the same column as the agent
                    self.caught_balls += 1
                    self.score += 20 if ball[2] == 2 else 10  # Update score based on ball type
                    balls_to_remove.append(ball)
        
        # Remove balls that have reached the bottom or been caught
        for ball in balls_to_remove:
            self.balls.remove(ball)

        # Clear grid and redraw
        self.grid = np.zeros((self.grid_height, self.grid_width), dtype=np.uint8)
        for ball in self.balls:
            self.grid[ball[0], ball[1]] = 1 if ball[2] == 1 else 2  # 1 for red, 2 for blue
        self.grid[-1, self.agent_pos] = 255  # Redraw agent

        return self.grid.copy(), self.score, self.caught_balls

    def get_restricted_view(self):
        left_bound = max(0, self.agent_pos - self.restricted_view)
        right_bound = min(self.grid_width, self.agent_pos + self.restricted_view + 1)
        view = self.grid[:, left_bound:right_bound]

        # If the view is smaller than expected, pad it
        if view.shape[1] < (2 * self.restricted_view + 1):
            pad_width = (2 * self.restricted_view + 1) - view.shape[1]
            if left_bound == 0:
                view = np.pad(view, ((0, 0), (pad_width, 0)), mode='constant', constant_values=0)
            elif right_bound == self.grid_width:
                view = np.pad(view, ((0, 0), (0, pad_width)), mode='constant', constant_values=0)

        return view

env = GameEnvironment(grid_height=6, grid_width=10, max_balls=3, restricted_view=2)


2. Define the neural networks

In [28]:
class PointAssessmentNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(PointAssessmentNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

class SectionTransitionNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SectionTransitionNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.softmax(self.fc2(x), dim=-1)  # Output probabilities for actions: [stay, move left, move right]
        return x

# Calculate the correct input size based on the restricted view and grid height
restricted_view = env.restricted_view
input_size = (2 * restricted_view + 1) * env.grid_height
hidden_size = 128
output_size_pan = 1
output_size_stn = 3

print(f"Input size for the neural networks: {input_size}")

pan = PointAssessmentNetwork(input_size, hidden_size, output_size_pan)
stn = SectionTransitionNetwork(input_size, hidden_size, output_size_stn)

Input size for the neural networks: 30


3. Training the networks

In [29]:
# Training loop for PAN and STN
def train_pan(env, pan, episodes=1000):
    optimizer = optim.Adam(pan.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    for episode in range(episodes):
        env.reset()
        total_loss = 0
        for _ in range(10):  # Number of steps per episode
            state = env.get_restricted_view()
            state_flat = state.flatten()
            state_tensor = torch.FloatTensor(state_flat)
            
            target_points = torch.FloatTensor([random.uniform(0, 100)])  # Random target points for example
            
            optimizer.zero_grad()
            output = pan(state_tensor)
            loss = loss_fn(output, target_points)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        if episode % 100 == 0:
            print(f"Episode {episode}, Loss: {total_loss / 10}")

def train_stn(env, stn, episodes=1000):
    optimizer = optim.Adam(stn.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    for episode in range(episodes):
        env.reset()
        total_loss = 0
        for _ in range(10):  # Number of steps per episode
            state = env.get_restricted_view()
            state_flat = state.flatten()
            state_tensor = torch.FloatTensor(state_flat)
            
            target_action = torch.LongTensor([random.choice([0, 1, 2])])  # Random action
            
            optimizer.zero_grad()
            output = stn(state_tensor)
            loss = loss_fn(output.unsqueeze(0), target_action)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        if episode % 100 == 0:
            print(f"Episode {episode}, Loss: {total_loss / 10}")

train_pan(env, pan)
train_stn(env, stn)


Episode 0, Loss: 2307.4967072457075
Episode 100, Loss: 2375.699955800269
Episode 200, Loss: 4032.4110695406794
Episode 300, Loss: 2832.6987617492678
Episode 400, Loss: 4412.2582347869875
Episode 500, Loss: 3517.6521265983583
Episode 600, Loss: 2914.7560592651366
Episode 700, Loss: 3213.670837831497
Episode 800, Loss: 1931.8686107724905
Episode 900, Loss: 2594.632898235321
Episode 0, Loss: 1.0880303740501405
Episode 100, Loss: 1.2514447510242461
Episode 200, Loss: 1.1514447450637817
Episode 300, Loss: 1.3514447569847108
Episode 400, Loss: 1.0514447391033173
Episode 500, Loss: 1.5514447689056396
Episode 600, Loss: 1.2514447510242461
Episode 700, Loss: 1.2514447510242461
Episode 800, Loss: 1.0514447391033173
Episode 900, Loss: 1.3514447569847108


4. Integration into Game Loop

In [31]:
env = GameEnvironment(grid_height=6, grid_width=10, max_balls=3, restricted_view=2)

# Ensure the grid size and restricted view settings are consistent
print(f"Grid height: {env.grid_height}, Grid width: {env.grid_width}, Restricted view: {env.restricted_view}")


Grid height: 6, Grid width: 10, Restricted view: 2


In [35]:
def play_game_with_nn(env, pan, stn, episodes=10, max_steps=100):
    for episode in range(episodes):
        state = env.reset()
        total_score = 0
        step_count = 0
        done = False
        
        while not done and step_count < max_steps:
            restricted_view = env.get_restricted_view()
            state_flat = restricted_view.flatten()
            state_tensor = torch.FloatTensor(state_flat)
            
            # Print the shape of the flattened state tensor
            print(f"Shape of the flattened state tensor: {state_tensor.shape}")
            
            # Use PAN to assess points
            points = pan(state_tensor).item()
            
            # Use STN to decide action
            action_probs = stn(state_tensor).detach().numpy()
            action = np.argmax(action_probs)
            
            # Take action in the environment
            state, score, caught_balls = env.step(action)
            total_score += score
            step_count += 1
            
            # Check if the episode is done
            done = len(env.balls) == 0  # Example condition to end episode

        print(f"Episode {episode}, Total Score: {total_score}, Steps: {step_count}")

play_game_with_nn(env, pan, stn)


Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state tensor: torch.Size([30])
Shape of the flattened state

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque

class GameEnvironment:
    def __init__(self, grid_height, grid_width, max_balls, restricted_view):
        self.grid_height = grid_height
        self.grid_width = grid_width
        self.max_balls = max_balls
        self.restricted_view = restricted_view
        self.grid = np.zeros((grid_height, grid_width), dtype=np.uint8)
        self.agent_pos = grid_width // 2  # Start in the middle of the grid

    def reset(self):
        self.grid = np.zeros((self.grid_height, self.grid_width), dtype=np.uint8)
        self.balls = []  # To track balls store them as a list of tuples (row, col, type)
        self.score = 0
        self.caught_balls = 0
        self.grid[-1, self.agent_pos] = 255  # Use the bottom row for the agent
        return self.grid.copy()
    
    def step(self, action):
        # Move agent based on action
        if action == 1 and self.agent_pos > 0:  # Move left
            self.agent_pos -= 1
        elif action == 2 and self.agent_pos < self.grid_width - 1:  # Move right
            self.agent_pos += 1
        # Note: action == 0 means stay, so no code is needed for that case

        # Generate new balls at the top of the grid
        num_new_balls = np.random.randint(1, self.max_balls + 1)
        for _ in range(num_new_balls):
            ball_col = np.random.randint(0, self.grid_width)
            ball_type = np.random.choice([1, 2])  # 1 for red, 2 for blue
            self.balls.append([0, ball_col, ball_type])  # Append new ball at top row with random column

        # Move existing balls down one row and prepare for checking catches
        balls_to_remove = []
        for ball in self.balls:
            ball[0] += 1  # Move ball down one row
            if ball[0] >= self.grid_height:  # Ensure ball does not go out of bounds
                balls_to_remove.append(ball)
                continue
            if ball[0] == self.grid_height - 1:  # Ball is exactly at the bottom row
                if ball[1] == self.agent_pos:  # And in the same column as the agent
                    self.caught_balls += 1
                    self.score += 20 if ball[2] == 2 else 10  # Update score based on ball type
                    balls_to_remove.append(ball)
        
        # Remove balls that have reached the bottom or been caught
        for ball in balls_to_remove:
            self.balls.remove(ball)

        # Clear grid and redraw
        self.grid = np.zeros((self.grid_height, self.grid_width), dtype=np.uint8)
        for ball in self.balls:
            self.grid[ball[0], ball[1]] = 1 if ball[2] == 1 else 2  # 1 for red, 2 for blue
        self.grid[-1, self.agent_pos] = 255  # Redraw agent

        return self.grid.copy(), self.score, self.caught_balls

    def get_restricted_view(self):
        left_bound = max(0, self.agent_pos - self.restricted_view)
        right_bound = min(self.grid_width, self.agent_pos + self.restricted_view + 1)
        view = self.grid[:, left_bound:right_bound]

        # If the view is smaller than expected, pad it
        if view.shape[1] < (2 * self.restricted_view + 1):
            pad_width = (2 * self.restricted_view + 1) - view.shape[1]
            if left_bound == 0:
                view = np.pad(view, ((0, 0), (pad_width, 0)), mode='constant', constant_values=0)
            elif right_bound == self.grid_width:
                view = np.pad(view, ((0, 0), (0, pad_width)), mode='constant', constant_values=0)

        return view

class PointAssessmentNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=1):
        super(PointAssessmentNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

class SectionTransitionNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=3):
        super(SectionTransitionNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.softmax(self.fc2(x), dim=-1)
        return x

def train_pan(env, pan, episodes=1000, batch_size=64, lr=0.001):
    optimizer = optim.Adam(pan.parameters(), lr=lr)
    memory = deque(maxlen=10000)
    
    def compute_loss(batch):
        states = []
        targets = []
        for state, target in batch:
            states.append(state)
            targets.append(target)
        
        states = torch.FloatTensor(states)
        targets = torch.FloatTensor(targets)
        
        predictions = pan(states).squeeze()
        loss = F.mse_loss(predictions, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        return loss
    
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        
        while not done and steps < 100:
            restricted_view = env.get_restricted_view()
            state_flat = restricted_view.flatten()
            
            # Simulate the target points for training
            target_points = np.random.uniform(0, 100)  # target calculation
            
            memory.append((state_flat, target_points))
            
            if len(memory) >= batch_size:
                batch = random.sample(memory, batch_size)
                compute_loss(batch)
            
            action = random.choice([0, 1, 2])
            next_state, reward, _ = env.step(action)
            total_reward += reward
            done = len(env.balls) == 0
            state = next_state
            steps += 1
        
        if episode % 100 == 0:
            print(f"PAN Training - Episode {episode}, Total Reward: {total_reward}")

def train_stn(env, pan, stn, episodes=1000, batch_size=64, lr=0.001):
    optimizer = optim.Adam(stn.parameters(), lr=lr)
    memory = deque(maxlen=10000)
    
    def compute_loss(batch):
        states = []
        actions = []
        for state, action in batch:
            states.append(state)
            actions.append(action)
        
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        
        predictions = stn(states)
        loss = F.cross_entropy(predictions, actions)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        return loss
    
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        
        while not done and steps < 100:
            restricted_view = env.get_restricted_view()
            state_flat = restricted_view.flatten()
            points = pan(torch.FloatTensor(state_flat)).item()
            
            # Create the input for STN (points + steps to move to next section)
            stn_input = np.append(state_flat, points)
            target_action = random.choice([0, 1, 2])  # action calculation
            
            memory.append((stn_input, target_action))
            
            if len(memory) >= batch_size:
                batch = random.sample(memory, batch_size)
                compute_loss(batch)
            
            action = target_action
            next_state, reward, _ = env.step(action)
            total_reward += reward
            done = len(env.balls) == 0
            state = next_state
            steps += 1
        
        if episode % 100 == 0:
            print(f"STN Training - Episode {episode}, Total Reward: {total_reward}")

# Define environment
env = GameEnvironment(grid_height=6, grid_width=10, max_balls=3, restricted_view=2)

# Define and train Point Assessment Network (PAN)
input_size_pan = (2 * env.restricted_view + 1) * env.grid_height
hidden_size_pan = 128
pan = PointAssessmentNetwork(input_size_pan, hidden_size_pan)

train_pan(env, pan)

# Define and train Section Transition Network (STN)
input_size_stn = input_size_pan + 1  # Adding 1 for the points from PAN
hidden_size_stn = 128
output_size_stn = 3
stn = SectionTransitionNetwork(input_size_stn, hidden_size_stn, output_size_stn)

train_stn(env, pan, stn)


PAN Training - Episode 0, Total Reward: 10590
PAN Training - Episode 100, Total Reward: 16020
PAN Training - Episode 200, Total Reward: 13740
PAN Training - Episode 300, Total Reward: 11870
PAN Training - Episode 400, Total Reward: 16040
PAN Training - Episode 500, Total Reward: 15150
PAN Training - Episode 600, Total Reward: 14430
PAN Training - Episode 700, Total Reward: 22560
PAN Training - Episode 800, Total Reward: 13720
PAN Training - Episode 900, Total Reward: 14000
STN Training - Episode 0, Total Reward: 12830
STN Training - Episode 100, Total Reward: 17530
STN Training - Episode 200, Total Reward: 16860
STN Training - Episode 300, Total Reward: 15530
STN Training - Episode 400, Total Reward: 5990
STN Training - Episode 500, Total Reward: 7840
STN Training - Episode 600, Total Reward: 19180
STN Training - Episode 700, Total Reward: 16970
STN Training - Episode 800, Total Reward: 5940
STN Training - Episode 900, Total Reward: 22070


In [40]:
import torch.nn.functional as F

class QNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

input_size = (2 * env.restricted_view + 1) * env.grid_height
hidden_size = 128
output_size = 3  # Stay, Move Left, Move Right

q_network = QNetwork(input_size, hidden_size, output_size)
target_q_network = QNetwork(input_size, hidden_size, output_size)
target_q_network.load_state_dict(q_network.state_dict())
target_q_network.eval()


QNetwork(
  (fc1): Linear(in_features=30, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=3, bias=True)
)

In [None]:
import random
from collections import deque

def train_dqn(env, q_network, target_q_network, episodes=1000, batch_size=64, gamma=0.99, lr=0.001, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.995):
    optimizer = optim.Adam(q_network.parameters(), lr=lr)
    memory = deque(maxlen=10000)
    epsilon = epsilon_start
    
    def get_action(state, epsilon):
        if random.random() < epsilon:
            return random.choice([0, 1, 2])  # Random action (explore)
        else:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                q_values = q_network(state_tensor)
                return q_values.argmax().item()  # Best action (exploit)
    
    def compute_td_loss(batch):
        states, actions, rewards, next_states, dones = zip(*batch)
        
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        q_values = q_network(states)
        next_q_values = target_q_network(next_states)
        
        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]
        expected_q_value = rewards + gamma * next_q_value * (1 - dones)
        
        loss = F.mse_loss(q_value, expected_q_value.detach())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        return loss
    
    for episode in range(episodes):
        state = env.reset().flatten()
        total_reward = 0
        done = False
        
        while not done:
            action = get_action(state, epsilon)
            next_state, reward, caught_balls = env.step(action)
            next_state_flat = next_state.flatten()
            total_reward += reward
            done = len(env.balls) == 0  # Condition to end episode
            
            memory.append((state, action, reward, next_state_flat, done))
            state = next_state_flat
            
            if len(memory) >= batch_size:
                batch = random.sample(memory, batch_size)
                compute_td_loss(batch)
        
        epsilon = max(epsilon_end, epsilon_decay * epsilon)
        
        if episode % 10 == 0:
            target_q_network.load_state_dict(q_network.state_dict())
            print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon}")

train_dqn(env, q_network, target_q_network)
