# In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

def distance_to_hoop(x, y):
    """Return the Euclidean distance from the point (x, y) to the point (30, 8)."""
    dx = x - 30
    dy = y - 8
    return np.sqrt(dx ** 2 + dy ** 2)

class PolicyNetwork(nn.Module):
    """Three-layer MLP mapping a 2-feature state to a bounded (speed, angle) pair.

    The raw outputs are squashed with a sigmoid and scaled so that the first
    component lies in (0, 30) and the second in (0, pi / 2).
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 2)  # Output: speed and angle
        # Upper bounds of the two outputs. Kept as a non-persistent buffer so it
        # follows the module across .to(device) calls without being rebuilt on
        # every forward pass and without adding a key to the state_dict.
        self.register_buffer(
            "action_scale",
            torch.tensor([30.0, np.pi / 2]),
            persistent=False,
        )

    def forward(self, x):
        """Return the scaled (speed, angle) output for state tensor ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x)) * self.action_scale

class REINFORCE():
    """Minimal REINFORCE (Monte-Carlo policy gradient) agent for the shot task.

    The agent buffers one log-probability per sampled action in ``log_probs``;
    the caller appends the matching reward to ``rewards`` and then calls
    ``update_policy`` to take one gradient step and clear both buffers.
    """

    def __init__(self, lr=0.01, gamma=0.99):
        """Create the policy network and its Adam optimizer.

        Args:
            lr: Learning rate passed to Adam.
            gamma: Discount factor used when accumulating returns.
        """
        self.policy = PolicyNetwork()
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        """Sample a (speed, angle) action for ``state`` and record its log-prob.

        The policy output is used as the mean of a Gaussian with fixed standard
        deviations (1.0 for speed, 0.1 for angle).

        Returns:
            The sampled action as a NumPy array of two values.
        """
        state = torch.tensor(state, dtype=torch.float32)
        mean_action = self.policy(state)
        dist = torch.distributions.Normal(mean_action, torch.tensor([1.0, 0.1]))
        sampled_action = dist.sample()
        # Joint log-probability of both action components.
        self.log_probs.append(dist.log_prob(sampled_action).sum())
        return sampled_action.detach().numpy()

    def update_policy(self):
        """Take one policy-gradient step from the buffered episode, then reset.

        With no usable (log-prob, reward) pair the buffers are cleared and no
        optimizer step is taken, instead of failing on a non-tensor loss.
        """
        steps = min(len(self.log_probs), len(self.rewards))
        if steps == 0:
            self.log_probs = []
            self.rewards = []
            return

        # Discounted returns, accumulated back-to-front in O(n).
        discounted = []
        running = 0.0
        for reward in reversed(self.rewards):
            running = reward + self.gamma * running
            discounted.append(running)
        discounted.reverse()
        returns = torch.tensor(discounted, dtype=torch.float32)

        # Pair log-probs and returns positionally; any surplus is ignored.
        loss = -(torch.stack(self.log_probs[:steps]) * returns[:steps]).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.log_probs = []
        self.rewards = []

def _segment_hits_box(x0, z0, x1, z1, cx, cz, half):
    """Return True if the segment (x0, z0)-(x1, z1) enters the open square of half-width ``half`` centred on (cx, cz)."""
    t_lo, t_hi = 0.0, 1.0
    for start, end, centre in ((x0, x1, cx), (z0, z1, cz)):
        delta = end - start
        if not np.isfinite(delta):
            # NaN/inf trajectories can never register a hit.
            return False
        if delta == 0:
            if not abs(start - centre) < half:
                return False
            continue
        t_a = (centre - half - start) / delta
        t_b = (centre + half - start) / delta
        t_lo = max(t_lo, min(t_a, t_b))
        t_hi = min(t_hi, max(t_a, t_b))
    return t_lo < t_hi


def simulate_shot(v, alpha, hoop_x=30.0, hoop_z=3.05, tol=0.1, dt=0.01):
    """Simulate a shot launched from x = 0, height 1.8 and report hit or miss.

    The ball is advanced with fixed time steps under gravity (9.8). A hit is
    registered when the path travelled during a step passes within ``tol`` of
    the hoop in both x and z. Checking the whole step (not only its end
    point) stops fast shots from skipping over the hoop window between two
    samples.

    Args:
        v: Launch speed.
        alpha: Launch angle in radians above the horizontal.
        hoop_x: Horizontal position of the hoop.
        hoop_z: Height of the hoop.
        tol: Half-width of the square hit window around the hoop.
        dt: Integration time step.

    Returns:
        1.0 on a hit, -1.0 if the ball reaches the ground without a hit.
    """
    x, z = 0.0, 1.8  # Initial position
    vh = v * np.cos(alpha)
    vv = v * np.sin(alpha)
    while z > 0:
        prev_x, prev_z = x, z
        x += vh * dt
        z += vv * dt
        vv -= 9.8 * dt
        if _segment_hits_box(prev_x, prev_z, x, z, hoop_x, hoop_z, tol):
            return 1.0  # Success
    return -1.0  # Failure

def train(agent, episodes=500):
    """Run ``episodes`` single-shot episodes, updating the agent after each one.

    Every episode starts from the same fixed state, asks the agent for one
    action, scores it with ``simulate_shot`` and immediately applies a policy
    update.
    """
    for _episode in range(episodes):
        start = np.array([0, 1.8])
        shot = agent.select_action(start)
        speed, angle = shot[0], shot[1]
        outcome = simulate_shot(speed, angle)
        agent.rewards.append(outcome)
        agent.update_policy()



# In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

def distance_to_hoop(x, y):
    """Euclidean distance between (x, y) and the fixed point (30, 8)."""
    squared = (x - 30) ** 2 + (y - 8) ** 2
    return np.sqrt(squared)

class DecisionNetwork(nn.Module):
    """Two-layer MLP turning a 1-feature state into a value in (0, 1).

    The agent in this cell uses the output as the probability of moving.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 16)
        self.fc2 = nn.Linear(16, 1)

    def forward(self, x):
        """Return the sigmoid-squashed output for state tensor ``x``."""
        hidden = F.relu(self.fc1(x))
        logit = self.fc2(hidden)
        return torch.sigmoid(logit)

class MovementNetwork(nn.Module):
    """Two-layer MLP turning a 1-feature state into a move distance in (5, 15)."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 16)
        self.fc2 = nn.Linear(16, 1)

    def forward(self, x):
        """Return the move distance for state tensor ``x``."""
        hidden = F.relu(self.fc1(x))
        unit = torch.sigmoid(self.fc2(hidden))
        # Rescale (0, 1) to a move of between 5 and 15 units.
        return unit * 10 + 5

class ShootingNetwork(nn.Module):
    """Three-layer MLP mapping a 1-feature state to a bounded (speed, angle) pair.

    The first output lies in (0, 30) and the second in (0, pi / 2).
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 2)  # Output: speed and angle
        # Output upper bounds, stored as a non-persistent buffer: it moves with
        # the module across devices, is created once rather than per forward
        # call, and does not change the state_dict layout.
        self.register_buffer(
            "action_scale",
            torch.tensor([30.0, np.pi / 2]),
            persistent=False,
        )

    def forward(self, x):
        """Return the scaled (speed, angle) output for state tensor ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x)) * self.action_scale

class REINFORCE():
    """REINFORCE agent that first decides whether to move or shoot.

    Three networks share one optimizer: a decision network (probability of
    moving), a movement network (how far to move) and a shooting network
    (mean speed and angle of the shot). Every call to ``select_action``
    appends exactly one log-probability to ``log_probs``; the caller appends
    the matching reward to ``rewards`` before calling ``update_policy``.
    """

    def __init__(self, lr=0.01, gamma=0.99):
        """Build the three policy networks and a single Adam optimizer.

        Args:
            lr: Learning rate passed to Adam.
            gamma: Discount factor used when accumulating returns.
        """
        self.decision_policy = DecisionNetwork()
        self.movement_policy = MovementNetwork()
        self.shooting_policy = ShootingNetwork()
        self.optimizer = optim.Adam(
            list(self.decision_policy.parameters()) +
            list(self.movement_policy.parameters()) +
            list(self.shooting_policy.parameters()), lr=lr)
        self.gamma = gamma
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        """Sample a move-or-shoot action for the one-element ``state``.

        Returns:
            ``("move", new_x)`` with the new position as a float, or
            ``("shoot", shot)`` with ``shot`` a NumPy array (speed, angle).
        """
        state = torch.tensor(state, dtype=torch.float32)

        # Sample the discrete decision through a distribution object so its
        # log-probability is available to the policy gradient.
        move_prob = self.decision_policy(state)
        decision_dist = torch.distributions.Bernoulli(probs=move_prob)
        decision = decision_dist.sample()
        decision_log_prob = decision_dist.log_prob(decision).sum()

        if decision.item() > 0.5:
            # NOTE: the move distance is used deterministically (via .item()),
            # so movement_policy itself receives no gradient.
            move_dist = self.movement_policy(state).item()
            self.log_probs.append(decision_log_prob)
            new_x = state.item() + move_dist
            return "move", new_x

        shot_params = self.shooting_policy(state)
        shot_dist = torch.distributions.Normal(shot_params, torch.tensor([1.0, 0.1]))
        sampled_shot = shot_dist.sample()
        # Joint log-prob of "chose to shoot" and the sampled shot parameters.
        log_prob = decision_log_prob + shot_dist.log_prob(sampled_shot).sum()
        self.log_probs.append(log_prob)
        return "shoot", sampled_shot.detach().numpy()

    def update_policy(self):
        """Take one policy-gradient step from the buffered episode, then reset.

        With no usable (log-prob, reward) pair the buffers are cleared and no
        optimizer step is taken, instead of failing on a non-tensor loss.
        """
        steps = min(len(self.log_probs), len(self.rewards))
        if steps == 0:
            self.log_probs = []
            self.rewards = []
            return

        # Discounted returns, accumulated back-to-front in O(n).
        discounted = []
        running = 0.0
        for reward in reversed(self.rewards):
            running = reward + self.gamma * running
            discounted.append(running)
        discounted.reverse()
        returns = torch.tensor(discounted, dtype=torch.float32)

        # Pair log-probs and returns positionally; any surplus is ignored.
        loss = -(torch.stack(self.log_probs[:steps]) * returns[:steps]).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.log_probs = []
        self.rewards = []

def _segment_hits_box(x0, z0, x1, z1, cx, cz, half):
    """Return True if the segment (x0, z0)-(x1, z1) enters the open square of half-width ``half`` centred on (cx, cz)."""
    t_lo, t_hi = 0.0, 1.0
    for start, end, centre in ((x0, x1, cx), (z0, z1, cz)):
        delta = end - start
        if not np.isfinite(delta):
            # NaN/inf trajectories can never register a hit.
            return False
        if delta == 0:
            if not abs(start - centre) < half:
                return False
            continue
        t_a = (centre - half - start) / delta
        t_b = (centre + half - start) / delta
        t_lo = max(t_lo, min(t_a, t_b))
        t_hi = min(t_hi, max(t_a, t_b))
    return t_lo < t_hi


def simulate_shot(v, alpha, hoop_x=30.0, hoop_z=3.05, tol=0.1, dt=0.01):
    """Simulate a shot launched from x = 0, height 1.8 and report hit or miss.

    The ball is stepped forward under gravity (9.8) with a fixed time step.
    Each step is tested as a line segment against the square hit window
    around the hoop, so a shot that covers more than the window width in a
    single step is still detected.

    Args:
        v: Launch speed.
        alpha: Launch angle in radians above the horizontal.
        hoop_x: Horizontal position of the hoop.
        hoop_z: Height of the hoop.
        tol: Half-width of the square hit window around the hoop.
        dt: Integration time step.

    Returns:
        1.0 on a hit, -1.0 if the ball reaches the ground without a hit.
    """
    x, z = 0.0, 1.8  # Initial position
    vh = v * np.cos(alpha)
    vv = v * np.sin(alpha)
    while z > 0:
        prev_x, prev_z = x, z
        x += vh * dt
        z += vv * dt
        vv -= 9.8 * dt
        if _segment_hits_box(prev_x, prev_z, x, z, hoop_x, hoop_z, tol):
            return 1.0  # Success
    return -1.0  # Failure

def train(agent, episodes=500, max_steps=10):
    """Train ``agent`` on move-or-shoot episodes.

    Each episode starts at position 0 and keeps asking the agent for actions
    until it shoots or ``max_steps`` actions have been taken, so the position
    reached by a move is actually fed back to the policy. The policy is
    updated once per episode.

    Args:
        agent: Object exposing ``select_action``, ``update_policy`` and the
            ``rewards`` / ``log_probs`` buffers.
        episodes: Number of episodes to run.
        max_steps: Upper bound on actions per episode.
    """
    for _ in range(episodes):
        state = np.array([0])
        for _ in range(max_steps):
            action_type, action = agent.select_action(state)

            if action_type == "move":
                state = np.array([action])  # Update player position
                agent.rewards.append(-0.1)  # Slight penalty for movement
                continue

            agent.rewards.append(simulate_shot(action[0], action[1]))
            break  # A shot ends the episode

        if agent.log_probs:
            agent.update_policy()
        else:
            # Nothing to differentiate (no log-prob was recorded); drop the
            # rewards so they do not leak into the next episode.
            agent.rewards = []

# Entry point: build an agent with default hyperparameters and train it
# for the default number of episodes.
if __name__ == "__main__":
    agent = REINFORCE()
    train(agent)


# In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

def distance_to_hoop(x, y):
    """Compute how far the point (x, y) is from (30, 8) in a straight line."""
    offset_x, offset_y = x - 30, y - 8
    return np.sqrt(offset_x ** 2 + offset_y ** 2)

class DecisionNetwork(nn.Module):
    """Small MLP (1 -> 16 -> 1) whose sigmoid output lies in (0, 1).

    The agent in this cell treats the output as the probability of moving.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 16)
        self.fc2 = nn.Linear(16, 1)

    def forward(self, x):
        """Return the sigmoid-squashed output for state tensor ``x``."""
        activated = F.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(activated))

class MovementNetwork(nn.Module):
    """Small MLP (1 -> 16 -> 1) producing a move distance between 5 and 15."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 16)
        self.fc2 = nn.Linear(16, 1)

    def forward(self, x):
        """Return the move distance for state tensor ``x``."""
        activated = F.relu(self.fc1(x))
        fraction = torch.sigmoid(self.fc2(activated))
        # Map the (0, 1) fraction onto a move of between 5 and 15 units.
        return fraction * 10 + 5

class ShootingNetwork(nn.Module):
    """MLP (1 -> 16 -> 16 -> 2) producing a bounded (speed, angle) pair.

    Speed lies in (0, 30) and angle in (0, pi / 2).
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 2)  # Output: speed and angle
        # Per-output upper bounds. A non-persistent buffer is built once,
        # follows the module to other devices, and adds no state_dict key.
        self.register_buffer(
            "action_scale",
            torch.tensor([30.0, np.pi / 2]),
            persistent=False,
        )

    def forward(self, x):
        """Return the scaled (speed, angle) output for state tensor ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x)) * self.action_scale

class REINFORCE():
    """REINFORCE agent that first decides whether to move or shoot.

    Three networks share one optimizer: a decision network (probability of
    moving), a movement network (how far to move) and a shooting network
    (mean speed and angle of the shot). Every call to ``select_action``
    appends exactly one log-probability to ``log_probs`` and returns a
    3-tuple, whichever branch is taken.
    """

    def __init__(self, lr=0.01, gamma=0.99):
        """Build the three policy networks and a single Adam optimizer.

        Args:
            lr: Learning rate passed to Adam.
            gamma: Discount factor used when accumulating returns.
        """
        self.decision_policy = DecisionNetwork()
        self.movement_policy = MovementNetwork()
        self.shooting_policy = ShootingNetwork()
        self.optimizer = optim.Adam(
            list(self.decision_policy.parameters()) +
            list(self.movement_policy.parameters()) +
            list(self.shooting_policy.parameters()), lr=lr)
        self.gamma = gamma
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        """Sample a move-or-shoot action for the one-element ``state``.

        Returns:
            A 3-tuple ``(action_type, action, x_pos)``:
            ``("move", new_x, new_x)`` where ``new_x`` is the position after
            the move, or ``("shoot", shot, x)`` where ``shot`` is a NumPy
            array (speed, angle) and ``x`` is the position the shot is taken
            from.
        """
        state = torch.tensor(state, dtype=torch.float32)

        # Sample the discrete decision through a distribution object so its
        # log-probability is available to the policy gradient.
        move_prob = self.decision_policy(state)
        decision_dist = torch.distributions.Bernoulli(probs=move_prob)
        decision = decision_dist.sample()
        decision_log_prob = decision_dist.log_prob(decision).sum()

        if decision.item() > 0.5:
            # NOTE: the move distance is used deterministically (via .item()),
            # so movement_policy itself receives no gradient.
            move_dist = self.movement_policy(state).item()
            self.log_probs.append(decision_log_prob)
            new_x = state.item() + move_dist
            # Same arity as the "shoot" branch so callers can always unpack
            # three values.
            return "move", new_x, new_x

        shot_params = self.shooting_policy(state)
        shot_dist = torch.distributions.Normal(shot_params, torch.tensor([1.0, 0.1]))
        sampled_shot = shot_dist.sample()
        # Joint log-prob of "chose to shoot" and the sampled shot parameters.
        log_prob = decision_log_prob + shot_dist.log_prob(sampled_shot).sum()
        self.log_probs.append(log_prob)
        return "shoot", sampled_shot.detach().numpy(), state.item()

    def update_policy(self):
        """Take one policy-gradient step from the buffered episode, then reset.

        With no usable (log-prob, reward) pair the buffers are cleared and no
        optimizer step is taken, instead of failing on a non-tensor loss.
        """
        steps = min(len(self.log_probs), len(self.rewards))
        if steps == 0:
            self.log_probs = []
            self.rewards = []
            return

        # Discounted returns, accumulated back-to-front in O(n).
        discounted = []
        running = 0.0
        for reward in reversed(self.rewards):
            running = reward + self.gamma * running
            discounted.append(running)
        discounted.reverse()
        returns = torch.tensor(discounted, dtype=torch.float32)

        # Pair log-probs and returns positionally; any surplus is ignored.
        loss = -(torch.stack(self.log_probs[:steps]) * returns[:steps]).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.log_probs = []
        self.rewards = []

def _segment_hits_box(x0, z0, x1, z1, cx, cz, half):
    """Return True if the segment (x0, z0)-(x1, z1) enters the open square of half-width ``half`` centred on (cx, cz)."""
    t_lo, t_hi = 0.0, 1.0
    for start, end, centre in ((x0, x1, cx), (z0, z1, cz)):
        delta = end - start
        if not np.isfinite(delta):
            # NaN/inf trajectories can never register a hit.
            return False
        if delta == 0:
            if not abs(start - centre) < half:
                return False
            continue
        t_a = (centre - half - start) / delta
        t_b = (centre + half - start) / delta
        t_lo = max(t_lo, min(t_a, t_b))
        t_hi = min(t_hi, max(t_a, t_b))
    return t_lo < t_hi


def simulate_shot(x, v, alpha, hoop_x=30.0, hoop_z=3.05, tol=0.1, dt=0.01):
    """Simulate a shot launched from position ``x`` at height 1.8.

    The ball is stepped forward under gravity (9.8) with a fixed time step.
    Each step is tested as a line segment against the square hit window
    around the hoop, so a fast shot cannot jump over the window between two
    sampled positions.

    Args:
        x: Horizontal launch position.
        v: Launch speed.
        alpha: Launch angle in radians above the horizontal.
        hoop_x: Horizontal position of the hoop.
        hoop_z: Height of the hoop.
        tol: Half-width of the square hit window around the hoop.
        dt: Integration time step.

    Returns:
        1.0 on a hit, -1.0 if the ball reaches the ground without a hit.
    """
    z = 1.8  # Initial height
    vh = v * np.cos(alpha)
    vv = v * np.sin(alpha)
    while z > 0:
        prev_x, prev_z = x, z
        x += vh * dt
        z += vv * dt
        vv -= 9.8 * dt
        if _segment_hits_box(prev_x, prev_z, x, z, hoop_x, hoop_z, tol):
            return 1.0  # Success
    return -1.0  # Failure

def train(agent, episodes=500, max_steps=10):
    """Train ``agent`` on move-or-shoot episodes.

    Each episode starts at position 0 and keeps asking the agent for actions
    until it shoots or ``max_steps`` actions have been taken, so a shot is
    launched from the position reached by the preceding moves. The policy is
    updated once per episode.

    Args:
        agent: Object exposing ``select_action``, ``update_policy`` and the
            ``rewards`` / ``log_probs`` buffers.
        episodes: Number of episodes to run.
        max_steps: Upper bound on actions per episode.
    """
    for _ in range(episodes):
        x_pos = 0  # Initial player position
        state = np.array([x_pos])
        for _ in range(max_steps):
            # Index rather than unpack: tolerate agents whose "move" result
            # carries only (action_type, action).
            result = agent.select_action(state)
            action_type, action = result[0], result[1]

            if action_type == "move":
                x_pos = action
                state = np.array([x_pos])  # Update player position
                agent.rewards.append(-0.1)  # Slight penalty for movement
                continue

            # Prefer the position reported by the agent when it is provided.
            shot_x = result[2] if len(result) > 2 else x_pos
            agent.rewards.append(simulate_shot(shot_x, action[0], action[1]))
            break  # A shot ends the episode

        if agent.log_probs:
            agent.update_policy()
        else:
            # Nothing to differentiate (no log-prob was recorded); drop the
            # rewards so they do not leak into the next episode.
            agent.rewards = []

# Entry point: create an agent with default hyperparameters and run the
# default training loop.
if __name__ == "__main__":
    agent = REINFORCE()
    train(agent)
