In [7]:
import torch as T # for the neural network
import torch.nn as nn # for the neural network
import torch.nn.functional as F # for the activation functions
import torch.optim as optim # for the optimizer

import numpy as np # for the replay buffer

import struct # for converting bytes to floats
import socket # for connecting to the server
import json # for parsing the server's response
import random # for generating random actions

In [8]:
TCP_IP = "127.0.0.1"
TCP_PORT = 9876

# start a server
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) # allow the port to be reused immediately after the server is killed
sock.bind((TCP_IP, TCP_PORT))

# listen for incoming connections
sock.listen(1)

# keep trying until a connection is established
print("Waiting for connection...")

# setup with godot
conn, addr = sock.accept()
print("Connection established with: ", addr)

# this is the outline of the main observation-action-reward loop
# sends random data as a test
doTest = False
while doTest:
    
    # finally we wait for the reward
    reward = conn.recv(32)
    reward = struct.unpack('f', reward)[0]
    print("Received reward: ", reward)
    
    # send a ready message to signal the python script is ready
    conn.send("ready".encode())

    # first we wait for godot to send an observation
    observation = conn.recv(4096)
    observation = json.loads(observation.decode())
    print("Received observation: ", observation)

    # then we send an action
    action = random.choice([0, 1, 2])
    print("Sending action: ", action)
    conn.send(action.to_bytes(1, byteorder='big'))



Waiting for connection...
Connection established with:  ('127.0.0.1', 53180)


In [None]:
# pytorch setup
device = T.device("cuda" if T.cuda.is_available() else "cpu")

# model definition
class DQN(nn.Module):
    
    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(n_observations, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, n_actions)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# now we use the observation, action, and reward to train the neural network
# we will use the DQN algorithm to train the neural network

num_iterations = 999_999_999

# model parameters
n_observations = 49
n_actions = 3

# hyperparameters
lr = 1e-3

model = DQN(n_observations, n_actions).to(device)
target_model = DQN(n_observations, n_actions).to(device)
target_model.load_state_dict(model.state_dict())
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss()

gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01

# flush godot as it sends state and reward first, we can discard this
_discard = conn.recv(32)
_discard = conn.recv(4096)

for iteration in range(num_iterations):

    # send ready
    conn.send("ready".encode())

    # simulate the current state as a random tensor
    # state = T.randn((1, n_observations), device=device)
    
    # get state from godot
    data = conn.recv(4096)
    state_json = json.loads(data.decode())
    state_vals = list(state_json.values())
    state = T.tensor([state_vals], dtype=T.float32, device=device)

    # epsilon-greedy action selection
    if random.random() < epsilon:
        action = random.randint(0, n_actions - 1)
    else:
        with T.no_grad():
            q_vals = model(state)
            action = int(T.argmax(q_vals, dim=1))
    
    # send action to godot
    conn.send(action.to_bytes(1, byteorder='big'))

    # simulate next state, reward
    # next_state = T.randn((1, n_observations), device=device)
    
    # get next state from godot

    # get reward from godot
    reward = conn.recv(32)
    reward = struct.unpack('f', reward)[0]
    
    # get next state from godot
    data = conn.recv(4096)
    state_json = json.loads(data.decode())
    state_vals = list(state_json.values())
    next_state = T.tensor([state_vals], dtype=T.float32, device=device)

    # current Q value
    q_value = model(state)[0, action]
    
    # target Q value computation
    with T.no_grad():
        next_q_vals = target_model(next_state)
        max_next_q = T.max(next_q_vals)
        target_q = reward + gamma * max_next_q
    
    loss = criterion(q_value, target_q)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay
    
    # periodically update the target network
    if iteration % 100 == 0:
        target_model.load_state_dict(model.state_dict())
        print(f"Iteration {iteration}: Loss = {loss.item():.4f}, Epsilon = {epsilon:.4f}")

Iteration 0: Loss = 0.5846, Epsilon = 0.9950
Iteration 100: Loss = 0.0309, Epsilon = 0.6027
Iteration 200: Loss = 0.0001, Epsilon = 0.3651
Iteration 300: Loss = 0.1850, Epsilon = 0.2212
Iteration 400: Loss = 0.0009, Epsilon = 0.1340
Iteration 500: Loss = 0.0277, Epsilon = 0.0812
Iteration 600: Loss = 0.0012, Epsilon = 0.0492
Iteration 700: Loss = 0.0192, Epsilon = 0.0298
Iteration 800: Loss = 0.0137, Epsilon = 0.0180
Iteration 900: Loss = 0.0000, Epsilon = 0.0109
Iteration 1000: Loss = 0.0000, Epsilon = 0.0100
Iteration 1100: Loss = 0.0000, Epsilon = 0.0100
Iteration 1200: Loss = 0.0000, Epsilon = 0.0100
Iteration 1300: Loss = 0.0050, Epsilon = 0.0100
Iteration 1400: Loss = 0.3167, Epsilon = 0.0100
Iteration 1500: Loss = 0.0314, Epsilon = 0.0100
Iteration 1600: Loss = 0.0394, Epsilon = 0.0100
Iteration 1700: Loss = 0.0006, Epsilon = 0.0100
Iteration 1800: Loss = 0.0003, Epsilon = 0.0100
Iteration 1900: Loss = 0.0000, Epsilon = 0.0100


ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host