In [None]:
# Training loop (Replay Buffer, No Target Network)
#
# Hyperparameters for the DQN run below. Transitions are stored in a replay
# memory and sampled in mini-batches, but the bootstrap target is computed
# with the same network that is being trained (there is no separate target net).
GAMMA = 0.99           # Discount factor applied to the bootstrapped next-state value
EPS_START = 0.9        # Initial exploration rate, 1.0 means 100% exploration, 0.0 means 100% exploitation
EPS_END = 0.01         # Lower bound for the exploration rate
EPS_DECAY = 0.001      # Amount subtracted from epsilon after every episode (linear decay)
LEARNING_RATE = 3e-4   # Adam step size

# Reset the environment
env.reset(seed=seed)
input_dim = env.observation_space.shape[0]  # 6 * size^2
output_dim = env.action_space.n  # 12 (6 faces * 2 rotations)

# Reset the model weights.
# The network is moved to `device` because every state tensor fed to it in the
# training loop is created with `device=device`; leaving the model on the CPU
# would raise a device-mismatch error whenever `device` is an accelerator.
policy_net = DQN(input_dim, output_dim).to(device)  # Initialize with random weights and biases
optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)
loss_fn = nn.SmoothL1Loss()
epsilon = EPS_START

num_episodes = 2000  # Number of training episodes
max_steps = 10       # Maximum number of environment steps per episode

memory = ReplayMemory(10000)  # Initialize replay memory with a capacity of 10,000
BATCH_SIZE = 32  # Size of the batch for training

# Reward tracking: one total-reward entry is appended per episode
reward_list = []

for episode in range(num_episodes):
    # Reset the game state for each episode
    state, info = env.reset()
    state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0) # [1, input_dim]
    total_reward = 0

    for step in range(max_steps):
        # Select action (e-greedy)
        if random.random() < epsilon: # explore
            action_tensor = torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long) # [1, 1] tensor for action
        else: # exploit
            with torch.no_grad():
                q_values = policy_net(state_tensor) # [1, output_dim]
            action_tensor = torch.max(q_values, dim=1).indices.view(1, 1)  # Index of the action with the highest Q-value

        # Perform action in the environment
        observation, reward, terminated, truncated, _ = env.step(action_tensor.item())
        next_state_tensor = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        # Pin the reward dtype so that every stored reward concatenates cleanly and the
        # TD target stays float32 regardless of the numeric type the env returns.
        reward_tensor = torch.tensor([reward], dtype=torch.float32, device=device) # [1] tensor for reward
        # Named *_tensor (not *_batch) so it is not confused with the sampled batch below.
        terminated_tensor = torch.tensor([terminated], device=device, dtype=torch.bool) # [1] tensor for termination status
        done = terminated or truncated
        total_reward += reward

        # Store transition in replay memory
        memory.push(state_tensor, action_tensor, next_state_tensor, reward_tensor, terminated_tensor)

        # Move to the next state. This happens before the optimisation step so the
        # episode keeps progressing even while the replay memory is still warming up.
        state_tensor = next_state_tensor

        # Only optimise once enough transitions are available. During warm-up the
        # episode continues to collect experience instead of being cut short.
        if len(memory) >= BATCH_SIZE:
            # Sample a batch from memory
            transitions = memory.sample(BATCH_SIZE)
            # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
            # detailed explanation). This converts batch-array of Transitions
            # to Transition of batch-arrays.
            batch = Transition(*zip(*transitions))
            state_batch = torch.cat(batch.state)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)
            next_state_batch = torch.cat(batch.next_state)
            terminated_batch = torch.cat(batch.terminated)

            # Compute Q(s_t, a): the model computes Q(s_t) for every action, then we
            # pick the column of the action that was actually taken in each transition.
            state_action_values = policy_net(state_batch).gather(1, action_batch)

            # Calculate target Q-value
            with torch.no_grad():
                # NOTE: We aren't using a target network here, just the policy_net itself.
                # This leads to the 'chasing a moving target' problem.
                next_q = policy_net(next_state_batch)              # [BATCH, output_dim]
                next_state_values = next_q.max(1).values           # [BATCH]
            # Zero out the next state values where the episode has terminated
            next_state_values = next_state_values * (~terminated_batch).float()

            # Compute the expected Q values, shaped [BATCH, 1] to match state_action_values
            expected_state_action_values = (reward_batch + GAMMA * next_state_values).unsqueeze(1)

            # Calculate loss
            loss = loss_fn(state_action_values, expected_state_action_values) # input, target

            optimizer.zero_grad() # Clear gradients
            loss.backward() # Backpropagation, this computes gradients
            optimizer.step() # Update weights based on gradients

        if done:
            break

    # Decay epsilon (less exploration over time)
    epsilon = max(EPS_END, epsilon - EPS_DECAY)

    reward_list.append(total_reward)

    print(f"Episode {episode+1}, Total Reward: {total_reward:.1f}, Epsilon: {epsilon:.3f}")

In [None]:
# Plot per-episode total reward together with its moving average.
# `reward_list` receives one entry per episode in the training loop, so the
# x-axis is labelled in episodes.
import matplotlib.pyplot as plt
import numpy as np

window_size = 50  # adjust this as needed
reward_array = np.array(reward_list)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(reward_array, color='lightgray', label='Raw Reward')

# The moving average is only defined once at least `window_size` episodes exist.
# With fewer points np.convolve(mode='valid') either raises (empty input) or
# returns an array whose length does not match the x range, so skip it.
if len(reward_array) >= window_size:
    moving_avg = np.convolve(reward_array, np.ones(window_size) / window_size, mode='valid')
    # The first averaged value covers episodes 0..window_size-1, so plot it at index window_size-1.
    ax.plot(range(window_size - 1, len(reward_array)), moving_avg, color='blue', label=f'{window_size}-Episode Moving Avg')

ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.set_title('Reward per Episode with Moving Average')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()