# In [8]:
import torch.nn as nn 
import torch
import random
import numpy as np
import torch.optim as optim
import matplotlib.pyplot as plt
from collections import deque
import gymnasium as gym

class RNNQNetwork(nn.Module):
    """Single-step recurrent Q-network built from three linear layers.

    Each call to ``forward`` consumes one time step: the new hidden state is
    ``tanh(i2h(x) + h2h(hidden))`` and the Q-values are ``h2o(new_hidden)``.

    Args:
        input_dim: Width of the input feature vector.
        hidden_dim: Width of the recurrent hidden state.
        output_dim: Number of Q-values produced per input row.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.i2h = nn.Linear(input_dim, hidden_dim)
        self.h2h = nn.Linear(hidden_dim, hidden_dim)
        self.h2o = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, hidden):
        """Advance the recurrence by one step.

        Args:
            x: Tensor or array-like of shape ``(input_dim,)`` or
                ``(batch, input_dim)``.
            hidden: Tensor of shape ``(batch, hidden_dim)`` (a batch of 1
                broadcasts against a larger input batch).

        Returns:
            ``(q_values, new_hidden)``.
        """
        # as_tensor does not re-wrap an existing tensor (torch.tensor() would
        # copy it and warn), and the input is placed on the same device as
        # the weights so array inputs also work when the module is on CUDA.
        x = torch.as_tensor(x, dtype=torch.float32, device=self.i2h.weight.device)
        # Only add the batch axis when it is missing; an already batched
        # input must stay 2-D so callers can index the output per row.
        if x.dim() < 2:
            x = x.unsqueeze(0)
        hidden = torch.tanh(self.i2h(x) + self.h2h(hidden))
        output = self.h2o(hidden)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape ``(batch_size, hidden_dim)``."""
        return torch.zeros(batch_size, self.hidden_dim, device=self.i2h.weight.device)


class DQNAgent:
    """Epsilon-greedy DQN agent with a recurrent Q-network and replay buffer.

    The network input width is ``env.observation_space.n``, so the agent
    targets discrete observation spaces; integer observations are one-hot
    encoded internally, and already encoded vectors are accepted unchanged.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and ``action_space.sample()``.
        hidden_dim: Width of the recurrent hidden state.
        learning_rate: Adam learning rate.
        discount_factor: Discount applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability.
        epsilon_min: Lower bound for ``epsilon``.
        epsilon_decay: Multiplicative decay applied after every update.
        buffer_size: Maximum number of stored transitions.
        batch_size: Number of transitions sampled per update.
    """

    def __init__(self, env, hidden_dim=64, learning_rate=0.001, discount_factor=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995, buffer_size=10000, batch_size=64):
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.n_states = int(env.observation_space.n)

        self.memory = deque(maxlen=buffer_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # One input unit per discrete state (one-hot encoding).
        self.q_network = RNNQNetwork(self.n_states, hidden_dim, env.action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.criterion = nn.MSELoss()

        self.reset_hidden()

    def _encode_state(self, state):
        """Return ``state`` as a float32 feature vector.

        A scalar is treated as a discrete state index and one-hot encoded;
        anything with at least one axis is assumed to be encoded already.
        """
        arr = np.asarray(state)
        if arr.ndim == 0:
            one_hot = np.zeros(self.n_states, dtype=np.float32)
            one_hot[int(arr)] = 1.0
            return one_hot
        return arr.astype(np.float32)

    def choose_action(self, state):
        """Advance the acting hidden state and pick an epsilon-greedy action."""
        # Acting never needs gradients. Without no_grad the stored hidden
        # state would keep every past step's graph alive and break the next
        # backward pass.
        with torch.no_grad():
            q_values, self.hidden = self.q_network(self._encode_state(state), self.hidden)
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()  # Explore
        return torch.argmax(q_values).item()  # Exploit

    def store_transition(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def learn(self):
        """Run one TD update on a random minibatch, then decay epsilon.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states_tensor = torch.as_tensor(
            np.stack([self._encode_state(s) for s in states]), device=self.device
        )
        next_states_tensor = torch.as_tensor(
            np.stack([self._encode_state(s) for s in next_states]), device=self.device
        )
        actions_tensor = torch.as_tensor(
            np.asarray(actions, dtype=np.int64), device=self.device
        ).unsqueeze(-1)
        rewards_tensor = torch.as_tensor(
            np.asarray(rewards, dtype=np.float32), device=self.device
        ).unsqueeze(-1)
        dones_tensor = torch.as_tensor(
            np.asarray(dones, dtype=np.float32), device=self.device
        ).unsqueeze(-1)

        # Sampled transitions come from unrelated time steps, so they start
        # from a fresh zero hidden state. It is local so the acting hidden
        # state in self.hidden is left untouched.
        hidden = self.q_network.init_hidden(self.batch_size).to(self.device)

        current_q_values, _ = self.q_network(states_tensor, hidden)
        current_q_values = current_q_values.gather(1, actions_tensor)

        # The bootstrap target is treated as a constant.
        with torch.no_grad():
            next_q_values, _ = self.q_network(next_states_tensor, hidden)  # Same hidden state for next states
            next_q_values = next_q_values.max(1)[0].unsqueeze(-1)
            target_q_values = rewards_tensor + self.discount_factor * next_q_values * (1 - dones_tensor)

        loss = self.criterion(current_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def reset_hidden(self):
        """Reset the acting hidden state to zeros (call at the start of an episode)."""
        self.hidden = self.q_network.init_hidden(1).to(self.device)


import numpy as np
import torch

def preprocess_state(state, rnn_q_network, device):
    """
    Run a single state through the recurrent Q-network, starting from a zero
    hidden state, and return the resulting hidden state.

    Parameters:
    - state: Feature vector for the current state (array-like or tensor).
      NOTE(review): presumably its length must equal the network's input
      width (e.g. a one-hot vector for a discrete observation) - confirm
      against the network passed in.
    - rnn_q_network: Model exposing ``init_hidden(batch_size)`` and a
      ``(x, hidden) -> (output, hidden)`` call
    - device: Device (cpu or cuda) on which to run the computation

    Returns:
    - hidden_state: NumPy array holding the new hidden state with the
      leading batch axis removed
    """
    # The network adds the batch axis itself, so the state is passed without
    # one. Adding a second axis here left a stray leading dimension of size 1
    # on the returned hidden state.
    state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device)

    # Fresh zero hidden state for a batch of one
    hidden = rnn_q_network.init_hidden(1).to(device)

    # Inference only: no gradients are needed for the hidden state
    with torch.no_grad():
        _, hidden_state = rnn_q_network(state_tensor, hidden)

    return hidden_state.squeeze(0).cpu().numpy()

from environment import DelaySampleToMatchEnv

# Alternative custom environment:
# env = DelaySampleToMatchEnv(5)
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False)
# Instantiate the agent
print(env.observation_space.shape)
agent = DQNAgent(env)

n_states = env.observation_space.n
n_episodes = 2500
win_pct_list = []
scores = []


def one_hot(state_index):
    """Encode a discrete observation as a float32 one-hot vector of length ``n_states``."""
    encoded = np.zeros(n_states, dtype=np.float32)
    encoded[state_index] = 1.0
    return encoded


for i in range(n_episodes):
    # Gymnasium's reset() returns (observation, info). Using the tuple as the
    # state is what raised "TypeError: must be real number, not dict".
    observation, _ = env.reset()
    # The agent's network has one input per discrete state, so observations
    # are one-hot encoded before being handed to the agent.
    state = one_hot(observation)
    done = False
    score = 0
    agent.reset_hidden()  # Reset hidden state at the beginning of each episode
    while not done:
        # The agent keeps its own recurrent hidden state, so the encoded
        # observation is passed directly rather than a hidden vector.
        action = agent.choose_action(state)  # Epsilon-greedy action
        observation, reward, terminated, truncated, _ = env.step(action)  # Take the action
        next_state = one_hot(observation)
        # Only a true terminal state should stop bootstrapping, so the stored
        # flag is `terminated`; a time-limit truncation just ends the episode.
        agent.store_transition(state, action, reward, next_state, terminated)
        agent.learn()  # Update Q-network
        state = next_state  # Move to the next state
        score += reward
        done = terminated or truncated
    scores.append(score)
    if i % 100 == 0:
        avg_score = np.mean(scores[-100:])
        win_pct_list.append(avg_score)
        print('episode', i, 'win pct %.2f' % avg_score, 'epsilon %.2f' % agent.epsilon)

# Plotting the win percentage over episodes
plt.plot(win_pct_list)
plt.xlabel('Episodes (x100)')
plt.ylabel('Win Percentage')
plt.title('Win Percentage over Time')
plt.show()




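# Recorded cell output:
# ()
#
# TypeError: must be real number, not dict
# (raised by torch.tensor() when it is handed the (observation, info) tuple that Gymnasium's env.reset() returns)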