In [35]:
from torch import nn 
from torch.functional import F
import torch
import numpy as np
device='cuda'

    
class DQN(nn.Module):

    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

        self.optimizer=torch.optim.Adam(self.parameters(), lr=0.001)
        self.loss=nn.MSELoss()
        device='cuda'
        self.to(device)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

class Agent():
    def __init__(self, env, lr, gamma, eps_start, eps_end, eps_dec):
        self.lr=lr
        self.gamma=gamma
        #self.n_states=n_states
        #self.n_actions=n_actions
        self.epsilon=eps_start
        self.eps_min=eps_end
        self.eps_dec=eps_dec
        self.env=env
        self.num_states = self.env.observation_space.n
        self.num_actions = self.env.action_space.n

        self.Q=DQN(1, self.num_actions)


    def choose_action(self, state):
        if np.random.random() > self.epsilon:
            state = torch.tensor([state], dtype=torch.float).to(device)
            actions=self.Q.forward(state)
            action=torch.argmax(actions).item()
        else:
            action=torch.tensor(np.random.choice(
                [i for i in range(self.num_actions) ])).item()
        return action
    
    def decrement_epsilon(self):
        self.epsilon=self.epsilon-self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def learn(self, state, action, reward, state_):
        self.Q.optimizer.zero_grad()
        states = torch.tensor([state], dtype=torch.float).to(device)
        actions = torch.tensor([action]).to(device)
        rewards = torch.tensor([reward]).to(device)
        states_ = torch.tensor([state_], dtype=torch.float).to(device)

        q_pred = self.Q.forward(states)[actions]
        q_next = self.Q.forward(states_).max()
        q_target = rewards + self.gamma*q_next

        loss = self.Q.loss(q_target, q_pred).to(device)
        loss.backward()
        self.Q.optimizer.step()
        self.decrement_epsilon()
    
#DQN(4, 2)#(torch.rand(1, 4).to('cuda'))

#agent=Agent(0.001, 0.99, 4, 2, 1.0, 0.01, 0.001)
#agent.choose_action([1, 2, 3, 4])

In [38]:
import gymnasium as gym

# Create the environment
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False)

agent=Agent(env, 0.001, 0.99, 1.0, 0.01, 0.001)

n_episodes = 2500
# Training loop
win_pct_list=[]
scores= []
for i in range(n_episodes):
    state, info = agent.env.reset()  # Reset the environment
    done = False
    score=0
    while not done:
        action = agent.choose_action(state) # Choose action based on epsilon-greedy policy
        next_state, reward, done, truncated, info = agent.env.step(action)  # Take the action
        #print('boom',next_state)
        agent.learn(state, action, reward, next_state)  # Update Q-table
        state = next_state  # Move to the next state
        score+=reward
    scores.append(score)
    if i%100==0:
        avg_score=np.mean(scores[-100:])
        win_pct_list.append(avg_score)
        print('episode', i, 'win pct %.2f' % avg_score, 'epsilon %.2f' % agent.epsilon)

import matplotlib.pyplot as plt
plt.plot(win_pct_list)
plt.show()


episode 0 win pct 0.00 epsilon 1.00
episode 100 win pct 0.01 epsilon 0.01
