<a href="https://colab.research.google.com/github/nahianAl/DQN/blob/main/DQN_w_replay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gym
import torch
import random

from torch.autograd import Variable
from collections import deque


In [3]:
class DQN:
    """Small fully connected Q-network trained with experience replay.

    The network maps a state vector to one Q-value per action. It is fitted
    with a mean-squared-error loss against TD targets using the Adam
    optimizer.
    """

    def __init__(self, n_state, n_action, n_hidden, lr):
        """Build the network, the loss and the optimizer.

        Args:
            n_state: Number of input features (size of a state vector).
            n_action: Number of output units (one Q-value per action).
            n_hidden: Width of the single hidden layer.
            lr: Learning rate handed to ``torch.optim.Adam``.
        """
        self.criterion = torch.nn.MSELoss()

        self.model = torch.nn.Sequential(
            torch.nn.Linear(n_state, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_action),
        )

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def predict(self, s):
        """Return the Q-values the network currently assigns to ``s``.

        Args:
            s: A state (or batch of states) convertible by ``torch.Tensor``.

        Returns:
            A tensor of Q-values that is detached from the autograd graph.
        """
        # Predictions are only used to pick actions and to build TD targets;
        # neither needs gradients, so skip graph construction entirely.
        with torch.no_grad():
            return self.model(torch.Tensor(s))

    def update(self, s, y):
        """Run one gradient step moving the prediction for ``s`` towards ``y``.

        Args:
            s: A state (or batch of states) convertible by ``torch.Tensor``.
            y: Target Q-values with the same shape as the network output.
        """
        pred = self.model(torch.Tensor(s))

        loss = self.criterion(pred, torch.Tensor(y))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def replay(self, memory, replay_size, gamma):
        """Fit the network on a random mini-batch drawn from ``memory``.

        Nothing happens until ``memory`` holds at least ``replay_size``
        transitions.

        Args:
            memory: Sequence of ``(state, action, next_state, reward,
                is_done)`` tuples.
            replay_size: Number of transitions to sample for the update.
            gamma: Discount factor applied to the best next-state Q-value.
        """
        if len(memory) < replay_size:
            return

        states = []
        td_targets = []

        for state, action, next_state, reward, is_done in random.sample(
                memory, replay_size):
            states.append(state)

            # Start from the current prediction so that only the Q-value of
            # the action actually taken contributes to the loss.
            q_values = self.predict(state).tolist()

            if is_done:
                # Terminal transition: there is no future value to bootstrap.
                q_values[action] = reward
            else:
                q_values_next = self.predict(next_state)
                q_values[action] = (
                    reward + gamma * torch.max(q_values_next).item()
                )

            td_targets.append(q_values)

        self.update(states, td_targets)


In [None]:
def eps_greedy_policy(neural_net, eps, n_action):
    """Build an epsilon-greedy action selector around ``neural_net``.

    Args:
        neural_net: Object exposing ``predict(state)`` that returns a tensor
            of Q-values.
        eps: Threshold compared against a uniform draw from ``[0, 1)``; the
            greedy action is taken only when the draw exceeds it.
        n_action: Number of actions; random actions are drawn from
            ``0 .. n_action - 1``.

    Returns:
        A function mapping a state to an integer action index.
    """

    def policy(state):
        exploit = random.random() > eps
        if not exploit:
            # Exploration branch: uniform random action.
            return random.randint(0, n_action - 1)

        # Exploitation branch: action with the highest predicted Q-value.
        best_action = torch.argmax(neural_net.predict(state))
        return best_action.item()

    return policy
    
        

In [None]:
def replay(self, memory, replay_size, gamma):
    """Train ``self`` on a random mini-batch of stored transitions.

    ``self`` is any object exposing ``predict(state)`` (returning a tensor of
    Q-values) and ``update(states, targets)``. The call is a no-op while
    ``memory`` holds fewer than ``replay_size`` transitions.

    Args:
        self: The network wrapper to train.
        memory: Sequence of ``(state, action, next_state, reward, is_done)``
            tuples.
        replay_size: Number of transitions to sample.
        gamma: Discount factor applied to the best next-state Q-value.
    """
    if len(memory) < replay_size:
        return

    batch = random.sample(memory, replay_size)

    batch_states = []
    batch_targets = []

    for state, action, next_state, reward, is_done in batch:
        batch_states.append(state)

        # Begin with the current estimate and overwrite only the entry of
        # the action that was actually taken.
        target = self.predict(state).tolist()

        if is_done:
            target[action] = reward
        else:
            best_next = torch.max(self.predict(next_state)).item()
            target[action] = reward + gamma*best_next

        batch_targets.append(target)

    self.update(batch_states, batch_targets)

In [None]:
# Module-level experiment state shared by the cells below.

# Network hyper-parameters.
n_hidden = 50
lr = 0.001

# Environment and the dimensions derived from it.
env = gym.envs.make("MountainCar-v0")
n_state = env.observation_space.shape[0]
n_action = env.action_space.n

# Bounded replay buffer: the oldest transitions are dropped once 10000
# entries are stored.
memory = deque(maxlen=10000)

dqn = DQN(n_state=n_state, n_action=n_action, n_hidden=n_hidden, lr=lr)

In [1]:
def train_loop(env, neural_net, episodes, replay_size,
               gamma=1.0, eps=0.1, eps_decay=0.99):
    """Train ``neural_net`` on ``env`` with epsilon-greedy exploration.

    Every transition is appended to the module-level ``memory`` buffer, and
    the reward collected in each episode is accumulated into the module-level
    ``total_reward_episode`` list, which must have at least ``episodes``
    entries.

    Args:
        env: Environment using the classic Gym API, i.e. ``reset()`` returns
            the observation and ``step()`` returns a 4-tuple.
        neural_net: Object exposing ``predict(state)`` and
            ``replay(memory, replay_size, gamma)``.
        episodes: Number of episodes to run.
        replay_size: Mini-batch size forwarded to ``neural_net.replay``.
        gamma: Discount factor forwarded to ``neural_net.replay``.
        eps: Initial exploration rate.
        eps_decay: Multiplicative decay applied to ``eps`` after every
            episode; ``eps`` never drops below 0.01.
    """
    # Take the action count from the environment that was actually passed in
    # rather than from a module-level value that may describe another env.
    action_count = env.action_space.n

    for episode in range(episodes):
        policy = eps_greedy_policy(neural_net, eps, action_count)

        state = env.reset()

        while True:
            action = policy(state)

            next_state, reward, is_done, _ = env.step(action)

            # Accumulate the episode return; the original line was a bare
            # name expression that did nothing.
            total_reward_episode[episode] += reward

            memory.append((state, action, next_state, reward, is_done))

            if is_done:
                # No replay step is run on the terminal transition.
                break

            neural_net.replay(memory, replay_size, gamma)

            state = next_state

        eps = max(eps * eps_decay, 0.01)





In [None]:
# Number of training episodes; also used to size the per-episode reward list.
n_episode = 600
# Mini-batch size passed to train_loop as its replay_size argument.
replay_size = 20

In [None]:
# One zero-initialised slot per episode; train_loop refers to this
# module-level name (presumably to hold each episode's total reward).
total_reward_episode = [0] * n_episode
# Start training, overriding train_loop's default gamma (1.0) and eps (0.1).
train_loop(env, dqn, n_episode, replay_size, gamma = 0.9, eps = 0.3 )


