In [1]:
import numpy as np
import gym
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
class Buffer:
    """Fixed-capacity FIFO replay buffer of flattened transitions.

    Every stored transition is one row of ``self.buffer`` laid out as
    ``[state (state_dim), action, reward, next_state (state_dim)]``, i.e.
    ``2 * state_dim + 2`` columns. When the transition is terminal
    (``done`` is true) the ``next_state`` columns are filled with NaN.

    Args:
        buffer_size: Maximum number of transitions retained; once exceeded,
            the oldest row is discarded.
        state_dim: Number of values in a state vector. Defaults to 4, the
            width previously hard-coded in this class.
    """

    def __init__(self, buffer_size, state_dim=4):

        self.buffer_size = buffer_size
        self.state_dim = state_dim
        self.row_width = 2 * state_dim + 2
        # Start with zero rows rather than an all-zero placeholder row, so
        # size() is 0 when nothing has been stored and a genuine all-zero
        # transition can never be mistaken for the placeholder.
        self.buffer = np.empty((0, self.row_width))

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def store(self, state, action, reward, next_state, done):
        """Append one transition, evicting the oldest row when over capacity.

        Args:
            state: Sequence of ``state_dim`` numbers.
            action: Scalar action identifier.
            reward: Scalar reward.
            next_state: Sequence of ``state_dim`` numbers; ignored (stored
                as NaN) when ``done`` is true.
            done: Whether the transition ended the episode.
        """
        d = self.state_dim
        row = np.empty((1, self.row_width))
        row[0, :d] = state
        row[0, d] = action
        row[0, d + 1] = reward
        # NaN marks "no successor state" for terminal transitions.
        row[0, d + 2:] = np.nan if done else next_state
        self.buffer = np.concatenate((self.buffer, row), axis=0)

        if self.size() > self.buffer_size:
            self.buffer = self.buffer[1:]

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored rows chosen at random.

        Uses the ``random`` module, so ``random.seed`` controls the draw.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored rows
                (raised by ``random.sample``).
        """
        # Sample row indices instead of materialising the whole buffer as a
        # Python list; fancy indexing returns a fresh array.
        chosen = random.sample(range(self.size()), batch_size)
        return self.buffer[chosen]

In [32]:
class NN(nn.Module):
    """Four-layer fully connected network with ReLU hidden activations.

    Subclasses ``nn.Module`` so that the linear layers are registered as
    sub-modules: ``parameters()``, ``state_dict()``, ``.to(device)`` and
    optimizers all require that registration.

    Args:
        inp_size: Number of input features per sample.
        out_size: Number of output values per sample.
    """

    def __init__(self, inp_size, out_size):

        super().__init__()
        self.l1 = nn.Linear(inp_size, 128)
        self.l2 = nn.Linear(128, 128)
        self.l3 = nn.Linear(128, 128)
        self.l4 = nn.Linear(128, out_size)

    def forward(self, x):
        """Map a ``(..., inp_size)`` float tensor to ``(..., out_size)``.

        The final layer is linear (no activation).
        """
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))
        x = self.l4(x)
        return x

class DQN:
    """Wraps an ``NN`` that scores (state, action) pairs.

    The network input is the state vector with the action value appended
    as the last feature, so ``inp_size`` must be ``len(state) + 1``.

    Args:
        inp_size: Input width passed to ``NN``.
        out_size: Output width passed to ``NN``.
    """

    def __init__(self, inp_size, out_size):
        self.device = torch.device("cpu")
        self.dqn = NN(inp_size, out_size)

    def predict(self, state, action_set):
        """Score every action for ``state`` and pick the best one.

        Args:
            state: 1-D sequence of state values.
            action_set: Non-empty sequence of candidate action values.

        Returns:
            Tuple ``(best_action, max_q)``: the entry of ``action_set``
            whose row produced the largest network output, and that
            output value.

        Raises:
            ValueError: If ``action_set`` is empty.
        """
        state = np.asarray(state, dtype=np.float64)
        actions = np.asarray(action_set)
        if actions.size == 0:
            raise ValueError("action_set must contain at least one action")

        # One row per candidate action: [state..., action]. Broadcasting the
        # state over all rows works for any number of actions (the previous
        # code only filled rows 0 and 1).
        inp = np.empty((len(actions), len(state) + 1))
        inp[:, :-1] = state
        inp[:, -1] = actions

        with torch.no_grad():
            batch = torch.as_tensor(inp, dtype=torch.float32, device=self.device)
            q_values = self.dqn.forward(batch).detach().cpu().numpy()

        # The first axis of q_values indexes the candidate actions; map the
        # argmax back to the action value itself rather than its position.
        best_row = np.unravel_index(np.argmax(q_values), q_values.shape)[0]
        return actions[best_row], np.max(q_values)


In [37]:
## training block
# Collects transitions from CartPole into the replay buffer by acting
# greedily with the (untrained) network. NOTE(review): no optimizer step or
# loss is computed in this cell; `lr` and `mini_batch` are assigned but not
# consumed here.
buffer_size = 100  # maximum number of transitions kept in the replay buffer
episode_time = 100  # upper bound on environment steps per episode
no_iters = 5  # number of episodes to run

# 5 inputs: DQN.predict appends one action value to the state vector
# (4 state values are assumed, matching the Buffer layout); 1 output per row.
dqn = DQN(5,1)
lr = 1e-4  # not used in this cell
batch_size = 16  # number of rows requested from buffer.sample

env = gym.make('CartPole-v0')
buffer = Buffer(buffer_size)

for e in range(no_iters):
    # This code expects reset() to return the observation directly and
    # step() to return a 4-tuple.
    state = env.reset()
    for t in range(episode_time):
        #env.render()
        # Greedy action over the two candidate actions; max_q is not used
        # further in this cell.
        action, max_q = dqn.predict(state, [0,1])
        next_state, reward, done, _ = env.step(action)
        buffer.store(state, action, reward, next_state, done)
        state = next_state
        
        # The terminal transition has already been stored above.
        if done:
            break
            
        # Skip sampling until the buffer holds at least buffer_size - 2 rows.
        if buffer.size() < buffer_size - 2:
            continue
        
        # Drawn but not used yet in this cell.
        mini_batch = buffer.sample(batch_size)
        
# Inspect stored rows 30..49: [state(4), action, reward, next_state(4)].
print(buffer.buffer[30:50])

[[ 1.43781653e-02  1.56588916e-01  3.05524168e-02 -3.21474253e-01
   1.00000000e+00  1.00000000e+00  1.75099436e-02  3.51262761e-01
   2.41229317e-02 -6.04367729e-01]
 [ 1.75099436e-02  3.51262761e-01  2.41229317e-02 -6.04367729e-01
   1.00000000e+00  1.00000000e+00  2.45351988e-02  5.46039192e-01
   1.20355771e-02 -8.89355918e-01]
 [ 2.45351988e-02  5.46039192e-01  1.20355771e-02 -8.89355918e-01
   1.00000000e+00  1.00000000e+00  3.54559827e-02  7.40995773e-01
  -5.75154124e-03 -1.17823123e+00]
 [ 3.54559827e-02  7.40995773e-01 -5.75154124e-03 -1.17823123e+00
   1.00000000e+00  1.00000000e+00  5.02758981e-02  9.36191946e-01
  -2.93161659e-02 -1.47271160e+00]
 [ 5.02758981e-02  9.36191946e-01 -2.93161659e-02 -1.47271160e+00
   1.00000000e+00  1.00000000e+00  6.89997371e-02  1.13165978e+00
  -5.87703978e-02 -1.77440508e+00]
 [ 6.89997371e-02  1.13165978e+00 -5.87703978e-02 -1.77440508e+00
   1.00000000e+00  1.00000000e+00  9.16329327e-02  1.32739281e+00
  -9.42584994e-02 -2.08476629e+00

nan