In [1]:
import gym
import numpy as np
import random
import copy

In [37]:
class Network:
        
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        self.W = [np.random.randn(input_dim, hidden_dim)*((2/(input_dim+hidden_dim))**0.5)]
        self.b = [np.zeros(hidden_dim)]
        self.gradW = [np.zeros((input_dim, hidden_dim))]
        self.gradb = [np.zeros(hidden_dim)]
        for i in range(num_layers):
            self.W.append(np.random.randn(hidden_dim, hidden_dim)*((1/hidden_dim)**0.5))
            self.b.append(np.zeros(hidden_dim))
            self.gradW.append(np.zeros((hidden_dim, hidden_dim)))
            self.gradb.append(np.zeros(hidden_dim))
        self.W.append(np.random.randn(hidden_dim, output_dim)*((2/(hidden_dim+output_dim))**0.5))
        self.b.append(np.zeros(output_dim))
        self.gradW.append(np.zeros((hidden_dim, output_dim)))
        self.gradb.append(np.zeros(output_dim))
        for i in range(len(self.W)):
            self.W[i] = np.array(self.W[i] , dtype = np.longdouble)
        for i in range(len(self.b)):
            self.b[i] = np.array(self.b[i] , dtype= np.longdouble )
        self.activations = []

    def forward(self, x):
        self.activations = [x.copy()]
        for i in range(len(self.W)-1):
            x = np.maximum(x @ self.W[i] + self.b[i], 0)
            self.activations.append(x.copy())
        x = x @ self.W[-1] + self.b[-1]
        return x

    def backward(self, error):
        self.gradW[-1] = self.activations[-1].reshape(-1, 1) @ error.reshape(1, -1)
        self.gradb[-1] = error.copy().T
        for i in reversed(range(len(self.W)-1)):
            error = (error @ self.W[i+1].T)  * (self.activations[i+1]>0).astype(int)
            self.gradW[i] = self.activations[i].reshape(-1, 1) @ error.reshape(1, -1)
            self.gradb[i] = error.copy().T
        self.gradW = [ np.clip(w,  1,  -1)for w in self.gradW]
        self.gradb = [ np.clip(b, 1,  -1)for b in self.gradb]



    def update(self, lr):
        for i in range(len(self.W)):
            self.W[i] = self.W[i] - lr * self.gradW[i]
            self.b[i] = self.b[i] - lr * self.gradb[i]



    def __repr__(self):
        return "(" + ", ".join([str(x.shape[0]) for x in self.W]) + ", " + str(self.W[-1].shape[1]) + ")"

In [38]:
env = gym.make("CartPole-v1")
env.reset()

  deprecation(
  deprecation(


array([ 0.04297874,  0.04063813, -0.00089307,  0.04530634], dtype=float32)

In [39]:
def getTargetNet(net):
    netCopy = Network(1,1,1,1)
    netCopy.W = copy.deepcopy(net.W)
    netCopy.b = copy.deepcopy(net.b)
    netCopy.gradb = copy.deepcopy(net.gradb)
    netCopy.gradW = copy.deepcopy(net.gradW)
    netCopy.activations = net.activations
    return netCopy

In [40]:
def policy(x):
    r = random.random()
    global epsilon
    if r < epsilon:
        return random.choice([0,1])
    else:
        return np.argmax(x)


In [53]:
net = Network(4, 16, 2, 1) # (MLP structure: 4 -- 16 -- 16 -- 2)
targetNet = getTargetNet(net)
learning_rate = 0.0001
gamma = 0.99 # for the horizon
epsilon = 1.0 # decay it with 0.999 after each episode and fix it at 0.1
epsilonDecay = 0.999

In [132]:
epsilon = 1.0
while True:
    state = env.reset()
    replayBuffer = []
    for i in range(1000):
        
        x = net.forward(state)
        action = policy(x)
        replayBuffer.append((state,x, action, env.step(action)))
        observation, reward, terminated, info = replayBuffer[-1][-1]
        if terminated:
            env.reset()
    errors = []
    for frame in np.random.choice(range(len(replayBuffer)), 320, replace=False) :
        frame = replayBuffer[frame]
        state,x , action, step = frame
        observation, reward, terminated, info = step
        # x = targetNet.forward(state)
        # net.gradW = targetNet.gradW
        # net.gradb = targetNet.gradb
        x = net.forward(state)
        if terminated:
            target = reward
        else:
            target = reward +  gamma * np.max(targetNet.forward(observation))
        error = [-1 * (target - x[action]) if action == i else 0 for i in [0,1]]
        net.backward(np.array(error).reshape(-1))
        net.update(learning_rate)
    targetNet = getTargetNet(net)
    epsilon *= epsilonDecay

KeyboardInterrupt: 

In [112]:
def estimatePerformance():
    global epsilon    
    counters = []
    exp = epsilon
    epsilon = -1
    for i in range(100):
        counter = 0
        terminated = False
        observation = env.reset()
        while not terminated:
            action = policy(net.forward(observation))
            observation, reward, terminated, info = env.step(action)
            counter += 1
        counters.append(counter)

    summ =  0
    for i in range(100):
        summ += counters[i]
    print(summ/100)
    epsilon = exp

In [86]:
net.forward(env.reset())

array([2.06445999e+12, 2.06454362e+12])

In [148]:
estimatePerformance()

9.28


In [44]:
action = 1
x = [0.1,1.1]
np.array([x[i] if action == i else 0 for i in [0,1]]).reshape(1,-1)


array([[0. , 1.1]])

In [131]:
epsilon

2.3788125040940492e-05

In [144]:
replayBuffer = []
net = Network(4, 16, 2, 1)
targetNet = getTargetNet(net)
while True:
    state = env.reset()
    for i in range(1000):
        x = net.forward(state)
        action = policy(x)

        replayBuffer.append((state,x, action, env.step(action)))
        observation, reward, terminated, info = replayBuffer[-1][-1]
        if terminated:
            env.reset()
        if(len(replayBuffer) < 32):
            continue
        for frame in np.random.choice(range(len(replayBuffer)), 32, replace=False) :
            frame = replayBuffer[frame]
            state,x , action, step = frame
            observation, reward, terminated, info = step
            x = net.forward(state)
            if not terminated: 
                reward += gamma * np.max(targetNet.forward(observation))
            error =  (reward - x[action]) * -1
            error = [error if action == i else 0 for i in [0,1]]
            
            net.backward(np.array(error))
            net.update(learning_rate)
    
    targetNet = getTargetNet(net)
    if epsilon > 10 ** -1:
        epsilon *= epsilonDecay
    replayBuffer = []
            
        

KeyboardInterrupt: 