In [1]:
import numpy as np 
import gym 
import matplotlib.pyplot as plt

### Actor-critic training works. TODO: check the hand-derived gradients against numerical (finite-difference) gradients.

In [2]:
# Gym environment id used for the whole notebook.
env_name='CartPole-v1'
# env_name='CartPole-v0'   (alternative environment id, currently disabled)
# Environment instance shared by the training, evaluation and rendering cells.
env = gym.make(env_name)

In [3]:
# Size of the observation vector: first entry of the observation-space shape.
state_dim = env.observation_space.shape[0]
# Number of actions as reported by the action space (`.n`).
n_actions = env.action_space.n

In [4]:
class TwoNN():
    """Two-layer fully connected network (ReLU hidden layer) in plain NumPy.

    When ``n_output == 1`` the network acts as a regressor and returns the raw
    linear output; otherwise it returns row-wise softmax probabilities.

    Attributes:
        n_output: number of output units.
        model: dict of parameters with keys 'w1' (n_hidden x n_feature),
            'b1' (1 x n_hidden), 'w2' (n_output x n_hidden), 'b2' (1 x n_output).
        X, h, out: input, post-ReLU hidden activations and output of the most
            recent ``forward`` call, cached for hand-written backward passes.
    """
    def __init__(self, n_feature, n_hidden, n_output):
        """Create randomly initialised weights and zero biases.

        Args:
            n_feature: size of one input vector.
            n_hidden: number of hidden (ReLU) units.
            n_output: number of outputs; 1 selects the regression head.
        """
        print('simple two layer neural network')
        print(f'creating nn: #input:{n_feature} #hidden:{n_hidden} #output:{n_output}')
        self.n_output=n_output
        self.model={}
        # Gaussian weights scaled by 1/sqrt(rows of the matrix); biases start at zero.
        # NOTE(review): the original comment calls this "Xavier" initialization, but the
        # scale uses each layer's output count (n_hidden, n_output) rather than its
        # fan-in. Left unchanged so training behaviour stays the same -- confirm intent.
        self.model['w1']=np.random.randn(n_hidden, n_feature)/ np.sqrt(n_hidden)
        self.model['b1']= np.zeros((1,n_hidden))
        self.model['w2']=np.random.randn(n_output, n_hidden)/ np.sqrt(n_output)
        self.model['b2'] = np.zeros((1,n_output))

    def softmax(self, Z):
        """Row-wise softmax of a 2-D array of logits (one sample per row).

        Args:
            Z: array of shape (N, n_output).

        Returns:
            Array of the same shape whose rows sum to 1.
        """
        # Shift every row by its OWN maximum. Softmax is invariant to a per-row
        # shift, and this pins each row's largest exponent at exp(0) = 1, so a
        # row can neither overflow nor underflow to an all-zero row (0/0 = NaN),
        # which a single global maximum allows when rows differ a lot in scale.
        shifted = Z - np.max(Z, axis=1, keepdims=True)
        expz = np.exp(shifted)
        return expz / expz.sum(axis=1, keepdims=True)   # normalise across columns, per row

    def forward(self, X):
        """Run the network and cache the intermediate values.

        Args:
            X: input of shape (N, n_feature). A single 1-D vector of length
               n_feature also works: adding the (1, n_hidden) bias broadcasts
               the hidden layer to shape (1, n_hidden).

        Returns:
            (N, 1) raw values when ``n_output == 1``, otherwise (N, n_output)
            softmax probabilities. The same array is stored in ``self.out``.
        """
        self.X=X
        self.h=X  @ self.model['w1'].T  + self.model['b1']             # (N x D) @ (D x H)
        self.h[self.h<0]=0                                             # ReLU, in place on the fresh array
        z=self.h @ self.model['w2'].T + self.model['b2']               # (N x H) @ (H x O)
        if self.n_output==1:   # regression head: no softmax
            self.out=z
            return self.out

        self.out=self.softmax(z)
        return self.out

In [5]:
class MyAdam:
    """Adam optimizer operating in place on a dict of NumPy weight arrays."""

    def __init__(self, weights, learning_rate=0.001, beta1 = .9, beta2 = .999):
        """
        weights: mapping name -> ndarray; the arrays are modified in place by update().
        learning_rate: step size.
        beta1, beta2: decay rates of the first / second moment estimates.
        """
        self.weights = weights
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        # Running first (M) and second (V) moment estimates, one per weight array.
        self.M = {}
        self.V = {}
        for name, w in weights.items():
            self.M[name] = np.zeros_like(w)
            self.V[name] = np.zeros_like(w)
        self.eps = 1e-8  # keeps the denominator away from zero
        self.t = 0       # number of update() calls so far

    def update(self, grads):
        """Apply one Adam step for every key present in `grads`."""
        self.t += 1
        for name, g in grads.items():
            # Exponential moving averages of the gradient and its square.
            first = self.beta1 * self.M[name] + (1. - self.beta1) * g
            second = self.beta2 * self.V[name] + (1. - self.beta2) * g**2
            self.M[name] = first
            self.V[name] = second

            # Bias-corrected estimates (the averages start at zero).
            first_hat = first / (1. - self.beta1**self.t)
            second_hat = second / (1. - self.beta2**self.t)
            self.weights[name] -= self.learning_rate * first_hat / (np.sqrt(second_hat) + self.eps)

In [6]:
# Policy network: n_actions outputs, so forward() returns softmax action probabilities.
actor_np=TwoNN(state_dim, 128, n_actions) 
# Value network: a single output, so forward() returns the raw linear value estimate.
critic_np=TwoNN(state_dim, 128, 1) 

simple two layer neural network
creating nn: #input:4 #hidden:128 #output:2
simple two layer neural network
creating nn: #input:4 #hidden:128 #output:1


In [7]:
# Hyperparameters for the actor-critic run.
# (Earlier experiments assigned learning_rate = 1e-3 and 1e-5 here before
#  overriding it; only the final value below ever took effect.)
gamma = 0.99             # discount factor applied to the bootstrapped next-state value
learning_rate = 0.001    # step size passed to the optimizers
nepisode = 500           # training runs for at most nepisode + 1 episodes
lr = learning_rate       # short alias for learning_rate

In [8]:
# One Adam optimizer per network; each holds a reference to its network's
# parameter dict and updates those arrays in place.
adam_ac=MyAdam(weights=actor_np.model, learning_rate=learning_rate)
adam_cr=MyAdam(weights=critic_np.model, learning_rate=learning_rate)

In [9]:
# plt.plot(episode_rewards)

In [10]:
def grads_manual_np(tP, y_train, th, X, W2, adv, N=1):
    """Hand-derived gradients of the advantage-weighted policy loss for the actor.

    Back-propagates ``-adv * sum(y_train * log(tP))`` (softmax cross-entropy
    scaled by the advantage) through a two-layer ReLU network.

    Args:
        tP: (B, n_actions) softmax probabilities produced by the actor.
        y_train: (B, n_actions) one-hot encoding of the actions taken.
        th: (B, n_hidden) post-ReLU hidden activations of the actor.
        X: the state(s) fed to the actor; a single 1-D state or a (B, n_feature)
            batch. It is reshaped to one row per row of ``th``.
        W2: (n_actions, n_hidden) output-layer weights of the actor.
        adv: advantage, broadcastable against ``tP`` (e.g. shape (B, 1)).
        N: divisor applied to the output-layer error (1 = plain sum).

    Returns:
        dict with keys 'w1', 'b1', 'w2', 'b2' holding gradients shaped like
        the corresponding actor parameters.
    """
    # Error at the logits: softmax + cross-entropy gives (p - y); scale by the advantage.
    dz2 = (tP - y_train) * adv / N
    dw2 = dz2.T @ th
    db2 = np.sum(dz2, axis=0, keepdims=True)

    # Back through the output weights, then through the ReLU.
    dh = dz2 @ W2
    # `th` is post-ReLU, so inactive units are exactly 0 and never negative:
    # the mask must be '<=' -- a strict '<' would never block any unit.
    dh[th <= 0] = 0

    # One input row per error row, so batches work as well as a single 1-D state.
    inputs = np.reshape(X, (dh.shape[0], -1))
    dw1 = dh.T @ inputs
    db1 = np.sum(dh, axis=0, keepdims=True)
    return {'w1': dw1, 'b1': db1, 'w2': dw2, 'b2': db2}

In [11]:
def grads_manual_critic_np(yhat, y, th, th_next, state, next_state, w2, done, N=1, discount=None):
    """Hand-derived gradients of the squared TD error for the critic.

    The loss is ``0.5 * (yhat - y)**2`` with ``y = V(state)`` and
    ``yhat = reward + (1 - done) * discount * V(next_state)``. The gradient is
    taken through BOTH value estimates (the bootstrapped target is not treated
    as a constant), which is why every term has a next-state part and a
    current-state part.

    Args:
        yhat: (B, 1) TD target.
        y: (B, 1) critic output for `state`.
        th: (B, n_hidden) post-ReLU hidden activations for `state`.
        th_next: (B, n_hidden) post-ReLU hidden activations for `next_state`.
        state, next_state: a single 1-D state or a (B, n_feature) batch; each
            is reshaped to one row per row of `th`.
        w2: (1, n_hidden) output-layer weights of the critic.
        done: terminal flag (bool / 0-1 scalar, or a (B, 1) array); a terminal
            transition removes the next-state contribution.
        N: divisor applied to the TD error (1 = plain sum).
        discount: discount factor. When None (the default) the module-level
            `gamma` is used, which is what existing callers rely on.

    Returns:
        dict with keys 'w1', 'b1', 'w2', 'b2' holding gradients shaped like
        the corresponding critic parameters.
    """
    if discount is None:
        discount = gamma   # notebook-level discount factor
    # Weight of the next-state branch: the discount, or 0 on terminal transitions.
    bootstrap = (1 - done) * discount

    dz2 = (yhat - y) / N                       # TD error
    # d(yhat - y)/d(w2) = bootstrap * h(next_state) - h(state)
    cth = bootstrap * th_next - th
    dw2 = dz2.T @ cth
    # b2 enters yhat with weight `bootstrap` and y with weight 1.
    db2 = np.sum(dz2 * (bootstrap - 1), axis=0, keepdims=True)

    dh = dz2 @ w2

    # The same upstream error passes through two different ReLU masks, one per
    # forward pass. Activations are post-ReLU (never negative), so the mask
    # must be '<=' -- a strict '<' would never block any unit.
    dz1n = dh.copy()
    dz1n[th_next <= 0] = 0
    dz1 = dh.copy()
    dz1[th <= 0] = 0

    rows = dh.shape[0]
    state_rows = np.reshape(state, (rows, -1))
    next_rows = np.reshape(next_state, (rows, -1))

    dw1 = (bootstrap * dz1n).T @ next_rows - dz1.T @ state_rows
    db1 = np.sum(bootstrap * dz1n - dz1, axis=0, keepdims=True)

    return {'w1': dw1, 'b1': db1, 'w2': dw2, 'b2': db2}

In [12]:
# One-step (online) actor-critic training: after every environment step the
# critic takes a gradient step on the squared TD error and the actor takes a
# policy-gradient step weighted by the TD error (used as the advantage).
episode_rewards = []

for i in range(nepisode+1):
    done = False
    total_reward = 0
    state = env.reset()

    while not done:
        # --- act: sample an action from the current policy ---
        probs_np=actor_np.forward(state)
        ath_np=actor_np.h.copy()          # actor hidden activations for `state`
        # Use n_actions rather than a hard-coded [0, 1] so the loop is not tied
        # to two-action environments.
        a= np.random.choice(n_actions, p=probs_np[0])
        next_state, reward, done, info = env.step(a)
        total_reward += reward

        # --- critic: value estimates and TD target ---
        # Hidden activations are snapshotted after each forward pass because
        # the next forward() call replaces critic_np.h.
        y_next=critic_np.forward(next_state.reshape(1,-1) )
        th_next=critic_np.h.copy()
        y=critic_np.forward(state.reshape(1,-1))
        th=critic_np.h.copy()
        yhat=reward + (1-done)*gamma*y_next   # no bootstrapping past a terminal step

        grads_critic_np=grads_manual_critic_np(yhat, y, th, th_next, state, next_state, critic_np.model['w2'], done, N=1)
        adam_cr.update(grads_critic_np)

        # --- actor: policy-gradient step weighted by the TD error ---
        # `yhat` and `y` were computed before the critic update above.
        advantage = yhat - y
        yt=np.eye(n_actions)[a].reshape(1,-1)   # one-hot of the action taken
        grads_actor_np=grads_manual_np(probs_np, yt, ath_np, state, actor_np.model['w2'], advantage , N=1)
        adam_ac.update(grads_actor_np)

        state = next_state

    episode_rewards.append(total_reward)
    # Every 20 episodes: report progress and stop once the mean return over
    # (up to) the last 20 episodes reaches the environment's reward threshold.
    if i%20==0:
        avg20=np.mean(episode_rewards[-20:])
        print(f'episode:{i} score:{total_reward} last_20:{avg20}')
        if avg20>=env.spec.reward_threshold:
            print(f'-------solved in {i} steps-------')
            break

episode:0 score:25.0 last_20:25.0
episode:20 score:75.0 last_20:36.5
episode:40 score:37.0 last_20:51.25
episode:60 score:19.0 last_20:59.45
episode:80 score:24.0 last_20:28.8
episode:100 score:45.0 last_20:79.9
episode:120 score:14.0 last_20:94.5
episode:140 score:307.0 last_20:151.4
episode:160 score:500.0 last_20:389.0
episode:180 score:487.0 last_20:396.45
episode:200 score:14.0 last_20:452.2
episode:220 score:500.0 last_20:257.75
episode:240 score:134.0 last_20:415.0
episode:260 score:500.0 last_20:460.0
episode:280 score:218.0 last_20:393.8
episode:300 score:17.0 last_20:96.3
episode:320 score:500.0 last_20:144.75
episode:340 score:220.0 last_20:219.7
episode:360 score:271.0 last_20:234.85
episode:380 score:120.0 last_20:164.5
episode:400 score:103.0 last_20:106.55
episode:420 score:105.0 last_20:65.05
episode:440 score:479.0 last_20:354.05
episode:460 score:500.0 last_20:500.0
-------solved in 460 steps-------


In [13]:
def evaluate_model_np(env, actor, num_ep=20): 
    """
    Play `num_ep` greedy episodes (argmax action, at most 2000 steps each)
    in `env` with `actor` and return the mean episode reward.
    """
    def _greedy_episode():
        # Roll out a single episode, always picking the most probable action.
        obs = env.reset()
        episode_return = 0
        steps_left = 2000
        while steps_left > 0:
            steps_left -= 1
            action_probs = actor.forward(obs.reshape(1, -1))
            best_action = np.argmax(action_probs[0])
            obs, step_reward, finished, _ = env.step(best_action)
            episode_return += step_reward
            if finished:
                break
        return episode_return

    returns = [_greedy_episode() for _ in range(num_ep)]
    return np.array(returns).mean()

In [14]:
# Greedy evaluation of the trained actor, averaged over the default 20 episodes.
avg_r=evaluate_model_np(env, actor_np)
print('avg_r',avg_r)

avg_r 500.0


In [15]:
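# Note from an earlier run: -------solved in 220 steps------- (the number printed is the episode index; no random seed is set, so it varies between runs)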

In [17]:
# Render one greedy episode (argmax action, at most 2000 steps) with the trained actor.
state = env.reset() 
ep_reward=0
try:
    for tt in range(2000):
        env.render()
        probs = actor_np.forward(state.reshape(1,-1))  
        action=np.argmax(probs[0]) 
        state, reward, done, _ = env.step(action)
        ep_reward+=reward
        if done:
            break 
finally:
    # Release the render window even if the rollout raises or is interrupted.
    env.close()
print('total reward:', ep_reward)

total reward: 500.0
