In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import gym
from torch import nn
import matplotlib.pyplot as plt
import math

## Challenge solved in 3 weeks.

In [2]:
# Environment used by every run in this notebook.
# Alternative variant: env_name = 'CartPole-v1'
env_name = 'CartPole-v0'
env = gym.make(env_name)

# Length of the observation vector and number of discrete actions; these set
# the input width of both networks and the output width of the actor.
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

In [3]:
# Seed NumPy, the gym environment and torch with one shared value.
seed=4
np.random.seed(seed)
env.seed(seed)
torch.manual_seed(seed)

# N: batch size read as a global by NN_Open_Torch.grads_manual
# D: number of input features, H: hidden units, C: number of output classes
N,D=1, 4
H= 128     
C=2 

# Saved starting weights for the actor; the random draws are scaled by 0.04.
# They are cloned into a fresh actor at the start of each run.
sw1 = 0.04*torch.randn(H , D,  dtype=torch.float) 
sb1 = torch.zeros((1,H))
sw2 = 0.04*torch.randn(C, H, dtype=torch.float) 
sb2 = torch.zeros((1,C))

# Saved starting weights for the critic (unscaled draws, single output unit).
sw1c = torch.randn(H , D,  dtype=torch.float)   # critic layer-1 weights
sb1c = torch.zeros((1,H))                           # critic layer-1 bias
sw2c = torch.randn(1, H, dtype=torch.float)    # critic layer-2 weights
sb2c = torch.zeros((1,1))                           # critic layer-2 bias


print('input features: ',D ,' output class: ',C)

input features:  4  output class:  2


In [4]:
def t(x):
    """Convert a NumPy array into a float32 torch tensor."""
    as_tensor = torch.from_numpy(x)
    return as_tensor.float()

In [5]:
class Adam:
    """Minimal Adam optimizer that updates a list of tensors in place.

    Author's note: same performance with torch.optim.Adam.

    Args:
        model_params: iterable of tensors to optimize (materialized into a list).
        lr: step size.
        betas: decay rates for the first- and second-moment running averages.
        eps: constant added to the denominator for numerical stability.
    """

    def __init__(self, model_params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
        self.params = list(model_params)
        self.lr = lr
        self.beta_1, self.beta_2 = betas
        self.eps = eps
        # Running first (M) and second (V) moment estimates, one per parameter.
        self.M = [torch.zeros_like(p) for p in self.params]
        self.V = [torch.zeros_like(p) for p in self.params]
        self.n_steps = 0

    def zero_grad(self):
        """Drop the stored gradient of every parameter."""
        for param in self.params:
            param.grad = None

    @torch.no_grad()
    def step(self):
        """Apply one Adam update to every parameter that has a gradient.

        Parameters whose ``grad`` is ``None`` (they did not take part in the
        last backward pass) are skipped and their moment estimates are left
        untouched, instead of raising a ``TypeError``.
        """
        self.n_steps += 1
        # The bias-correction denominators depend only on the step count.
        m_correction = 1 - self.beta_1 ** self.n_steps
        v_correction = 1 - self.beta_2 ** self.n_steps

        for i, param in enumerate(self.params):
            dw = param.grad
            if dw is None:
                continue

            self.M[i] = self.M[i] * self.beta_1 + (1 - self.beta_1) * dw
            self.V[i] = self.V[i] * self.beta_2 + (1 - self.beta_2) * dw ** 2

            m_hat = self.M[i] / m_correction  # bias correction
            v_hat = self.V[i] / v_correction  # bias correction

            # In-place update so callers holding the tensor see the new value.
            param -= self.lr * m_hat / (torch.sqrt(v_hat) + self.eps)


In [6]:
class NN_Open_Torch:
    """Two-layer fully connected network (ReLU hidden layer) with exposed weights.

    With ``C == 1`` the forward pass returns the raw linear output; otherwise
    it returns softmax probabilities over the ``C`` outputs.

    Args:
        D: number of input features.
        H: number of hidden units.
        C: number of outputs.
    """

    def __init__(self, D, H, C):
        self.tW1 = torch.randn(H, D, dtype=torch.float, requires_grad=True)
        self.tb1 = torch.zeros((1, H), requires_grad=True)
        self.tW2 = torch.randn(C, H, dtype=torch.float, requires_grad=True)
        self.tb2 = torch.zeros((1, C), requires_grad=True)
        self.C = C
        # The same four tensors, exposed as a list and as a dict keyed like
        # the gradient dictionaries returned by grads_manual.
        self.params = [self.tW1, self.tb1, self.tW2, self.tb2]
        self.ws = {'w1': self.tW1, 'b1': self.tb1, 'w2': self.tW2, 'b2': self.tb2}

    def parameters(self):
        """Return the parameter tensors as [tW1, tb1, tW2, tb2]."""
        return self.params

    def zero_grad(self):
        """Drop the stored gradient of every parameter."""
        for param in self.params:
            param.grad = None

    def forward(self, X):
        """Run the network on ``X``.

        A 1-D ``X`` broadcasts against the (1, H) bias, so the outputs always
        carry a leading batch dimension.

        Returns:
            (th, out): the hidden ReLU activations and either the raw output
            (``C == 1``) or the softmax probabilities along dim 1.
        """
        th = torch.relu(X @ self.tW1.T + self.tb1)
        z = th @ self.tW2.T + self.tb2
        if self.C == 1:
            return th, z

        tP = torch.softmax(z, dim=1)
        return th, tP

    def grads_manual(self, tP, y_train, th, X, N=1):
        """Hand-derived gradients for a softmax output trained against ``y_train``.

        Args:
            tP: predicted probabilities, shape (batch, C).
            y_train: target distribution with the same shape as ``tP``.
            th: hidden activations returned by ``forward`` for the same input.
            X: network input; a tensor or NumPy array, 1-D or (batch, D).
            N: divisor applied to the gradients (the batch size for a mean
               loss). Defaults to 1 instead of reading a notebook global.

        Returns:
            dict with keys 'w1', 'b1', 'w2', 'b2' holding plain gradient
            tensors (no autograd history).
        """
        # Gradients are computed by hand, so no autograd graph is needed.
        with torch.no_grad():
            dz2 = tP - y_train
            dW2 = (dz2.T @ th) / N
            dh = (dz2 @ self.tW2) / N
            db2 = torch.sum(dz2, dim=0) / N

            # ReLU backward: no gradient flows through units that output zero.
            dz1 = dh.clone()
            dz1[th <= 0] = 0

            # Accept NumPy / 1-D inputs by normalizing to a (batch, D) tensor.
            X2d = torch.as_tensor(X, dtype=dz1.dtype).reshape(dz1.shape[0], -1)
            dW1 = dz1.T @ X2d
            db1 = torch.sum(dz1, dim=0)

        return {'w1': dW1, 'b1': db1, 'w2': dW2, 'b2': db2}

In [7]:
def grads_auto(model):
    """Snapshot the autograd gradients of ``model`` and then zero them.

    Returns a dict keyed 'w1', 'b1', 'w2', 'b2' holding clones of the
    gradients stored on ``model.tW1``, ``model.tb1``, ``model.tW2`` and
    ``model.tb2``. The stored ``.grad`` tensors are zeroed in place afterwards.
    """
    key_to_attr = (('w1', 'tW1'), ('b1', 'tb1'), ('w2', 'tW2'), ('b2', 'tb2'))
    with torch.no_grad():
        # Clone everything first so the snapshot is complete before any reset.
        snapshot = {key: getattr(model, attr).grad.clone() for key, attr in key_to_attr}
        for _, attr in key_to_attr:
            getattr(model, attr).grad.zero_()
    return snapshot

In [8]:
def evaluate_model(env, num_ep=20, policy=None, max_steps=2000):
    """
    Run env num_ep times and return the average episode reward.

    Actions are sampled from the policy's output distribution (not argmax).

    Args:
        env: environment exposing ``reset()`` and a ``step(action)`` that
            returns ``(state, reward, done, info)``.
        num_ep: number of evaluation episodes.
        policy: object whose ``forward(x)`` returns ``(hidden, probs)``.
            Defaults to the notebook-level ``actor``, looked up at call time.
        max_steps: upper bound on the number of steps in one episode.

    Returns:
        Mean of the per-episode total rewards.
    """
    if policy is None:
        policy = actor  # fall back to the notebook's current actor

    rewards = []
    # Evaluation never back-propagates, so skip building autograd graphs.
    with torch.no_grad():
        for ep in range(num_ep):
            state = env.reset()
            ep_reward = 0
            for step_idx in range(max_steps):
                hidden, probs = policy.forward(t(state))
                dist = torch.distributions.Categorical(probs=probs[0])
                action = dist.sample().numpy()

                state, reward, done, info = env.step(action)
                ep_reward += reward
                if done:
                    break

            rewards.append(ep_reward)
    return np.array(rewards).mean()

### run1: base 59.85

In [9]:
# Re-seed NumPy, the gym environment and torch with the shared `seed` before run1.
np.random.seed(seed)
env.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fcb68032470>

In [10]:
# Policy network (one output per action) and value network (single output),
# both with 128 hidden units.
actor = NN_Open_Torch(D=state_dim, H=128, C=n_actions)
critic = NN_Open_Torch(D=state_dim, H=128, C=1)

In [11]:
# Replace the constructor's random weights with the saved starting weights
# (sw*/sb* for the actor, sw*c/sb*c for the critic). Clones are used so the
# saved tensors themselves are never modified by training.
saved_weights = (
    (actor, (sw1, sb1, sw2, sb2)),
    (critic, (sw1c, sb1c, sw2c, sb2c)),
)
with torch.no_grad():
    for net, saved in saved_weights:
        # parameters() returns [tW1, tb1, tW2, tb2], the same order as `saved`.
        for param, value in zip(net.parameters(), saved):
            param.data = value.clone()

In [12]:
# Discount factor applied to the critic's next-state value in the training loops.
gamma = 0.99

In [13]:

# Run 1: one-step actor-critic, gradients from autograd, plain SGD updates
# applied after every environment step.
episode_rewards = []
lr = 0.01
for i in range(200 + 1):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        # Sample an action from the current policy.
        _, probs = actor.forward(t(state))
        dist = torch.distributions.Categorical(probs=probs[0])
        action = dist.sample()

        next_state, reward, done, info = env.step(action.detach().data.numpy())

        # One-step TD error; the bootstrap term is dropped on terminal steps.
        value_next = critic.forward(t(next_state))[1]
        value_now = critic.forward(t(state))[1]
        advantage = reward + (1 - done) * gamma * value_next - value_now

        total_reward += reward
        state = next_state

        # Both losses are built from tensors computed before any update; the
        # actor only sees the advantage as a constant.
        critic_loss = 0.5 * advantage.pow(2).mean()
        actor_loss = -dist.log_prob(action) * advantage.detach()

        # Critic first, then actor: backward pass followed by an SGD step.
        for net, loss in ((critic, critic_loss), (actor, actor_loss)):
            net.zero_grad()
            loss.backward()
            with torch.no_grad():
                for param in net.parameters():
                    param.data -= lr * param.grad

    episode_rewards.append(total_reward)
    if i % 20 != 0:
        continue

    # Every 20 episodes: report progress and stop once the threshold is met.
    avg20 = np.mean(episode_rewards[-20:])
    print(f'episode:{i} score:{total_reward} last_20:{avg20}')
    if avg20 >= env.spec.reward_threshold:
        print(f'-------solved in {i} steps-------')
        break

episode:0 score:15.0 last_20:15.0
episode:20 score:15.0 last_20:14.45
episode:40 score:15.0 last_20:19.1
episode:60 score:62.0 last_20:32.25
episode:80 score:24.0 last_20:41.75
episode:100 score:14.0 last_20:48.6
episode:120 score:21.0 last_20:62.35
episode:140 score:19.0 last_20:21.45
episode:160 score:29.0 last_20:23.7
episode:180 score:44.0 last_20:35.1
episode:200 score:50.0 last_20:57.2


In [14]:
# Score the actor trained in run 1 with evaluate_model's default settings.
mean_eval_reward = evaluate_model(env)
print('avg_r', mean_eval_reward)

avg_r 68.0


### end of run1

### now run2: using auto_grad

In [15]:
# Run 2 starts from the same RNG state and the same weights as run 1.
for reseed in (np.random.seed, env.seed, torch.manual_seed):
    reseed(seed)

actor = NN_Open_Torch(D=state_dim, H=128, C=n_actions)
critic = NN_Open_Torch(D=state_dim, H=128, C=1)

# Load clones of the saved starting weights; parameters() returns
# [tW1, tb1, tW2, tb2], the same order as each tuple below.
with torch.no_grad():
    for net, saved in ((actor, (sw1, sb1, sw2, sb2)), (critic, (sw1c, sb1c, sw2c, sb2c))):
        for param, value in zip(net.parameters(), saved):
            param.data = value.clone()

In [16]:
# Run 2: same algorithm as run 1, but the autograd gradients are first pulled
# into dictionaries by grads_auto and applied through each network's `ws` map.
episode_rewards = []
lr = 0.01
for i in range(200 + 1):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        # Sample an action from the current policy.
        _, probs = actor.forward(t(state))
        dist = torch.distributions.Categorical(probs=probs[0])
        action = dist.sample()

        next_state, reward, done, info = env.step(action.detach().data.numpy())

        # TD target and current estimate; the bootstrap term vanishes when done.
        yhat = reward + (1 - done) * gamma * critic.forward(t(next_state))[1]
        y = critic.forward(t(state))[1]
        advantage = yhat - y

        total_reward += reward
        state = next_state

        critic_loss = 0.5 * advantage.pow(2).mean()
        actor_loss = -dist.log_prob(action) * advantage.detach()

        # Critic first, then actor: backward, snapshot the gradients, SGD step.
        for net, loss in ((critic, critic_loss), (actor, actor_loss)):
            net.zero_grad()
            loss.backward()
            auto_grads = grads_auto(net)
            with torch.no_grad():
                for key, grad in auto_grads.items():
                    net.ws[key] -= lr * grad

    episode_rewards.append(total_reward)
    if i % 20 != 0:
        continue

    # Every 20 episodes: report progress and stop once the threshold is met.
    avg20 = np.mean(episode_rewards[-20:])
    print(f'episode:{i} score:{total_reward} last_20:{avg20}')
    if avg20 >= env.spec.reward_threshold:
        print(f'-------solved in {i} steps-------')
        break

episode:0 score:15.0 last_20:15.0
episode:20 score:15.0 last_20:14.45
episode:40 score:15.0 last_20:19.1
episode:60 score:62.0 last_20:32.25
episode:80 score:24.0 last_20:41.75
episode:100 score:14.0 last_20:48.6
episode:120 score:21.0 last_20:62.35
episode:140 score:19.0 last_20:21.45
episode:160 score:29.0 last_20:23.7
episode:180 score:44.0 last_20:35.1
episode:200 score:50.0 last_20:57.2


In [17]:
# Score the actor trained in run 2 with evaluate_model's default settings.
mean_eval_reward = evaluate_model(env)
print('avg_r', mean_eval_reward)

avg_r 68.0


### end of run2
### Now, run3: actor manual_grad, critic auto_grad

In [18]:
def grads_manual(tP, y_train, th, X, W2, adv, N=1):
    """Hand-derived policy-gradient gradients for a two-layer softmax actor.

    Computes the gradient of ``-log(tP[action]) * adv`` with respect to the
    actor weights, where ``y_train`` is the one-hot encoding of the action.

    Args:
        tP: action probabilities from the actor, shape (batch, C).
        y_train: one-hot encoding of the taken action, same shape as ``tP``.
        th: hidden ReLU activations of the actor for the same input.
        X: actor input as a tensor or NumPy array, 1-D or (batch, D).
        W2: second-layer weight matrix of the actor, shape (C, H).
        adv: advantage (scalar or tensor) that scales the gradient.
        N: divisor applied to the output-layer error (batch size).

    Returns:
        dict with keys 'w1', 'b1', 'w2', 'b2' holding plain gradient tensors
        (no autograd history).
    """
    # Everything here is computed by hand, so autograd tracking is switched
    # off; this also keeps `adv` from dragging the critic's graph along.
    with torch.no_grad():
        dz2 = (tP - y_train) * adv / N
        dW2 = dz2.T @ th
        dh = dz2 @ W2
        db2 = torch.sum(dz2, dim=0)

        # ReLU backward. The `<=` matters: units whose output is exactly zero
        # must not receive gradient. clone() replaces torch.tensor(dh), which
        # emits a copy-construct UserWarning on every call.
        dz1 = dh.clone()
        dz1[th <= 0] = 0

        # Callers pass the raw NumPy state; normalize it to a (batch, D) tensor.
        X2d = torch.as_tensor(X, dtype=dz1.dtype).reshape(dz1.shape[0], -1)
        dW1 = dz1.T @ X2d
        db1 = torch.sum(dz1, dim=0)

    return {'w1': dW1, 'b1': db1, 'w2': dW2, 'b2': db2}

In [19]:
# Run 3 starts from the same RNG state and the same weights as the earlier runs.
for reseed in (np.random.seed, env.seed, torch.manual_seed):
    reseed(seed)

actor = NN_Open_Torch(D=state_dim, H=128, C=n_actions)
critic = NN_Open_Torch(D=state_dim, H=128, C=1)

# Load clones of the saved starting weights; parameters() returns
# [tW1, tb1, tW2, tb2], the same order as each tuple below.
with torch.no_grad():
    for net, saved in ((actor, (sw1, sb1, sw2, sb2)), (critic, (sw1c, sb1c, sw2c, sb2c))):
        for param, value in zip(net.parameters(), saved):
            param.data = value.clone()

In [20]:
# Run 3: the critic is still trained with autograd, while the actor update
# uses the hand-derived gradients from grads_manual. The actor no longer runs
# an autograd backward pass, because its result was never used.
episode_rewards = []
lr=0.01
for i in range(200+1):
    done = False
    total_reward = 0
    state = env.reset()

    while not done:
        # Sample an action from the current policy.
        x=t(state)
        ath, probs=actor.forward(x)
        dist = torch.distributions.Categorical(probs=probs[0])
        action = dist.sample()

        a=action.detach().data.numpy()
        next_state, reward, done, info = env.step(a)

        # TD target and current estimate; the bootstrap term vanishes when done.
        yhat=reward + (1-done)*gamma*critic.forward(t(next_state))[1]
        y=critic.forward(t(state))[1]
        advantage = yhat - y

        total_reward += reward

        # Critic: autograd gradients, applied as a plain SGD step.
        critic_loss = 0.5*advantage.pow(2).mean()
        critic.zero_grad()
        critic_loss.backward()
        gradsm_critic=grads_auto(critic)

        with torch.no_grad():
            for k, grad in gradsm_critic.items():
                critic.ws[k] -=lr*grad

        # Actor: manual gradients. The one-hot target marks the taken action,
        # and the advantage enters as a constant (detached from the critic).
        yt=np.eye(n_actions)[int(action)].reshape(1,-1)
        gradsm_actor=grads_manual(probs.detach(),  t(yt),  ath, state.reshape(1,-1), actor.tW2, advantage.detach())

        with torch.no_grad():
            for k, grad in gradsm_actor.items():
                actor.ws[k] -=lr*grad

        # Advance only after both updates, since they read the pre-step state.
        state = next_state

    episode_rewards.append(total_reward)
    if i%20==0:
        avg20=np.mean(episode_rewards[-20:])
        print(f'episode:{i} score:{total_reward} last_20:{avg20}')
        if avg20>=env.spec.reward_threshold:
            print(f'-------solved in {i} steps-------')
            break

episode:0 score:15.0 last_20:15.0


  dz1=torch.tensor(dh)


episode:20 score:15.0 last_20:14.45
episode:40 score:15.0 last_20:19.1
episode:60 score:62.0 last_20:32.25
episode:80 score:24.0 last_20:41.75
episode:100 score:14.0 last_20:48.6
episode:120 score:21.0 last_20:62.35
episode:140 score:19.0 last_20:21.45
episode:160 score:29.0 last_20:23.7
episode:180 score:44.0 last_20:35.1
episode:200 score:50.0 last_20:57.2


In [21]:
# Score the actor trained in run 3 with evaluate_model's default settings.
mean_eval_reward = evaluate_model(env)
print('avg_r', mean_eval_reward)

avg_r 68.0


### end of run3

### run 4: actor, critic both manual grad

In [22]:
# Run 4 starts from the same RNG state and the same weights as the earlier runs.
for reseed in (np.random.seed, env.seed, torch.manual_seed):
    reseed(seed)

actor = NN_Open_Torch(D=state_dim, H=128, C=n_actions)
critic = NN_Open_Torch(D=state_dim, H=128, C=1)

# Load clones of the saved starting weights; parameters() returns
# [tW1, tb1, tW2, tb2], the same order as each tuple below.
with torch.no_grad():
    for net, saved in ((actor, (sw1, sb1, sw2, sb2)), (critic, (sw1c, sb1c, sw2c, sb2c))):
        for param, value in zip(net.parameters(), saved):
            param.data = value.clone()

In [23]:
def grads_manual_critic(yhat, y, th, th_next, state, next_state, w2, done, N=1):
    """Hand-derived gradients of 0.5 * (yhat - y)**2 for the two-layer critic.

    Both the current value ``y`` and the bootstrapped target ``yhat`` are
    treated as functions of the critic weights, so every gradient is the
    difference between a discounted next-state term and a current-state term.

    Args:
        yhat: TD target, reward + (1 - done) * gamma * V(next_state).
        y: current value estimate V(state).
        th: critic hidden activations for ``state``.
        th_next: critic hidden activations for ``next_state``.
        state, next_state: NumPy observations before and after the step.
        w2: second-layer weight matrix of the critic.
        done: whether the step ended the episode (removes the bootstrap term).
        N: divisor applied to the TD error.

    Returns:
        dict with keys 'w1', 'b1', 'w2', 'b2'.
    """
    # Weight of the next-state branch: gamma normally, zero on terminal steps.
    bootstrap = (1 - done) * gamma

    td_error = (yhat - y) / N

    # Output layer: d(yhat - y)/dw2 and d(yhat - y)/db2.
    hidden_diff = bootstrap * th_next - th
    dw2 = td_error.T @ hidden_diff
    db2 = torch.sum(td_error * (bootstrap - 1), axis=0)

    # Error pushed back to the hidden layer, gated by each branch's ReLU mask.
    # The `<=` matters: units whose output is exactly zero get no gradient.
    dh = td_error @ w2
    grad_hidden_next = dh.detach().masked_fill(th_next <= 0, 0)
    grad_hidden_now = dh.detach().masked_fill(th <= 0, 0)

    obs_next = t(next_state.reshape(1, -1))
    obs_now = t(state.reshape(1, -1))
    dw1 = (bootstrap * grad_hidden_next.T) @ obs_next - grad_hidden_now.T @ obs_now
    db1 = torch.sum(bootstrap * grad_hidden_next - grad_hidden_now, axis=0)

    return {'w1': dw1, 'b1': db1, 'w2': dw2, 'b2': db2}

In [24]:
# Run 4: actor and critic are both updated with hand-derived gradients.
# Nothing is back-propagated, so each step runs with autograd switched off
# and the unused critic_loss / actor_loss tensors are no longer built.
episode_rewards = []
lr=0.01
for i in range(200+1):
    done = False
    total_reward = 0
    state = env.reset()

    while not done:
        with torch.no_grad():
            # Sample an action from the current policy.
            ath, probs=actor.forward(t(state))
            dist = torch.distributions.Categorical(probs=probs[0])
            action = dist.sample()

            a=action.numpy()
            next_state, reward, done, info = env.step(a)

            # TD target and current estimate, with the hidden activations the
            # manual critic gradient needs for both states.
            th_next, y_next=critic.forward(t(next_state))
            yhat=reward + (1-done)*gamma*y_next
            th, y=critic.forward(t(state))
            advantage = yhat - y

            total_reward += reward

            # Critic: manual gradients, applied as a plain SGD step.
            gradsm_critic=grads_manual_critic(yhat, y, th, th_next, state, next_state, critic.tW2, done)
            for k, grad in gradsm_critic.items():
                critic.ws[k] -=lr*grad

            # Actor: manual gradients; the one-hot target marks the taken
            # action. `advantage` was computed before the critic update.
            yt=np.eye(n_actions)[int(action)].reshape(1,-1)
            gradsm_actor=grads_manual(probs,  t(yt),  ath, state.reshape(1,-1), actor.tW2, advantage)
            for k, grad in gradsm_actor.items():
                actor.ws[k] -=lr*grad

            # Advance only after both updates, since they read the pre-step state.
            state = next_state

    episode_rewards.append(total_reward)
    if i%20==0:
        avg20=np.mean(episode_rewards[-20:])
        print(f'episode:{i} score:{total_reward} last_20:{avg20}')
        if avg20>=env.spec.reward_threshold:
            print(f'-------solved in {i} steps-------')
            break

episode:0 score:15.0 last_20:15.0


  dz1=torch.tensor(dh)


episode:20 score:15.0 last_20:14.45
episode:40 score:15.0 last_20:19.1
episode:60 score:62.0 last_20:32.25
episode:80 score:24.0 last_20:41.75
episode:100 score:14.0 last_20:48.6
episode:120 score:21.0 last_20:62.35
episode:140 score:19.0 last_20:21.45
episode:160 score:29.0 last_20:23.7
episode:180 score:44.0 last_20:35.1
episode:200 score:50.0 last_20:57.2


In [25]:
# Score the actor trained in run 4 with evaluate_model's default settings.
mean_eval_reward = evaluate_model(env)
print('avg_r', mean_eval_reward)

avg_r 68.0
