In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import gym
from torch import nn
import matplotlib.pyplot as plt
import math

## challange solved in 3 weeks (torch only, np yet ...)

In [2]:
env_name='CartPole-v1'
env_name='CartPole-v0'
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

In [3]:
seed=4
np.random.seed(seed)
env.seed(seed)
torch.manual_seed(seed)

N,D=1, 4
H= 128     
C=2 

sw1 = 0.04*torch.randn(H , D,  dtype=torch.float) 
sb1 = torch.zeros((1,H))
sw2 = 0.04*torch.randn(C, H, dtype=torch.float) 
sb2 = torch.zeros((1,C))

sw1c = torch.randn(H , D,  dtype=torch.float)   #for critic
sb1c = torch.zeros((1,H))                           #for critic
sw2c = torch.randn(1, H, dtype=torch.float)    #for critic
sb2c = torch.zeros((1,1))                           #for critic


print('input features: ',D ,' output class: ',C)

input features:  4  output class:  2


In [4]:
def t(x): return torch.from_numpy(x).float()

In [5]:
def tn(tnsr): return tnsr.detach().numpy().copy()

In [6]:
class Adam:
    #same performance with torch.optim.Adam
    def __init__(self, model_params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
        self.params = list(model_params)
        self.lr = lr
        self.beta_1, self.beta_2 = betas
        self.eps = eps
        self.M= [torch.zeros_like(p) for p in self.params]
        self.V = [torch.zeros_like(p) for p in self.params]
        self.n_steps = 0
        
    def zero_grad(self):
        for param in self.params:
            param.grad = None

    @torch.no_grad()
    def step(self):
        self.n_steps += 1 
        for i in range(len(self.params)): 
            dw=self.params[i].grad
             
            self.M[i]= self.M[i]*self.beta_1 + (1-self.beta_1) * dw
            self.V[i] = self.V[i] *self.beta_2 + (1 - self.beta_2)* dw**2
 
            m_hat = self.M[i] / (1 - self.beta_1 ** self.n_steps)  #bias correction
            v_hat = self.V[i] / (1 - self.beta_2 ** self.n_steps) #bias correction
             
            self.params[i] -= self.lr * m_hat / (torch.sqrt(v_hat)+self.eps)


In [7]:
class NN_Open_Torch:
    def __init__(self, D, H, C):
        self.tW1 = torch.randn(H , D,  dtype=torch.float, requires_grad=True)
        self.tb1 = torch.zeros((1,H), requires_grad=True)
        self.tW2 = torch.randn(C,H, dtype=torch.float, requires_grad=True)
        self.tb2 = torch.zeros((1,C), requires_grad=True)
        self.C=C
        self.params=[self.tW1, self.tb1, self.tW2, self.tb2]
        self.ws={'w1':self.tW1, 'b1':self.tb1, 'w2':self.tW2, 'b2':self.tb2}
        
    def parameters(self):
        return self.params 
    
    def zero_grad(self):
        for param in self.params:
            param.grad=None
        
    def forward(self, X):
        th= torch.relu( X   @ self.tW1.T  + self.tb1) 
        z=th @self.tW2.T  + self.tb2 
        if self.C==1:
            return th, z
        
        tP=torch.softmax( z , axis=1)
        return th, tP


In [8]:
def grads_auto(model):
    grads={}
    with torch.no_grad():
        grads['w1']=model.tW1.grad.clone()
        grads['b1']=model.tb1.grad.clone()
        grads['w2']=model.tW2.grad.clone()
        grads['b2']=model.tb2.grad.clone()
        
        model.tb1.grad.zero_() 
        model.tW1.grad.zero_() 
        model.tb2.grad.zero_() 
        model.tW2.grad.zero_() 
    return grads

In [9]:
def evaluate_model(env, num_ep=20):
    np.random.seed(seed)
    env.seed(seed)
    torch.manual_seed(seed)

    """
    run env num_ep times and return average reward.
    """
    rewards = []
    for ep in range(num_ep):
        state = env.reset() 
        ep_reward=0
        for tt in range(2000):
            h,probs = actor.forward(t(state))
            action=torch.argmax(probs)
            # dist = torch.distributions.Categorical(probs=probs[0])
            # action = dist.sample()
            # action=action.detach().numpy() 

            state, reward, done, _ = env.step(action.numpy())
            ep_reward+=reward
            if done:
                # print(f"ep:{ep} reward:{ep_reward}")
                break 
        
        rewards.append(ep_reward)
    return np.array(rewards).mean()

In [10]:
gamma = 0.99
nepisode=200
nepisode=500
lr=0.01

### run1: base 59.85

In [11]:
np.random.seed(seed)
env.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f94e65cb490>

In [12]:
actor=NN_Open_Torch(state_dim, 128, n_actions)
critic=NN_Open_Torch(state_dim, 128, 1) 

In [13]:
with torch.no_grad():
    actor.tW1.data=sw1.clone()
    actor.tb1.data=sb1.clone()
    actor.tW2.data=sw2.clone()
    actor.tb2.data=sb2.clone() 
    
    critic.tW1.data=sw1c.clone()
    critic.tb1.data=sb1c.clone()
    critic.tW2.data=sw2c.clone()
    critic.tb2.data=sb2c.clone() 

In [14]:

episode_rewards = []

for i in range(nepisode+1):
    done = False
    total_reward = 0
    state = env.reset()


    while not done:
        x=t(state ) 
        th, probs=actor.forward(x) 
        dist = torch.distributions.Categorical(probs=probs[0])
        action = dist.sample() 
        a=action.detach().data.numpy()  
        
        
        next_state, reward, done, info = env.step(a)
        advantage = reward + (1-done)*gamma*critic.forward(t(next_state))[1] - critic.forward(t(state))[1]
        
        total_reward += reward
        state = next_state

        critic_loss = 0.5*advantage.pow(2).mean() 
        critic.zero_grad()
        critic_loss.backward() 
        # adam_critic.step() 
        with torch.no_grad():
            for param in critic.parameters():
                param.data -=lr*param.grad 
        
        
         
        actor_loss = -dist.log_prob(action)*advantage.detach()
        actor.zero_grad() 
        actor_loss.backward()
        # adam_actor.step() 
        
        with torch.no_grad(): 
            for param in actor.parameters():
                param.data -=lr*param.grad
            
    episode_rewards.append(total_reward)
    if i%20==0:
        avg20=np.mean(episode_rewards[-20:])
        print(f'episode:{i} score:{total_reward} last_20:{avg20}')
        if avg20>=env.spec.reward_threshold:
            print(f'-------solved in {i} steps-------')
            break

episode:0 score:15.0 last_20:15.0
episode:20 score:15.0 last_20:14.45
episode:40 score:15.0 last_20:19.1
episode:60 score:62.0 last_20:32.25
episode:80 score:24.0 last_20:41.75
episode:100 score:14.0 last_20:48.6
episode:120 score:21.0 last_20:62.35
episode:140 score:19.0 last_20:21.45
episode:160 score:29.0 last_20:23.7
episode:180 score:44.0 last_20:35.1
episode:200 score:50.0 last_20:57.2
episode:220 score:58.0 last_20:70.95
episode:240 score:75.0 last_20:73.45
episode:260 score:46.0 last_20:56.2
episode:280 score:51.0 last_20:44.45
episode:300 score:57.0 last_20:43.2
episode:320 score:52.0 last_20:44.7
episode:340 score:67.0 last_20:72.2
episode:360 score:58.0 last_20:73.2
episode:380 score:126.0 last_20:102.0
episode:400 score:90.0 last_20:57.75
episode:420 score:93.0 last_20:61.15
episode:440 score:136.0 last_20:77.65
episode:460 score:51.0 last_20:61.0
episode:480 score:79.0 last_20:71.35
episode:500 score:97.0 last_20:96.8


In [15]:
avg_r=evaluate_model(env)
print('avg_r',avg_r)

avg_r 79.95


### end of run1

### now run2: using auto_grad

In [16]:
np.random.seed(seed)
env.seed(seed)
torch.manual_seed(seed)

actor=NN_Open_Torch(state_dim, 128, n_actions)
critic=NN_Open_Torch(state_dim, 128, 1) 

with torch.no_grad():
    actor.tW1.data=sw1.clone()
    actor.tb1.data=sb1.clone()
    actor.tW2.data=sw2.clone()
    actor.tb2.data=sb2.clone() 
    
    critic.tW1.data=sw1c.clone()
    critic.tb1.data=sb1c.clone()
    critic.tW2.data=sw2c.clone()
    critic.tb2.data=sb2c.clone() 

In [17]:
episode_rewards = []
lr=0.01
for i in range(nepisode+1):
    done = False
    total_reward = 0
    state = env.reset()


    while not done:
        x=t(state ) 
        th, probs=actor.forward(x) 
        dist = torch.distributions.Categorical(probs=probs[0])
        action = dist.sample()
        
        a=action.detach().data.numpy()  
        next_state, reward, done, info = env.step(a)
        yhat=reward + (1-done)*gamma*critic.forward(t(next_state))[1]
        y=critic.forward(t(state))[1]
        advantage = yhat - y
        
        total_reward += reward
        state = next_state

        critic_loss = 0.5*advantage.pow(2).mean() 
        critic.zero_grad()
        critic_loss.backward() 
        # gradsm_critic=grads_manual(yhat,  y, th, state.reshape(1,-1), critic.tW2)   
        gradsm_critic=grads_auto(critic)
        
        with torch.no_grad(): 
            for k,v in gradsm_critic.items():
                critic.ws[k] -=lr*gradsm_critic[k]
 
        actor_loss = -dist.log_prob(action)*advantage.detach()
        actor.zero_grad() 
        actor_loss.backward()
        gradsm_actor=grads_auto(actor)
        
        with torch.no_grad(): 
            for k,v in gradsm_actor.items():
                actor.ws[k] -=lr*gradsm_actor[k]
                
            
    episode_rewards.append(total_reward)
    if i%20==0:
        avg20=np.mean(episode_rewards[-20:])
        print(f'episode:{i} score:{total_reward} last_20:{avg20}')
        if avg20>=env.spec.reward_threshold:
            print(f'-------solved in {i} steps-------')
            break

episode:0 score:15.0 last_20:15.0
episode:20 score:15.0 last_20:14.45
episode:40 score:15.0 last_20:19.1
episode:60 score:62.0 last_20:32.25
episode:80 score:24.0 last_20:41.75
episode:100 score:14.0 last_20:48.6
episode:120 score:21.0 last_20:62.35
episode:140 score:19.0 last_20:21.45
episode:160 score:29.0 last_20:23.7
episode:180 score:44.0 last_20:35.1
episode:200 score:50.0 last_20:57.2
episode:220 score:58.0 last_20:70.95
episode:240 score:75.0 last_20:73.45
episode:260 score:46.0 last_20:56.2
episode:280 score:51.0 last_20:44.45
episode:300 score:57.0 last_20:43.2
episode:320 score:52.0 last_20:44.7
episode:340 score:67.0 last_20:72.2
episode:360 score:58.0 last_20:73.2
episode:380 score:126.0 last_20:102.0
episode:400 score:90.0 last_20:57.75
episode:420 score:93.0 last_20:61.15
episode:440 score:136.0 last_20:77.65
episode:460 score:51.0 last_20:61.0
episode:480 score:79.0 last_20:71.35
episode:500 score:97.0 last_20:96.8


In [18]:
avg_r=evaluate_model(env)
print('avg_r',avg_r)

avg_r 79.95


### end of run2

### Now, run3: actor manual_grad, critic augo_grad

In [19]:
def grads_manual(tP, y_train, th, X, W2, adv, N=1):
    # print('shape:', tP.shape, y_train.shape)
    grads={}
    
    dz2=(tP-y_train)*adv  /N 
    dW2=(dz2.T  @  th) 
    dh=(dz2 @   W2  ) 
    db2=torch.sum(dz2 , axis=0 ) 


    dz1=torch.tensor(dh)   
    dz1[th<=0]=0                     #equal sign is extremely important. 

    dW1=( dz1.T @ X) 
    db1=torch.sum(dz1, axis=0 ) 
    
    grads={'w1':dW1, 'b1':db1, 'w2':dW2, 'b2':db2}
    return grads

In [20]:
np.random.seed(seed)
env.seed(seed)
torch.manual_seed(seed)

actor=NN_Open_Torch(state_dim, 128, n_actions)
critic=NN_Open_Torch(state_dim, 128, 1) 

with torch.no_grad():
    actor.tW1.data=sw1.clone()
    actor.tb1.data=sb1.clone()
    actor.tW2.data=sw2.clone()
    actor.tb2.data=sb2.clone() 
    
    critic.tW1.data=sw1c.clone()
    critic.tb1.data=sb1c.clone()
    critic.tW2.data=sw2c.clone()
    critic.tb2.data=sb2c.clone() 

In [21]:
episode_rewards = []
lr=0.01
for i in range(nepisode+1):
    done = False
    total_reward = 0
    state = env.reset()


    while not done:
        x=t(state ) 
        ath, probs=actor.forward(x) 
        # dist = torch.distributions.Categorical(probs=probs[0]) 
        # a=dist.sample().numpy()
        probs_np=probs.detach().numpy().copy()
        a= np.random.choice([0,1],p=probs_np[0])  #why torch.dist.. gives better result?
        
        next_state, reward, done, info = env.step(a)
        yhat=reward + (1-done)*gamma*critic.forward(t(next_state))[1]
        y=critic.forward(t(state))[1]
        advantage = yhat - y
        
        total_reward += reward 

        critic_loss = 0.5*advantage.pow(2).mean() 
        critic.zero_grad()
        critic_loss.backward()    
        gradsm_critic=grads_auto(critic)
        
        with torch.no_grad(): 
            for k,v in gradsm_critic.items():
                critic.ws[k] -=lr*gradsm_critic[k]
 
        yt=np.eye(2)[a]
        yt=yt.reshape(1,-1)
        gradsm_actor=grads_manual(probs.detach(),  t(yt),  ath, state.reshape(1,-1), actor.tW2, advantage)   
        
        with torch.no_grad(): 
            for k,v in gradsm_actor.items():
                actor.ws[k] -=lr*gradsm_actor[k]
        
        state = next_state
            
    episode_rewards.append(total_reward)
    if i%20==0:
        avg20=np.mean(episode_rewards[-20:])
        print(f'episode:{i} score:{total_reward} last_20:{avg20}')
        if avg20>=env.spec.reward_threshold:
            print(f'-------solved in {i} steps-------')
            break

episode:0 score:11.0 last_20:11.0


  dz1=torch.tensor(dh)


episode:20 score:23.0 last_20:18.6
episode:40 score:8.0 last_20:19.95
episode:60 score:9.0 last_20:17.0
episode:80 score:16.0 last_20:26.7
episode:100 score:16.0 last_20:24.25
episode:120 score:12.0 last_20:23.7
episode:140 score:18.0 last_20:28.45
episode:160 score:49.0 last_20:29.45
episode:180 score:60.0 last_20:34.45
episode:200 score:39.0 last_20:39.15
episode:220 score:66.0 last_20:44.95
episode:240 score:21.0 last_20:56.7
episode:260 score:36.0 last_20:62.4
episode:280 score:49.0 last_20:65.15
episode:300 score:45.0 last_20:56.15
episode:320 score:57.0 last_20:85.95
episode:340 score:33.0 last_20:60.5
episode:360 score:62.0 last_20:46.8
episode:380 score:37.0 last_20:46.6
episode:400 score:39.0 last_20:29.4
episode:420 score:119.0 last_20:50.75
episode:440 score:70.0 last_20:70.2
episode:460 score:76.0 last_20:77.65
episode:480 score:54.0 last_20:61.5
episode:500 score:55.0 last_20:64.9


In [22]:
avg_r=evaluate_model(env)
print('avg_r',avg_r)

avg_r 75.35


### end of run3

### run 4: actor, critic both manual grad

### loss not match after episode 420

In [23]:
np.random.seed(seed)
env.seed(seed)
torch.manual_seed(seed)

actor=NN_Open_Torch(state_dim, 128, n_actions)
critic=NN_Open_Torch(state_dim, 128, 1) 

with torch.no_grad():
    actor.tW1.data=sw1.clone()
    actor.tb1.data=sb1.clone()
    actor.tW2.data=sw2.clone()
    actor.tb2.data=sb2.clone() 
    
    critic.tW1.data=sw1c.clone()
    critic.tb1.data=sb1c.clone()
    critic.tW2.data=sw2c.clone()
    critic.tb2.data=sb2c.clone() 

In [24]:
def grads_manual_critic(yhat, y, th, th_next, state, next_state, w2, done, N=1):
    grads={}
    
    dz2=(yhat - y)  /N 
    cth=(1-done)*gamma*th_next-th
    dw2=(dz2.T  @  cth) 

    dh=(dz2 @   w2  ) 

    db2=torch.sum(dz2*((1-done)*gamma-1) , axis=0 ) 

    
    # dz1n=torch.tensor(dh)
    dz1n=dh.detach().clone()
    dz1n[th_next<=0]=0                     #equal sign is extremely important. 
    
    
    # dz1=torch.tensor(dh) 
    dz1=dh.detach().clone()
    dz1[th<=0]=0                     #equal sign is extremely important. 

     
    dw1=(1-done)* gamma* dz1n.T @ t(next_state.reshape(1,-1)) - dz1.T @ t(state.reshape(1,-1) ) 
    
    
    db1=torch.sum( ((1-done)*gamma*dz1n -dz1), axis=0 )
    
    grads={'w1':dw1, 'b1':db1, 'w2':dw2, 'b2':db2}
    return grads

In [25]:
episode_rewards = []
lr=0.01
for i in range(nepisode+1):
    done = False
    total_reward = 0
    state = env.reset()


    while not done:
        x=t(state ) 
        ath, probs=actor.forward(x) 
#         dist = torch.distributions.Categorical(probs=probs[0])
#         action = dist.sample() 
#         a=action.detach().data.numpy()  
        
        probs_np=probs.detach().numpy().copy()
        a= np.random.choice([0,1],p=probs_np[0])  #why torch.dist.. gives better result?
        
        
        next_state, reward, done, info = env.step(a)
        
        th_next, y_next=critic.forward(t(next_state))
        yhat=reward + (1-done)*gamma*y_next
        th, y=critic.forward(t(state)) 
        advantage = yhat - y
       
        total_reward += reward  
        gradsm_critic=grads_manual_critic(yhat, y, th, th_next, state, next_state, critic.tW2, done)
        
        with torch.no_grad(): 
            for k,v in gradsm_critic.items():
                critic.ws[k] -=lr*gradsm_critic[k]
  
        yt=np.eye(2)[a].reshape(1,-1) 
        gradsm_actor=grads_manual(probs.detach(),  t(yt),  ath, state.reshape(1,-1), actor.tW2, advantage)   
        
        with torch.no_grad(): 
            for k,v in gradsm_actor.items():
                actor.ws[k] -=lr*gradsm_actor[k]
        
        state = next_state
            
    episode_rewards.append(total_reward)
    if i%20==0:
        avg20=np.mean(episode_rewards[-20:])
        print(f'episode:{i} score:{total_reward} last_20:{avg20}')
        if avg20>=env.spec.reward_threshold:
            print(f'-------solved in {i} steps-------')
            break

episode:0 score:11.0 last_20:11.0


  dz1=torch.tensor(dh)


episode:20 score:23.0 last_20:18.6
episode:40 score:8.0 last_20:19.95
episode:60 score:9.0 last_20:17.0
episode:80 score:16.0 last_20:26.7
episode:100 score:16.0 last_20:24.25
episode:120 score:12.0 last_20:23.7
episode:140 score:18.0 last_20:28.45
episode:160 score:49.0 last_20:29.45
episode:180 score:60.0 last_20:34.45
episode:200 score:39.0 last_20:39.15
episode:220 score:66.0 last_20:44.95
episode:240 score:21.0 last_20:56.7
episode:260 score:36.0 last_20:62.4
episode:280 score:49.0 last_20:65.15
episode:300 score:45.0 last_20:56.15
episode:320 score:57.0 last_20:85.95
episode:340 score:33.0 last_20:60.5
episode:360 score:62.0 last_20:46.8
episode:380 score:37.0 last_20:46.6
episode:400 score:39.0 last_20:29.4
episode:420 score:119.0 last_20:50.75
episode:440 score:70.0 last_20:70.2
episode:460 score:76.0 last_20:77.65
episode:480 score:54.0 last_20:61.5
episode:500 score:55.0 last_20:64.9


In [26]:
avg_r=evaluate_model(env)
print('avg_r',avg_r)

avg_r 75.45


### numpy recreate forward check

In [27]:
class TwoNN():
    #numpy model
    def __init__(self, n_feature, n_hidden, n_output):
        print('simple two layer neural network')
        print(f'creating nn: #input:{n_feature} #hidden:{n_hidden} #output:{n_output}')
        self.n_output=n_output
        self.model={}
        self.model['w1']=np.random.randn(n_hidden, n_feature)/ np.sqrt(n_hidden) # "Xavier" initialization
        self.model['b1']= np.zeros((1,n_hidden))
        self.model['w2']=np.random.randn(n_output, n_hidden)/ np.sqrt(n_output) # "Xavier" initialization
        self.model['b2'] = np.zeros((1,n_output))
 
    def softmax(self, Z): 
        expz = np.exp(Z - np.max(Z))                    #to prevent overflow
        return expz / expz.sum(axis=1, keepdims=True)   #reduce columns for each row.
    
    def forward(self, X):
        """
        X: Nxn_feature
        """
        self.X=X
        self.h=X  @ self.model['w1'].T  + self.model['b1']             #(NxD)@(DxH)  
        self.h[self.h<0]=0                           #relu 
        z=self.h @ self.model['w2'].T + self.model['b2']
        if self.n_output==1: #regression.
            self.out=z
            return self.out
        
        self.out=self.softmax(z)    #(NxH)@(HxO)
        return self.out

In [28]:
def evaluate_model_np(env, actor, num_ep=20):
    np.random.seed(seed)
    env.seed(seed)
    torch.manual_seed(seed)

    """
    run env num_ep times and return average reward.
    """
    rewards = []
    for ep in range(num_ep):
        state = env.reset() 
        ep_reward=0
        for tt in range(2000):
            probs = actor.forward(state.reshape(1,-1)) 
            # action = np.random.choice(n_actions,p=probs[0])
            action=np.argmax(probs[0])

            state, reward, done, _ = env.step(action)
            ep_reward+=reward
            if done:
                # print(f"ep:{ep} reward:{ep_reward}")
                break 
        
        rewards.append(ep_reward)
    return np.array(rewards).mean()

### success: np actor and np critic. <b>(worked)</b>

In [29]:
def grads_manual_np(tP, y_train, th, X, W2, adv, N=1):
    # print('shape:', tP.shape, y_train.shape)
    grads={}
    
    dz2=(tP-y_train)*adv  /N 
    dw2=(dz2.T  @  th) 
    dh=(dz2 @   W2  )  
    db2=np.sum(dz2, axis=0, keepdims=True)  
 
    dh[th<=0]=0                     #equal sign is extremely important. 
 
    dw1 = dh.T @ X.reshape(1,-1)
    db1 = np.sum(dh, axis=0, keepdims=True) 
    grads={'w1':dw1, 'b1':db1, 'w2':dw2, 'b2':db2}
    return grads

In [30]:
def grads_manual_critic_np(yhat, y, th, th_next, state, next_state, w2, done, N=1):
    grads={}
    
    dz2=(yhat - y)  /N 
    cth=(1-done)*gamma*th_next-th
    dw2=(dz2.T  @  cth) 

    dh=(dz2 @   w2  ) 
 
    db2=np.sum(dz2*((1-done)*gamma-1), axis=0, keepdims=True)  
     
    dz1n=dh.copy()
    dz1n[th_next<=0]=0                     #equal sign is extremely important. 
    
     
    dz1=dh.copy()
    dz1[th<=0]=0                     #equal sign is extremely important. 

     
    dw1=(1-done)* gamma* dz1n.T @  next_state.reshape(1,-1) - dz1.T @ state.reshape(1,-1)  
    
    
    db1=np.sum( ((1-done)*gamma*dz1n -dz1), axis=0 , keepdims=True)
    
    grads={'w1':dw1, 'b1':db1, 'w2':dw2, 'b2':db2}
    return grads

In [31]:
np.random.seed(seed)
env.seed(seed)
torch.manual_seed(seed)


actor_np=TwoNN(state_dim, 128, n_actions) 
critic_np=TwoNN(state_dim, 128, 1) 


actor_np.model['w1']=sw1.numpy().copy()
actor_np.model['b1']=sb1.numpy().copy()
actor_np.model['w2']=sw2.numpy().copy()
actor_np.model['b2']=sb2.numpy().copy()
     
    
critic_np.model['w1']=sw1c.numpy().copy()
critic_np.model['b1']=sb1c.numpy().copy()
critic_np.model['w2']=sw2c.numpy().copy()
critic_np.model['b2']=sb2c.numpy().copy()

simple two layer neural network
creating nn: #input:4 #hidden:128 #output:2
simple two layer neural network
creating nn: #input:4 #hidden:128 #output:1


In [32]:
np.random.seed(seed)
env.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f94e65cb490>

In [33]:
episode_rewards = []
lr=0.01
for i in range(nepisode+1):
    done = False
    total_reward = 0
    state = env.reset()


    while not done: 
        probs_np=actor_np.forward(state)
        ath_np=actor_np.h.copy()
        a= np.random.choice([0,1],p=probs_np[0])   
        next_state, reward, done, info = env.step(a)
        total_reward += reward   
        
        y_next=critic_np.forward(next_state.reshape(1,-1) )
        th_next=critic_np.h.copy() 
        y=critic_np.forward(state.reshape(1,-1))
        th=critic_np.h.copy() 
        yhat=reward + (1-done)*gamma*y_next
        
        
        grads_critic_np=grads_manual_critic_np(yhat, y, th, th_next, state, next_state, critic_np.model['w2'], done, N=1)
        
        for k,v in grads_critic_np.items():
            critic_np.model[k] -=lr*grads_critic_np[k]
        
        advantage = yhat - y
        yt=np.eye(2)[a].reshape(1,-1)  
        grads_actor_np=grads_manual_np(probs_np, yt, ath_np, state, actor_np.model['w2'], advantage , N=1)

        for k,v in grads_actor_np.items():
            actor_np.model[k] -=lr*grads_actor_np[k]
        
        
        state = next_state
            
    episode_rewards.append(total_reward)
    if i%20==0:
        avg20=np.mean(episode_rewards[-20:])
        print(f'episode:{i} score:{total_reward} last_20:{avg20}')
        if avg20>=env.spec.reward_threshold:
            print(f'-------solved in {i} steps-------')
            break

episode:0 score:11.0 last_20:11.0
episode:20 score:23.0 last_20:18.6
episode:40 score:8.0 last_20:19.95
episode:60 score:9.0 last_20:17.0
episode:80 score:16.0 last_20:26.7
episode:100 score:16.0 last_20:24.25
episode:120 score:12.0 last_20:23.7
episode:140 score:18.0 last_20:28.45
episode:160 score:49.0 last_20:29.45
episode:180 score:60.0 last_20:34.45
episode:200 score:39.0 last_20:39.15
episode:220 score:66.0 last_20:44.95
episode:240 score:21.0 last_20:56.7
episode:260 score:36.0 last_20:62.4
episode:280 score:49.0 last_20:65.15
episode:300 score:45.0 last_20:56.15
episode:320 score:57.0 last_20:85.95
episode:340 score:33.0 last_20:60.5
episode:360 score:62.0 last_20:46.8
episode:380 score:37.0 last_20:46.6
episode:400 score:39.0 last_20:29.4
episode:420 score:119.0 last_20:50.75
episode:440 score:70.0 last_20:70.2
episode:460 score:76.0 last_20:77.65
episode:480 score:54.0 last_20:61.5
episode:500 score:55.0 last_20:64.9


In [34]:
avg_r=evaluate_model_np(env, actor_np)
print('avg_r',avg_r)

avg_r 75.35


### torch and np grad check starts (worked good)

In [35]:
def grad_check(grad_auto, grad_manual):
    grad_auto=grad_auto.ravel()
    grad_manual=grad_manual.ravel()
    
    diff=(grad_auto -  grad_manual)
    
    er=np.linalg.norm(diff) / ( np.linalg.norm(grad_auto)+np.linalg.norm(grad_manual) )
    
    return er

In [36]:
actor=NN_Open_Torch(state_dim, 128, n_actions)
critic=NN_Open_Torch(state_dim, 128, 1) 

with torch.no_grad():
    actor.tW1.data=sw1.clone()
    actor.tb1.data=sb1.clone()
    actor.tW2.data=sw2.clone()
    actor.tb2.data=sb2.clone() 
    
    critic.tW1.data=sw1c.clone()
    critic.tb1.data=sb1c.clone()
    critic.tW2.data=sw2c.clone()
    critic.tb2.data=sb2c.clone() 

In [37]:
actor_np=TwoNN(state_dim, 128, n_actions)
critic_np=TwoNN(state_dim, 128, 1) 

with torch.no_grad():
    actor_np.model['w1']=sw1.numpy().copy()
    actor_np.model['b1']=sb1.numpy().copy()
    actor_np.model['w2']=sw2.numpy().copy()
    actor_np.model['b2']=sb2.numpy().copy()
    
    critic_np.model['w1']=sw1c.numpy().copy()
    critic_np.model['b1']=sb1c.numpy().copy()
    critic_np.model['w2']=sw2c.numpy().copy()
    critic_np.model['b2']=sb2c.numpy().copy()

simple two layer neural network
creating nn: #input:4 #hidden:128 #output:2
simple two layer neural network
creating nn: #input:4 #hidden:128 #output:1


In [38]:
state=np.array([0.21576258, 0.6811431 , 0.22137272, 0.42765245])
next_state=np.array([0.21576258, 0.6811431 , 0.22137272, 0.42765245])

In [39]:
ath, probs=actor.forward(t(state)) 
action= torch.argmax(probs)   

In [40]:
th_next, y_next=critic.forward(t(next_state))
yhat=reward + (1-done)*gamma*y_next
th, y=critic.forward(t(state)) 
advantage = yhat - y
gradsm_critic=grads_manual_critic(yhat, y, th, th_next, state, next_state, critic.tW2, done)  

In [41]:
advantage = yhat - y
yt=np.eye(2)[action]
yt=yt.reshape(1,-1)
gradsm_actor=grads_manual(probs.detach(),  t(yt),  ath, state.reshape(1,-1), actor.tW2, advantage)   

  dz1=torch.tensor(dh)


In [42]:
probs_np=actor_np.forward(state.reshape(1,-1)) 
action_np = np.argmax(probs_np)

In [43]:
y_next=critic_np.forward(next_state.reshape(1,-1) )
th_next=critic_np.h.copy()
y=critic_np.forward(state.reshape(1,-1))
th=critic_np.h.copy()
yhat=reward + (1-done)*gamma*y_next
advantage_np = yhat - y

In [44]:
grads_np=grads_manual_np(probs_np, yt, actor_np.h, state, actor_np.model['w2'], advantage_np, N=1)

In [45]:
grads_critic_np=grads_manual_critic_np(yhat, y, th, th_next, state, next_state, critic_np.model['w2'], done, N=1)

In [46]:
er=grad_check(gradsm_critic['w2'].detach(), grads_critic_np['w2'])
er

7.756631261872353e-08

In [47]:
er=grad_check(gradsm_critic['b2'].detach(), grads_critic_np['b2'])
er

7.052705008686076e-08

In [48]:
er=grad_check(gradsm_critic['w1'].detach(), grads_critic_np['w1'])
er

6.88389814312544e-08

In [49]:
er=grad_check(gradsm_critic['b1'].detach(), grads_critic_np['b1'])
er

7.283015673474458e-08

In [50]:
er=grad_check(gradsm_actor['w2'].detach(), grads_np['w2'])
er

7.033124909571885e-08

In [51]:
er=grad_check(gradsm_actor['b2'].detach(), grads_np['b2'])
er

7.277702405605708e-08

In [52]:
er=grad_check(gradsm_actor['w1'].detach(), grads_np['w1'])
er

8.2177150716985e-08

In [53]:
er=grad_check(gradsm_actor['b1'].detach(), grads_np['b1'])
er

8.21771484481724e-08

### torch and np grad check end