In [4]:
import os
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import gym
import numpy as np
import matplotlib.pyplot as plt

In [5]:
class OU(object):
    def __init__(self,mu,sigma=0.2,theta=0.15,dt=1e-2,x0=None):
        self.mu=mu
        self.sigma=sigma
        self.theta=theta
        self.dt=dt
        self.x0=x0
        self.reset()
    
    def __call__(self):
        x=self.x_prev+self.theta*(self.mu-self.x_prev)*self.dt+self.sigma*np.sqrt(self.dt)*np.random.normal(size=self.mu.shape)
        self.x_prev=x
        return x
    
    def reset(self):
        if(self.x0 is not None):
            self.x_prev=self.x0
        else:
            self.x_prev=np.zeros_like(self.mu)

In [6]:
class Buffer(object):
    def __init__(self, buff_size, obs_shape, n_a):
        self.max_size=buff_size
        self.counter=0
        self.s_mem=np.zeros((self.max_size, *obs_shape))
        self.s_dash_mem=np.zeros((self.max_size, *obs_shape))
        self.a_mem=np.zeros((self.max_size, n_a))
        self.r_mem=np.zeros(self.max_size)
        self.d_mem=np.zeros(self.max_size,dtype=np.float32)
    
    def buff_update(self, s, s_dash, a, r, done):
        self.s_mem[self.counter%self.max_size]=s
        self.s_dash_mem[self.counter%self.max_size]=s_dash
        self.a_mem[self.counter%self.max_size]=a
        self.r_mem[self.counter%self.max_size]=r
        self.d_mem[self.counter%self.max_size]=(1-done)
        self.counter+=1
    
    def sample_buff(self, batch_size):
        max_mem=min(self.counter,self.max_size)
        batch=np.random.choice(max_mem, batch_size)
        s=self.s_mem[batch]
        s_dash=self.s_dash_mem[batch]
        a=self.a_mem[batch]
        r=self.r_mem[batch]
        d=self.d_mem[batch]
        return s,s_dash,a,r,d

In [7]:
class CriticNet(nn.Module):
    def __init__(self, beta, input_dims, z1_dims, z2_dims, n_a, name, checkpt_dir='Weights'):
        super(CriticNet, self).__init__()
        self.input_dims=input_dims
        self.z1_dims=z1_dims
        self.z2_dims=z2_dims
        self.n_a=n_a
        self.checkpoint_file=os.path.join(checkpt_dir, name+'_ddpg')
        self.z1=nn.Linear(*self.input_dims, self.z1_dims)                           ###
        z1l=1/np.sqrt(self.z1.weight.data.size()[0])
        T.nn.init.uniform_(self.z1.weight.data,-z1l,z1l)
        T.nn.init.uniform_(self.z1.bias.data,-z1l,z1l)
        self.zb1=nn.LayerNorm(self.z1_dims)
        
        self.z2=nn.Linear(self.z1_dims, self.z2_dims)
        z2l=1/np.sqrt(self.z2.weight.data.size()[0])
        T.nn.init.uniform_(self.z2.weight.data, -z2l, z2l)
        T.nn.init.uniform_(self.z2.bias.data, -z2l, z2l)
        self.zb2=nn.LayerNorm(self.z2_dims)
        
        self.action_value=nn.Linear(self.n_a,z2_dims)
        self.q=nn.Linear(self.z2_dims,1)
        T.nn.init.uniform_(self.q.weight.data, -0.003, 0.003)
        T.nn.init.uniform_(self.q.bias.data, -0.003, 0.003)
        self.optimizer=optim.Adam(self.parameters(), lr=beta)
    
    def forward(self, s, a):
        temp=self.z1(s)
        temp=self.zb1(temp)
        temp=F.relu(temp)
        temp=self.z2(temp)
        temp=self.zb2(temp)
        state_value=temp
        action_value=F.relu(self.action_value(a))
        state_action_value=F.relu(T.add(state_value, action_value))
        state_action_value=self.q(state_action_value)
        return state_action_value
    
    def save_w(self):
        print('Saving ')
        T.save(self.state_dict(),self.checkpoint_file)
    def load_w(self):
        print('Loading ')
        self.load_state_dict(T.load(self.checkpoint_file))

In [8]:
class ActorNet(nn.Module):
    def __init__(self, alpha, input_dims, z1_dims, z2_dims, n_a, name, checkpt_dir='Weights'):
        super(ActorNet, self).__init__()
        self.input_dims=input_dims
        self.z1_dims=z1_dims
        self.z2_dims=z2_dims
        self.n_a=n_a
        self.checkpoint_file=os.path.join(checkpt_dir, name+'_ddpg')
        self.z1=nn.Linear(*self.input_dims, self.z1_dims)                           ###
        z1l=1/np.sqrt(self.z1.weight.data.size()[0])
        T.nn.init.uniform_(self.z1.weight.data,-z1l,z1l)
        T.nn.init.uniform_(self.z1.bias.data,-z1l,z1l)
        self.zb1=nn.LayerNorm(self.z1_dims)
        
        self.z2=nn.Linear(self.z1_dims, self.z2_dims)
        z2l=1/np.sqrt(self.z2.weight.data.size()[0])
        T.nn.init.uniform_(self.z2.weight.data, -z2l, z2l)
        T.nn.init.uniform_(self.z2.bias.data, -z2l, z2l)
        self.zb2=nn.LayerNorm(self.z2_dims)
        
        self.mu=nn.Linear(z2_dims,self.n_a)
        T.nn.init.uniform_(self.mu.weight.data, -0.003, 0.003)
        T.nn.init.uniform_(self.mu.bias.data, -0.003, 0.003)
        self.optimizer=optim.Adam(self.parameters(), lr=alpha)
        
    def forward(self, s):
        temp=self.z1(s)
        temp=self.zb1(temp)
        temp=F.relu(temp)
        temp=self.z2(temp)
        temp=self.zb2(temp)
        temp=T.tanh(self.mu(F.relu(temp)))
        return temp
    
    def save_w(self):
        print('Saving ')
        T.save(self.state_dict(),self.checkpoint_file)
    def load_w(self):
        print('Loading ')
        self.load_state_dict(T.load(self.checkpoint_file))

In [25]:
class Agent(object):
    def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99, n_a=2, max_size=1000000, layer1_size=400, layer2_size=300, batch_size=64):
        self.gamma=gamma
        self.tau=tau
        self.memory=Buffer(max_size, input_dims, n_a)
        self.batch_size=batch_size
        self.actor=ActorNet(alpha, input_dims, layer1_size, layer2_size, n_a=n_a, name='Actor')
        self.target_actor=ActorNet(alpha, input_dims, layer1_size, layer2_size, n_a=n_a, name='TargetActor')
        self.critic=CriticNet(beta, input_dims, layer1_size, layer2_size, n_a=n_a, name='Critic')
        self.target_critic=CriticNet(beta, input_dims, layer1_size, layer2_size, n_a=n_a, name='TargetCritic')
        self.noise=OU(mu=np.zeros(n_a))
        self.update_network_parameters(tau=tau)
    
    def choose_action(self, obs):
        self.actor.eval()
        obs=T.tensor(obs, dtype=T.float32)
        mu=self.actor(obs)
        # print(self.noise())
        mu_prime=mu+T.tensor(self.noise(), dtype=T.float)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()
    
    def store_buff(self, s, s_dash, a, r, d):
        self.memory.buff_update(s, s_dash, a, r, d)
        
    def learn(self):
        if self.memory.counter<self.batch_size:
            return
        s,s_dash,a,r,d=self.memory.sample_buff(self.batch_size)
        r=T.tensor(r, dtype=T.float)
        d=T.tensor(d)
        s_dash=T.tensor(s_dash, dtype=T.float)
        a=T.tensor(a, dtype=T.float)
        s=T.tensor(s, dtype=T.float)
        self.target_actor.eval()
        self.critic.eval()
        target_actions=self.target_actor.forward(s_dash)
        critic_value=self.target_critic.forward(s_dash, target_actions)
        critic_value=self.critic.forward(s,a)
        target=[]
        for j in range(self.batch_size):
            target.append(r[j]+self.gamma*critic_value[j]*d[j])
        target=T.tensor(target)
        target=target.view(self.batch_size,1)
        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss=F.mse_loss(target,critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()
        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu=self.actor.forward(s)
        self.actor.train()
        actor_loss=-self.critic.forward(s,mu)
        actor_loss=T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()
        self.update_network_parameters()
        
    def update_network_parameters(self, tau=None):
        if tau is None:
            tau=self.tau
        actor_params=self.actor.named_parameters()
        critic_params=self.critic.named_parameters()
        target_actor_params=self.target_actor.named_parameters()
        target_critic_params=self.target_critic.named_parameters()
        
        actor_state_dict=dict(actor_params)
        critic_state_dict=dict(critic_params)
        target_actor_dict=dict(target_actor_params)
        target_critic_dict=dict(target_critic_params)
        
        for i in actor_state_dict:
            actor_state_dict[i]=tau*actor_state_dict[i].clone()+(1-tau)*target_actor_dict[i].clone()
        for i in critic_state_dict:
            critic_state_dict[i]=tau*critic_state_dict[i].clone()+(1-tau)*target_critic_dict[i].clone()
        self.target_actor.load_state_dict(actor_state_dict)
        self.target_critic.load_state_dict(critic_state_dict)
    
    def save_w(self):
        self.actor.save_w()
        self.critic.save_w()
        self.target_actor.save_w()
        self.target_critic.save_w()
        
    def load_w(self):
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic.load_checkpoint()

In [27]:
env=gym.make('Pendulum-v0')
print(env.action_space,env.observation_space)
agent=Agent(alpha=1e-4, beta=1e-3, input_dims=[3], tau=0.001, env=env, batch_size=64, layer1_size=40, layer2_size=30, n_a=1)
np.random.seed(0)
score=[]
for i in range(700):
    done=False
    sc=0
    s=env.reset()
    while not done:
        a=agent.choose_action(s)
        s_dash,r,done,info=env.step(a)
        agent.store_buff(s, s_dash, a, r, int(done))
        agent.learn()
        sc+=r
        s=s_dash
    score.append(sc)
    print(i,sc,np.mean(score[-100:]))
    # if(i%100==0):
    #     agent.save_w()
s=env.reset()
done=False
while not done:
    a=agent.choose_action(s)
    s_dash,r,done,info=env.step(a)
    agent.store_buff(s, s_dash, a, r, int(done))
    sc+=r
    s=s_dash
    env.render()
    time.sleep(0.01)
score.append(sc)
print(i,sc,np.mean(score[-100:]))

Box(-2.0, 2.0, (1,), float32) Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
0 -1257.619388462724 -1257.619388462724
1 -916.9092904107184 -1087.2643394367212
2 -1218.1588904800906 -1130.8958564511777
3 -1215.1051741130216 -1151.9481858666386
4 -1273.1064396920858 -1176.179836631728
5 -849.798559725056 -1121.7829571472828
6 -1242.0439792353495 -1138.9631031598637
7 -1173.0606999176644 -1143.2253027545887
8 -1334.77404298155 -1164.50849611314
9 -1444.9528334495958 -1192.5529298467854
10 -859.693482703714 -1162.2929801065063
11 -1568.2523138094502 -1196.1229245817515
12 -1669.7775951419976 -1232.557899240232
13 -885.7654250070452 -1207.7870082235756
14 -1113.2455534382464 -1201.4842445712204
15 -1338.6890889485362 -1210.059547344803
16 -1761.4956734080624 -1242.4969665249946
17 -1038.6954411312774 -1231.174659558677
18 -1371.4947945952367 -1238.5599298237591
19 -966.4001646268646 -1224.9519415639145
20 -934.37493890632 -1211.1149414373624
21 -1274.4165914567204 -1213.9922891655149
22 -1408

ValueError: expected sequence of length 3 at dim 1 (got 0)