In [14]:
import torch as t
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f
import numpy as np
import matplotlib.pyplot as plt
import gym
#import gym_cartpole_swingup  #pip install gym_cartpole_swingup

CUDA_LAUNCH_BLOCKING=1

###################################################
# Deep Deterministic Policy Gradient
###################################################

class Noise():
    def __init__(self,mu,sigma=0.15,theta=0.2,dt=1e-2,x0=None):
        self.theta = theta
        self.mu=mu
        self.sigma = sigma
        self.dt=dt
        self.x0=x0
        self.reset()
    
    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
        self.sigma * np.sqrt(self.dt)*np.random.normal(size=self.mu.shape)
        self.prev = x
        return x
    
    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)
        

class Actor(nn.Module):
    def __init__(self, dim_state, dim_action, fc1, fc2, alpha, max_action, checkpoint='actor' ):
        super(Actor, self).__init__()
        self.dim_state = dim_state
        self.dim_action = dim_action
        self.max_action = max_action
        
        self.linear1 = nn.Linear(dim_state, fc1)
        self.linear2 = nn.Linear(fc1, fc2)
        self.linear3 = nn.Linear(fc2, dim_action)
        
        self.device = t.device('cpu' if t.cuda.is_available() else 'cpu')
        self.optimizer = optim.Adam(self.parameters(), lr=alpha, weight_decay=1e-4)
        self.to(self.device)
        self.path = checkpoint
        
    def forward(self, state):
        output = state.to(self.device)
        
        output = f.relu(self.linear1(output))
        output = f.relu(self.linear2(output))
        output = t.tanh(self.linear3(output)) * self.max_action
        
        return output
    
    def saveCheckpoint(self):
        print ('...saving check point...')
        t.save(self.state_dict(), self.path)
    
    def loadCheckpoint(self):
        print('...loading check point')
        self.load_state_dict(t.load(self.path))
    
class Critic(nn.Module):   
    def __init__(self, dim_state, dim_action, fc1, fc2, alpha,checkpoint='critic'):
        super(Critic, self).__init__()
        self.dim_state = dim_state
        self.dim_action = dim_action
        self.path = checkpoint
        
        self.linear1 = nn.Linear(dim_state+ dim_action, fc1)
        self.linear2 = nn.Linear(fc1, fc2)
        self.linear3 = nn.Linear(fc2,1)
        self.optimizer = optim.Adam(self.parameters(), lr = alpha, weight_decay=1e-4)
        self.device = t.device('cpu' if t.cuda.is_available() else 'cpu')
        self.to(self.device)
        
    def forward(self, state, action):
        state = state.to(self.device)
        action = action.to(self.device)
        
        output = t.relu(self.linear1(t.cat((state, action), dim=1)))
        output = t.relu(self.linear2(output))
        output = self.linear3(output)
        
        return output
    
    def saveCheckpoint(self):
        print('...save checkout...')
        t.save(self.state_dict(), self.path)
    
    def loadCheckpoint(self):
        print ('...load checj point...')
        self.load_state_dict(t.load(self.path))
        
class ReplayBuffer():
    def __init__(self, maxm_size, dim_state, dim_action):
        self.counter = 0
        self.state_mem = np.zeros((maxm_size, dim_state))
        self.action_mem = np.zeros((maxm_size, dim_action))
        self.reward_mem = np.zeros(maxm_size)
        self.state_new_mem = np.zeros((maxm_size, dim_state))
        self.done_mem = np.zeros(maxm_size)
        self.maxm_size = maxm_size
        
    def store_Transaction(self, state, action, reward, state_new, done):
        index = self.counter % self.maxm_size
        
        self.state_mem[index] = state
        self.action_mem[index] = action
        self.reward_mem[index] = reward
        self.state_new_mem[index] =state_new
        self.done_mem[index] = 1.0 - done
        
        self.counter+=1
    
    def sample_batch(self, batch_size=126):
        maxm_size = min(self.counter, self.maxm_size)
        batch = np.random.choice(maxm_size, batch_size)
        
        state_batch = self.state_mem[batch]
        action_batch = self.action_mem[batch]
        reward_batch = self.reward_mem[batch]
        state_new_batch = self.state_new_mem[batch]
        done_batch = self.done_mem[batch]
        
        return state_batch, action_batch,reward_batch,state_new_batch,done_batch

class Agent():
    def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
                 max_size=1000000, fc1_dims=400, fc2_dims=300, 
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        self.noise = Noise(mu=np.zeros(n_actions))
        
        #         self, dim_state, dim_action, fc1, fc2, alpha, max_action, checkpoint='actor' ):
        self.actor = Actor(alpha, input_dims, fc1_dims, fc2_dims, dim_action=n_actions, name='actor')
        self.critic = Critic(beta, input_dims, fc1_dims, fc2_dims, dim_action=n_actions, name='critic')

        self.target_actor = Actor(alpha, input_dims, fc1_dims, fc2_dims, dim_action=n_actions, name='target_actor')

        self.target_critic = Critic(beta, input_dims, fc1_dims, fc2_dims, dim_action=n_actions, name='target_critic')

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        #self.actor.eval()
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu + T.tensor(self.noise(), dtype=T.float).to(self.actor.device)
        #self.actor.train()

        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        #if self.memory.mem_cntr < self.batch_size:
            #return
        
        if self.buffer.counter > self.batch_size:
            states, actions, rewards, states_, done = \
                    self.memory.sample_batch(self.batch_size)

            states = T.tensor(states, dtype=T.float).to(self.actor.device)
            states_ = T.tensor(states_, dtype=T.float).to(self.actor.device)
            actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
            rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
            done = T.tensor(done).to(self.actor.device)

            target_actions = self.target_actor.forward(states_)
            critic_value_ = self.target_critic.forward(states_, target_actions)
            critic_value = self.critic.forward(states, actions)

            critic_value_[done] = 0.0
            critic_value_ = critic_value_.view(-1)

            target = rewards + self.gamma*critic_value_
            target = target.view(self.batch_size, 1)

            self.critic.optimizer.zero_grad()
            critic_loss = F.mse_loss(target, critic_value)
            critic_loss.backward()
            self.critic.optimizer.step()

            self.actor.optimizer.zero_grad()
            actor_loss = -self.critic.forward(states, self.actor.forward(states))
            actor_loss = T.mean(actor_loss)
            actor_loss.backward()
            self.actor.optimizer.step()

            self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_state_dict = dict(target_critic_params)
        target_actor_state_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                                (1-tau)*target_critic_state_dict[name].clone()

        for name in actor_state_dict:
             actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                                 (1-tau)*target_actor_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
    

if __name__ == '__main__':
    env = gym.make('Pendulum-v1')
    agent = Agent(alpha=0.0001, beta=0.001, input_dims=env.observation_space.shape[0], tau=0.001,
                    batch_size=64, fc1_dims=400, fc2_dims=300, n_actions=env.action_space.shape[0])
    n_games = 1000

    score_history = []
    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        agent.noise.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()
            score += reward
            observation = observation_
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        print('episode ', i, 'score %.1f' % score,
                'average score %.1f' % avg_score)
    x = [i+1 for i in range(n_games)]
    plot_learning_curve(x, score_history, figure_file)

        

TypeError: __init__() got multiple values for argument 'dim_action'