# In [2]:
import os 
import numpy as np 
import tensorflow as tf 



# In [None]:
class OUActionNoise(object):
    """Stateful, temporally correlated noise source (Ornstein-Uhlenbeck style update).

    Every call advances the internal state by one step of size ``dt``: the
    state is pulled toward ``mu`` at rate ``theta`` and perturbed by Gaussian
    noise scaled by ``sigma * sqrt(dt)``.

    Args:
        mu: Array the state is drawn toward; its ``shape`` fixes the shape of
            every noise sample.
        sigma: Scale of the Gaussian perturbation.
        theta: Rate at which the state is pulled back toward ``mu``.
        dt: Step size of one update.
        x0: Optional starting state. When ``None`` the state starts at zeros
            shaped like ``mu``.
    """

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        # Establishes self.x_prev so the first call has a state to step from.
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state.

        Being callable lets an instance be used as ``noise()`` directly.
        """
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        step = self.x_prev + drift + diffusion
        # Remember the new state; the next call continues from here.
        self.x_prev = step
        return step

    def reset(self):
        """Return the state to ``x0``, or to zeros shaped like ``mu`` if ``x0`` is ``None``."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0
    

class ReplayBuffer():
    """Fixed-capacity circular store of experience transitions.

    Transitions are written at ``mem_cntr % mem_size``, so once the buffer is
    full the oldest entries are overwritten.

    Args:
        max_size: Number of transitions the buffer can hold.
        input_shape: Shape of a single state, either an int or a sequence of
            ints.
        n_actions: Number of components in a single action.
    """

    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        # Accept a bare int as well as a sequence of dimensions.
        state_dims = (input_shape,) if np.isscalar(input_shape) else tuple(input_shape)
        # One row per transition. Multiplying mem_size by the shape (as the
        # previous code did) produces a flat array for an int shape and a
        # tuple repeated mem_size times for a tuple shape, neither of which
        # can hold one state per index.
        self.state_memory = np.zeros((self.mem_size, *state_dims))
        self.new_state_memory = np.zeros((self.mem_size, *state_dims))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, next_state, done):
        """Write one transition into the next slot, wrapping around when full.

        ``done`` is stored as ``1 - int(done)``: 0.0 for a terminal
        transition and 1.0 otherwise.
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = next_state
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = 1 - int(done)
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly from the filled slots.

        Indices are drawn with ``np.random.choice`` using its default
        settings, i.e. with replacement, so a batch may repeat a transition.

        Returns:
            Tuple ``(states, actions, rewards, new_states, terminal)`` of
            arrays whose first dimension is ``batch_size``.

        Raises:
            ValueError: From ``np.random.choice`` if nothing has been stored.
        """
        # Only sample from slots that have actually been written.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, new_states, terminal


class Actor(object):
    """Actor network wrapper built with the TensorFlow 1.x graph API.

    The constructor stores the hyperparameters, builds the graph inside a
    variable scope named ``name``, collects that scope's trainable variables,
    and sets up an Adam update that applies the batch-averaged gradient of
    ``self.mu`` with respect to those variables, weighted by the negated
    ``self.action_gradient`` placeholder.

    NOTE(review): ``build_network`` is unfinished. It creates the two
    placeholders but not the layers that define ``self.mu``, so constructing
    an ``Actor`` still fails until those layers are added.

    Args:
        lr: Learning rate passed to ``tf.train.AdamOptimizer``.
        n_actions: Number of action components.
        name: Variable-scope name; also the prefix of the checkpoint file.
        input_dims: Sequence of state dimensions (unpacked into the input
            placeholder's shape).
        fc1_dims: Stored as ``self.fc1_dims``; not used by the visible code.
        fc2_dims: Stored as ``self.fc2_dims``; not used by the visible code.
        action_bound: Stored as ``self.action_bound``; not used by the
            visible code.
        batch_size: Divisor used to average the actor gradients.
        chkpt_dir: Directory in which the checkpoint file path is formed.
    """

    def __init__(self, lr, n_actions, name, input_dims, fc1_dims, fc2_dims, action_bound,
                 batch_size=64, chkpt_dir='tmp/ddpg'):
        self.lr = lr
        self.n_actions = n_actions
        self.name = name
        # build_network() reads self.input_dims, so it must be stored before
        # the graph is built (it was previously never assigned).
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.batch_size = batch_size
        self.action_bound = action_bound
        self.chkpt_dir = chkpt_dir
        self.build_network()
        self.params = tf.trainable_variables(scope=self.name)
        self.saver = tf.train.Saver()
        # '.ckpt' extension; the previous literal had a comma in place of the dot.
        self.checkpoint_file = os.path.join(chkpt_dir, name + '_ddpg.ckpt')

        # Feeding -action_gradient as grad_ys weights d(mu)/d(params) by the
        # negated externally supplied gradient.
        self.unnormalized_actor_gradients = tf.gradients(self.mu, self.params, -self.action_gradient)

        # Average over the batch.
        self.actor_gradients = [tf.div(grad, self.batch_size)
                                for grad in self.unnormalized_actor_gradients]
        self.optimize = tf.train.AdamOptimizer(self.lr).apply_gradients(
            zip(self.actor_gradients, self.params))

    def build_network(self):
        """Create the graph inputs inside this actor's variable scope."""
        with tf.variable_scope(self.name):
            self.input = tf.placeholder(tf.float32, shape=[None, *self.input_dims], name='inputs')
            # Must match the shape of self.mu, since it is passed as grad_ys
            # to tf.gradients in __init__ (assumed [batch, n_actions]).
            self.action_gradient = tf.placeholder(tf.float32, shape=[None, self.n_actions],
                                                  name='gradients')
            # TODO: define the layers that produce self.mu; __init__ reads
            # self.mu and no variables exist in this scope until they are added.

     

