<a href="https://colab.research.google.com/github/milad88/Thesis/blob/master/TRPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from collections import namedtuple
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gc
import itertools
from math import isnan
EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
batch_size = 32


def var_shape(x):
    try:
        out = [k for k in x.shape]
    except:
        out = [1]
    assert all(isinstance(a, int) for a in out), \
        "shape function assumes that shape is fully known"
    return out

def flatgrad(loss, var_list):
    grads = tf.gradients(loss, var_list)
    grads = [0. + 1e-16 if g is None else g for g in grads]
    grads = np.concatenate([np.reshape(grad, [np.size(v)]) for (v, grad) in zip(var_list, grads)], 0)
    return grads


def gauss_log_prob(mu, logstd, x):
    var = tf.exp(2 * logstd)
    gp = -tf.square(x - mu) / (2 * var) - .5 * tf.log(tf.constant(2 * np.pi)) - logstd
    return tf.reduce_sum(gp, [1])


def gauss_prob(mu, std, xs):
    var = std ** 2
    return tf.exp(-tf.square(xs - mu) / (2 * var)) / (tf.sqrt(tf.constant(2 * np.pi)) * std)

# KL divergence between two paramaterized guassian distributions
def gauss_KL(mu1, std1, mu2, std2):
    var1 = std1 ** 2
    var2 = std2 ** 2

    kl = tf.reduce_sum(tf.log(std2) - tf.log(std1) + (var1 + tf.square(mu1 - mu2)) / (2 * var2) - 0.5)

    return kl


def gauss_ent(mu, std):
    h = tf.reduce_sum(tf.log(std) + tf.constant(0.5 * np.log(2 * np.pi * np.e), tf.float32))
    return h


def hessian_vec_bk(ys, xs, vs, grads=None):
    """Implements Hessian vector product using backward on backward AD.
  Args:
    ys: Loss function.
    xs: Weights, list of tensors.
    vs: List of tensors to multiply, for each weight tensor.
  Returns:
    Hv: Hessian vector product, same size, same shape as xs.
  """
    # Validate the input

    if type(xs) == list:
        if len(vs) != len(xs):
            raise ValueError("xs and vs must have the same length.")

    if grads is None:
        grads = tf.gradients(ys, xs, gate_gradients=True)
    return tf.gradients(grads, xs, vs, gate_gradients=True)



def plot_stats(stats):
    fig11 = plt.figure(figsize=(10, 5))

    plt.plot(np.ravel(stats))
    plt.xlabel("Episode")
    plt.ylabel("loss per episode")
    plt.show(fig11)


def plot_episode_stats(stats, smoothing_window=10, noshow=False):
    # Plot the episode reward over time
    fig2 = plt.figure(figsize=(10, 5))
    #rewards_smoothed = pd.Series(stats.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    plt.plot(stats.episode_rewards)
    plt.xlabel("Episode")
    plt.ylabel("Episode Reward (Smoothed)")
    plt.title("Episode Reward over Time (Smoothed over window size {})".format(smoothing_window))
    fig2.savefig('reward.png')
    if noshow:
        plt.close(fig2)
    else:
        plt.show(fig2)


class SetFromFlat(object):

    def __init__(self, session, var_list):
        self.session = session
        self.var_list = var_list
        self.shapes = map(var_shape, var_list)
        total_size = sum(np.prod(shape) for shape in self.shapes)
        self.theta = theta = tf.placeholder(tf.float32, [total_size],name="theta_sff")
        start = 0
        assigns = []
        for (shape, v) in zip(self.shapes, self.var_list):
            size = np.prod(shape)
            assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape)))
            start += size

        self.op = assigns

    def __call__(self, theta):
        self.session.run(self.op, feed_dict={self.theta: theta})


class ReplayBuffer:
    # Replay buffer for experience replay. Stores transitions.
    def __init__(self):
        self._data = namedtuple("ReplayBuffer", ["states", "actions", "next_states", "rewards", "dones"])
        self._data = self._data(states=[], actions=[], next_states=[], rewards=[], dones=[])
        self.position = 0
        self.capacity = batch_size * 4

    def add_transition(self, state, action, next_state, reward, done):

        if np.array(state).shape == (3, 1):
            state = list(itertools.chain.from_iterable(state))

        if np.array(next_state).shape == (3, 1):
            next_state = list(itertools.chain.from_iterable(next_state))

        if len(self._data.states) < self.capacity:
            self._data.states.append(None)
            self._data.actions.append(None)
            self._data.next_states.append(None)
            self._data.rewards.append(None)
            self._data.dones.append(None)

        self._data.states[self.position] = state
        self._data.actions[self.position] = action
        self._data.next_states[self.position] = next_state
        self._data.rewards[self.position] = reward
        self._data.dones[self.position] = done
        self.position = (self.position + 1) % self.capacity

    def next_batch(self, batch_size):

        self.capacity = batch_size * 4
        if batch_size >= len(self._data.states):
            return np.array(self._data.states), np.array(self._data.actions), np.array(
                self._data.next_states), np.array(self._data.rewards), np.array(self._data.dones)
        batch_indices = np.random.choice(len(self._data.states), batch_size)
        batch_states = np.array([self._data.states[i] for i in batch_indices])
        batch_actions = np.array([self._data.actions[i] for i in batch_indices])
        batch_next_states = np.array([self._data.next_states[i] for i in batch_indices])
        batch_rewards = np.array([self._data.rewards[i] for i in batch_indices])
        batch_dones = np.array([self._data.dones[i] for i in batch_indices])

        return batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones

    #        if self.transition_size() > 5*batch_size:
    #           self._data.states = self._data.states[-5*batch_size:]
    #          self._data.actions = self._data.actions[-5*batch_size:]
    #         self._data.next_states = self._data.next_states[-5*batch_size:]
    #       self._data.dones = self._data.dones[-5*batch_size:]
    #      self._data.rewards = self._data.rewards[-5*batch_size:]


    def transition_size(self):
        return len(self._data.states)


In [0]:
import itertools
action_bound = 2.0

class Actor_Net():
  def __init__(self, num_actions, action_dim, name, action_bound, state_dim, learning_rate=0.01, batch_size=32):
        # super().__init__(num_actions, name)
        self.learning_rate = learning_rate
        self.action_bound = action_bound
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = batch_size
        self.name = name
        self._build_model(num_actions)

  def _build_model(self, num_actions):

        self.inp = tf.placeholder(shape=[None, self.state_dim], dtype=tf.float32, name="states")
        self.actions = tf.placeholder(shape=[None, self.action_dim], dtype=tf.float32, name="actions")

        self.advantage = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="advantages")
        self.old_mean = tf.placeholder(dtype=tf.float32, name="old_mean")
        self.old_sigma = tf.placeholder(dtype=tf.float32, name="old_sigma")
        self.p = tf.placeholder(tf.float32, name="p")  # the vector

        self.inpW = tf.Variable(tf.random_uniform([self.state_dim, 16], -0.5, 0.5))
        self.inpB = tf.Variable(tf.constant(0.1, shape=[16]))
        self.h1 = tf.nn.relu(tf.matmul(self.inp, self.inpW) + self.inpB)

        self.h2W = tf.Variable(tf.random_uniform([16, 32], -0.5, 0.5))
        self.h2B = tf.Variable(tf.constant(0.1, shape=[32]))
        self.h2 = tf.nn.relu(tf.matmul(self.h1, self.h2W) + self.h2B)

        self.h3W = tf.Variable(tf.random_uniform([32, 16], -0.5, 0.5))
        self.h3B = tf.Variable(tf.constant(0.1, shape=[16]))
        self.h3 = tf.nn.relu(tf.matmul(self.h2, self.h3W) + self.h3B)

        self.h4W = tf.Variable(tf.random_uniform([16, self.action_dim], -0.5, 0.5))

        self.outB = tf.Variable(tf.constant(0.01, shape=[self.action_dim]))

        self.net_params = tf.trainable_variables(scope=self.name)

        self.mean = tf.nn.tanh(tf.matmul(self.h3, self.h4W) + self.outB)
        self.mean = self.mean * self.action_bound

        self.sigma = tf.nn.relu(tf.matmul(self.h3, self.h4W) + self.outB)

        self.net_params = tf.trainable_variables(scope=self.name)

        self.sigma = tf.clip_by_value(t=self.sigma,
                                      clip_value_min=0,
                                      clip_value_max=tf.sqrt(self.action_bound))
        self.scaled_out = tf.truncated_normal(mean=self.mean, stddev=self.sigma, shape=[self.action_dim])
        self.prev_mean = 0.
        self.prev_sigma = 1.
        #self.cost = gauss_KL(self.mean, self.sigma, self.prev_mean, self.prev_sigma)
        self.cost = tf.reduce_sum((gauss_prob(self.mean, self.sigma, self.scaled_out) * self.advantage) /
                        (gauss_prob(self.prev_mean, self.prev_sigma, self.scaled_out)) + 1e-10)

        self.grads = tf.gradients(self.cost, self.net_params)

        self.shapes = [v.shape.as_list() for v in self.net_params]
        #self.size_theta = np.sum([np.prod(shape) for shape in self.shapes])

        tangents = []
        start = 0
        for shape in self.shapes:
            size = np.prod(shape)
            tangents.append(tf.reshape(self.p[start:start + size], shape))
            start += size
        # self.gvp = tf.add_n([tf.reduce_sum(g * tangent) for (g, tangent) in zip(grads, tangents)])
        self.gvp = [(tf.reduce_sum(g * t)) for (g, t) in zip(self.grads, tangents)]
        # 2nd gradient of KL w/ itself * tangent

        self.hvp = flatgrad(self.gvp, self.net_params)

        self.saver = tf.train.Saver()

  def conjugate_gradient(self, f_Ax, b, cg_iters=5, residual_tol=1e-5):
        p = b.copy()
        r = b.copy()
        x = np.zeros_like(b)
        rdotr = r.dot(r)
        for i in range(cg_iters):
            z = f_Ax(p)
            v = rdotr / p.dot(z) # p.dot(z)  # stepdir size?? =ak of wikipedia
            x += np.dot(v,p)
            # x += v * p  # new parameters??
            r -= z.dot(v)  # new gradient??
            newrdotr = np.dot(r, r)  #
            if newrdotr < residual_tol:
                break

            mu = newrdotr / rdotr  # Bi of wikipedia
            rdotr = newrdotr
            p = r + mu * p

        return x

  def linesearch(self, f, x, fullstepdir, expected_improve_rate, max_iter=5):
        '''
        :param f: loss fuction
        :param x: parameters
        :param fullstepdir: value returned by conjugate gradient * Hg-1 ... delta kappa estimated by the conjugate gradient
        :param expected_improve_rate:
        :return:
        '''
        j = max_iter
        accept_ratio = .1
        max_backtracks = 10

        fval = f(x)
        for (_n_backtracks, stepdirfrac) in enumerate(.5 ** np.arange(max_backtracks)):
            j -= 1
            xnew = x + (stepdirfrac * fullstepdir)
            newfval = f(xnew)
            actual_improve = fval - newfval
            expected_improve = expected_improve_rate * stepdirfrac
            ratio = actual_improve / expected_improve
            if ratio > accept_ratio and actual_improve > 0 or j == 0:
                return xnew

        return x


  def predict(self, sess, states):
        """
        Args:
          sess: TensorFlow session
          states: array of states for which we want to predict the actions.
        Returns:
          The prediction of the output tensor.
        """
        if states[-1].shape == (1,):
            if len(states) == 3:
                states = np.array(np.ravel(states))
            else:
                states = states[:-3]

        states = np.atleast_2d(states)
        np.reshape(states, [len(states), 3])
        # print(states.shape)
        feed = {self.inp: states}
        prediction = sess.run(self.scaled_out, feed)

        return prediction

    # action gradient to be fed

  def update(self, sess, states, actions, advantages, summary, first):
        """
        Updates the weights of the neural network, based on its targets, its
        predictions, its loss and its optimizer.

        Args:
          sess: TensorFlow session.
          states: [current_state] or states of batch
          actions: [current_action] or actions of batch
          targets: [current_target] or targets of batch
        """
        states = np.atleast_2d(states)
        states = np.reshape(states, [len(states), 3])

        #feed_dict = {self.inp: states, self.actions: actions}
        #mean, sigma, scaled_out = sess.run((self.mean, self.sigma, self.scaled_out), feed_dict)

        feed_dict = {self.inp: states, self.actions: actions,
                     self.old_mean: self.prev_mean, self.old_sigma: self.prev_sigma,
                     self.advantage: advantages}

        self.prev_mean, self.prev_sigma,_, _, net, grads = sess.run(
                    (self.mean, self.sigma, self.scaled_out, self.cost, self.net_params, self.grads), feed_dict)

        grads = np.concatenate([np.reshape(grad, [np.size(v)]) for (v, grad) in zip(net, grads)], 0)
        grads = np.where(np.isnan(grads), 1e-16, grads)

        #self.sff = SetFromFlat(sess, net)

        def get_hvp(p):
            feed_dict[self.p] = p  # np.reshape(p, [np.size(p),1])
            gvp = sess.run(self.gvp, feed_dict)
            gvp = np.where(np.isnan(gvp), 0, gvp)
            #with tf.control_dependencies(self.gvp):
            a = tf.gradients(gvp, self.net_params)
            a = [0 if k is None else  k for k in a]
#            a = np.concatenate([np.reshape(grad, [np.size(v)]) for (v, grad) in zip(net, a)], 0)

            return np.sum((1e-3 * np.reshape(p, [np.size(p), 1])) + np.reshape(a, [1, np.size(a)]), 1)

            # return np.array(flatgrad(self.gvp, self.net_params))# + 1e-3 * p
        
        self.cg = self.conjugate_gradient(get_hvp, -grads)
        self.stepdir = np.sqrt(2 * self.learning_rate / (np.transpose(grads) * self.cg) + 1e-16) * self.cg

        def loss(th):
            #th = np.concatenate([np.reshape(g,[-1]) for g in th],0)
            #self.sff(th)
            start = 0
            i = 0
            for (shape, v) in zip(self.shapes, self.net_params):
                size = np.prod(shape)
                self.net_params[i] = tf.reshape(th[start:start + size], shape)
                start += size
                i += 1
            # surrogate loss: policy gradient loss
            return sess.run(self.cost, feed_dict)

        stepsize = self.linesearch(loss, np.concatenate([np.reshape(g,[-1]) for g in net],0), self.stepdir, self.cg.dot(self.stepdir))
        #del self.sff
        # self.net_params = sess.run(tf.assign(self.net_params, self.net_params + self.stepdir))#+ self.stepdir)# * stepsize
        #+ self.stepdir)# * stepsize
        for i, v in enumerate(self.net_params):
            try:
                for k in range(len(v)):
                    self.net_params[i][k] += self.stepdir[i][k] * self.net_params[i][k]
            except:
                self.net_params[i] += self.stepdir[i] * self.net_params[i]
     

In [0]:
import tensorflow as tf
import numpy as np

ac_dim = 1

class Critic_Net():
  def __init__(self, num_actions, action_dim, name, action_bound, state_dim, learning_rate=0.01):
        self.learning_rate = learning_rate
        self.name = name
        self.action_bound = action_bound
        self.action_dim = action_dim
        self.state_dim = state_dim
        self._build_model(num_actions)

  def _build_model(self, num_actions):

        self.action = tf.placeholder(dtype=tf.float32, shape=[None, self.action_dim])
        self.inp = tf.placeholder(shape=[None, self.state_dim], dtype=tf.float32)

        self.inp_act = tf.concat([self.inp, self.action], 1)

        self.inpW = tf.Variable(tf.random_uniform([self.state_dim +  self.action_dim, 16], -0.5, 0.5))
        self.inpB = tf.Variable(tf.constant(0.1, shape=[16]))
        self.h1 = tf.nn.relu(tf.matmul(self.inp_act, self.inpW) + self.inpB)

        self.h2W = tf.Variable(tf.random_uniform([16, 32], -0.5, 0.5))
        self.h2B = tf.Variable(tf.constant(0.1, shape=[32]))
        self.h2 = tf.nn.relu(tf.matmul(self.h1, self.h2W) + self.h2B)

        self.h3W = tf.Variable(tf.random_uniform([32, 16], -0.5, 0.5))
        self.h3B = tf.Variable(tf.constant(0.1, shape=[16]))
        self.h3 = tf.nn.relu(tf.matmul(self.h2, self.h3W) + self.h3B)

        self.h4W = tf.Variable(tf.random_uniform([16, self.action_dim], -0.5, 0.5))

        self.outB = tf.Variable(tf.constant(0.01, shape=[self.action_dim]))
        self.outputs = tf.nn.relu(tf.matmul(self.h3, self.h4W) + self.outB)

        # self.out = np.rint(self.out) # round to 0 or 1 as out
        self.y_ = tf.placeholder(shape=[None, self.action_dim], dtype=tf.float32)

        #        Q_grad = K.gradients(Q_pred, actions)

        self.trainer = tf.train.AdamOptimizer(self.learning_rate)
        self.loss = tf.reduce_mean(tf.squared_difference(self.outputs, self.y_))

        self.step = self.trainer.minimize(self.loss)

        self.action_grads = tf.gradients(self.outputs, self.action)

        self.saver = tf.train.Saver()
  def predict(self, sess, states, actions):
        """
        Args:
          sess: TensorFlow session
          states: array of states for which we want to predict the actions.
        Returns:
          The prediction of the output tensor.
        """

        states = np.atleast_2d(states)
        states = np.reshape(states, [len(states), 3])

        feed = {self.inp: states, self.action: actions}
        prediction = sess.run(self.outputs, feed)

        return prediction

  def update(self, sess, states, actions, targets, summary):
        """
        Updates the weights of the neural network, based on its targets, its
        predictions, its loss and its optimizer.

        Args:
          sess: TensorFlow session.
          states: [current_state] or states of batch
          actions: [current_action] or actions of batch
          targets: [current_target] or targets of batch
        """

        #pred = self.predict(sess, states, actions)


        states = np.atleast_2d(states)
        states = np.reshape(states, [len(states), 3])
        return sess.run(self.loss, feed_dict={self.inp: states,self.action: actions, self.y_: targets})


  def action_gradients(self, sess, states, actions):
        return sess.run(self.action_grads, feed_dict={
            self.inp: states,
            self.action: actions})


class Critic_Target_Network(Critic_Net):
    """
    Slowly updated target network. Tau indicates the speed of adjustment. If 1,
    it is always set to the values of its associate.
    """

    def __init__(self, num_actions, action_dim, name, action_bound, state_dim, learning_rate=0.001, tau=0.001):
        super().__init__(num_actions, action_dim, name, action_bound, state_dim, learning_rate)
        self.tau = tau
        self._associate = self._register_associate()

    def _register_associate(self):

        critic_vars =tf.trainable_variables("critic")#"critic"
        target_vars =tf.trainable_variables("critic_target")#"critic_target"

        op_holder = []
        for idx, var in enumerate(target_vars):  # // is to retun un integer
            op_holder.append(var.assign(
                (critic_vars[idx].value() * self.tau) + ((1 - self.tau) * var.value())))
        #return target_vars.assign((critic_vars * self.tau )+((1 - self.tau) * target_vars))
        return op_holder

    def update(self, sess):
        for op in self._associate:
            sess.run(op)


In [0]:
import gym
from gym import spaces
from gym.utils import seeding

from os import path
from PIL import Image


class PendulumEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 30
    }

    def __init__(self, reward_function=None):
        self.max_speed = 8
        self.max_torque = 2.
        self.dt = .05
        self.viewer = None

        high = np.array([1., 1., self.max_speed])
        self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,))
        self.observation_space = spaces.Box(low=-high, high=high)

        self._seed()

        if reward_function is None:
            def reward(pendulum):
                return 1 if -0.1 <= angle_normalize(pendulum.state[0]) <= 0.1 else 0

            self.reward = reward
        else:
            self.reward = reward_function

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, u):
        th, thdot = self.state

        g = 10.
        m = 1.
        l = 1.
        dt = self.dt

        u = np.clip(u, -self.max_torque, self.max_torque)[0]

        self.last_u = u
        newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
        newth = th + newthdot * dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
        reward = self.reward(self)

        self.state = np.array([newth, newthdot])

        return self._get_obs(), reward, False, {}

    def reset(self):
        high = np.array([np.pi, 1])
        self.state = self.np_random.uniform(low=-high, high=high)
        self.last_u = None
        return self._get_obs()

    def _get_obs(self):
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot])

    def render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)
            rod = rendering.make_capsule(1, .2)
            rod.set_color(.8, .3, .3)
            self.pole_transform = rendering.Transform()
            rod.add_attr(self.pole_transform)
            self.viewer.add_geom(rod)
            axle = rendering.make_circle(.05)
            axle.set_color(0, 0, 0)
            self.viewer.add_geom(axle)
            fname = path.join(path.dirname(__file__), "assets/clockwise.png")
            self.img = rendering.Image(fname, 1., 1.)
            self.imgtrans = rendering.Transform()
            self.img.add_attr(self.imgtrans)

        self.viewer.add_onetime(self.img)
        self.pole_transform.set_rotation(self.state[0] + np.pi / 2)
        if self.last_u:
            self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')


def angle_normalize(x):
    if np.isnan(x):
        print("ISNOTNUMBER")
    val = (((x + np.pi) % (2 * np.pi)) - np.pi)
    return val


In [0]:
import sys


if __name__ == "__main__":
  
    print("start")
    env = PendulumEnv()
    action_space = np.arange(-2, 2.01, 0.01)
    num_actions = len(action_space)
    action_dim = 1
    action_bound = env.action_space.high
    state_dim = 3
    batch_size = 32
    learning_rate = 0.001
    delta = 0.01
    discount_factor = 0.99
    num_episodes = 500
    len_episode = 100
    epsilon = 0.1
    load = False
    if not load:

        g_stat = []

        config = tf.ConfigProto()
        
        config.gpu_options.allow_growth = True

        with tf.Session(config=config)as sess:

            with tf.name_scope("actor"):
                actor = Actor_Net(num_actions, action_dim, "actor", action_bound, state_dim,
                                  learning_rate=learning_rate)

            with tf.name_scope("critic"):
                critic = Critic_Net(num_actions, action_dim, "critic", action_bound, state_dim,
                                    learning_rate=learning_rate)


            writer = tf.summary.FileWriter('./TRPO/TRPO_loss', sess.graph)
            #summ_critic_loss = tf.summary.scalar('loss_critic', critic.get_loss())

            sess.run(tf.global_variables_initializer())
            g = sess.graph
            """
            Trpo
            """
            loss_episodes = []
            stats = EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))
            buffer = ReplayBuffer()
            nTimes_actions = np.ones(num_actions)

            for i_episode in range(num_episodes):
                loss = []
                # Also print reward for last episode
                last_reward = stats.episode_rewards[i_episode - 1]
                print("\rEpisode {}/{} ({})".format(i_episode + 1, num_episodes, last_reward), end="")
                sys.stdout.flush()

                done = False
                i = 0
                g_r = 0

                observation = env.reset()

                while not done and i < len_episode:

                    first = True
                    if i != 0:
                        first = False
                        sess.graph.clear_collection("theta_sff")
                    loss = []
                    i += 1
                    old_observation = observation
                    action = np.take(actor.predict(sess, observation), [0])

                   # env.render()
                    observation, reward, done, info = env.step([action])

                    buffer.add_transition(old_observation, action, observation, reward, done)
                    s, a, ns, r, d = buffer.next_batch(batch_size)

                    pred_actions = actor.predict(sess, ns)

                    q_values = critic.predict(sess, ns, pred_actions)

                    r = np.reshape(r,[-1,1])
                    y = q_values - r

                    g_r += reward
                    g_stat.append(int(np.round(g_r)))

                    loss_critic = critic.update(sess, s, a, y, None)

                    loss.append(loss_critic)

                    sys.stdout.flush()

                    actor.update(sess, s, a, y, None, first)

                    stats.episode_rewards[i_episode] += reward

                    g_stat.append(int(np.round(g_r)))

                    #sess.graph.as_default()

                l = sum(loss)
                summ_critic_loss = tf.Summary(value=[tf.Summary.Value(tag="loss_critic",
                                                                      simple_value=l)])
                writer.add_summary(summ_critic_loss, i_episode)

                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag="episode_rewards",
                                                                     simple_value=stats.episode_rewards[i_episode])]), i_episode)


                writer.flush()
                loss_episodes.append(l)

                stats.episode_lengths[i_episode] = i

                gc.collect()
                #tf.reset_default_graph()

                #tf.get_default_graph().finalize()
            plot_episode_stats(stats)
            plot_stats(loss_episodes)
            # return stats, loss_episodes


start
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode 1/500 (0.0)

