In [None]:
import os
import random
import gym
import pylab
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2
import threading
from threading import Thread, Lock
import time
import tensorflow_probability as tfp
from typing import Any, List, Sequence, Tuple
from dodgeCreep import dodgeCreepEnv

# Hide every CUDA device from this process so TensorFlow runs on the CPU.
# NOTE(review): this assignment happens after `import tensorflow`; presumably it
# still takes effect because the variable is read when the GPU runtime is first
# initialised rather than at import time -- confirm for the TF version in use.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'


# Disabled alternative: keep the first GPU visible but cap its memory through a
# virtual device configuration (memory_limit=4000).
#gpus = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_virtual_device_configuration(gpus[0],
#            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])

# Short alias for the TensorFlow Probability distributions module.
tfd = tfp.distributions


def scaled_dot_product_attention(q, k, v, mask):
  """Scaled dot-product attention.

  Computes ``softmax(q @ k^T / sqrt(depth_k) + mask * -1e9) @ v``.

  Args:
    q: queries, shape (..., seq_len_q, depth).
    k: keys, shape (..., seq_len_k, depth).
    v: values, shape (..., seq_len_k, depth_v).
    mask: optional tensor added (scaled by -1e9) to the logits, so entries
      equal to 1 are effectively removed from the softmax; ``None`` disables
      masking.

  Returns:
    A pair ``(output, attention_weights)`` with shapes
    (..., seq_len_q, depth_v) and (..., seq_len_q, seq_len_k).
  """
  # The scale factor is the square root of the key depth (last axis of k).
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

  # Push masked positions towards -inf so they receive ~0 weight.
  if mask is not None:
    logits = logits + (mask * -1e9)

  # Normalise over the key axis so each query's weights sum to 1.
  attention_weights = tf.nn.softmax(logits, axis=-1)
  weighted_values = tf.matmul(attention_weights, v)

  return weighted_values, attention_weights
  
    
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention built on `scaled_dot_product_attention`.

  The inputs are linearly projected to `d_model` features, split into
  `num_heads` heads of `d_model // num_heads` features each, attended
  independently, concatenated again and passed through a final dense layer.
  """

  def __init__(self, d_model, num_heads):
    """Create the projection layers.

    Args:
      d_model: total feature width of the projections; must be divisible by
        `num_heads`.
      num_heads: number of attention heads.
    """
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    # Feature width handled by each individual head.
    self.depth = d_model // self.num_heads

    # Query / key / value projections, followed by the output projection.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch, seq_len, d_model) into (batch, num_heads, seq_len, depth)."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Attend from `q` over `k`/`v`.

    Returns:
      ``(output, attention_weights)`` where output has shape
      (batch_size, seq_len_q, d_model) and attention_weights has shape
      (batch_size, num_heads, seq_len_q, seq_len_k).
    """
    batch_size = tf.shape(q)[0]

    # Project to d_model features: (batch_size, seq_len, d_model).
    query = self.wq(q)
    key = self.wk(k)
    value = self.wv(v)

    # Move the head axis forward: (batch_size, num_heads, seq_len, depth).
    query = self.split_heads(query, batch_size)
    key = self.split_heads(key, batch_size)
    value = self.split_heads(value, batch_size)

    per_head_output, attention_weights = scaled_dot_product_attention(
        query, key, value, mask)

    # Back to (batch_size, seq_len_q, num_heads, depth), then merge the heads
    # into a single (batch_size, seq_len_q, d_model) tensor.
    per_head_output = tf.transpose(per_head_output, perm=[0, 2, 1, 3])
    merged = tf.reshape(per_head_output, (batch_size, -1, self.d_model))

    output = self.dense(merged)

    return output, attention_weights


class OurModel(tf.keras.Model):
    """Fully connected actor-critic network with a shared trunk.

    The input is flattened, passed through two 1024-unit ReLU layers, and fed
    to two linear heads: one producing `action_space` action logits and one
    producing a single state-value estimate.

    A convolution + self-attention front end was sketched here earlier as
    disabled code; it referenced undefined tensors and has been removed.
    """

    def __init__(self, input_shape, action_space):
        """Create the layers.

        Args:
            input_shape: accepted for interface compatibility; it is not used,
                the layers infer their input size on the first call.
            action_space: number of discrete actions (width of the logit head).
        """
        super(OurModel, self).__init__()

        self.flatten = Flatten()

        self.dense_a = Dense(1024, activation='relu')
        self.dense_b = Dense(1024, activation='relu')
        # Not used by call(); kept so the set of tracked sub-layers is unchanged.
        self.dense_c = Dense(1024, activation='relu')

        self.dense_1 = Dense(action_space)  # policy head (logits)
        self.dense_2 = Dense(1)             # value head

    def call(self, X_input, training=None):
        """Run the forward pass.

        Args:
            X_input: batch of observations; everything after the batch axis is
                flattened.
            training: standard Keras training flag. None of the active layers
                depend on it; it now defaults to ``None`` so the model can
                also be called without it.

        Returns:
            ``(action_logit, value)`` -- unnormalised action scores and the
            state-value estimate, both computed from the shared hidden
            features.
        """
        hidden = self.flatten(X_input)
        hidden = self.dense_a(hidden)
        hidden = self.dense_b(hidden)

        action_logit = self.dense_1(hidden)
        value = self.dense_2(hidden)

        return action_logit, value


def safe_log(x):
  """Element-wise logarithm that yields 0 wherever `x` is exactly zero.

  Non-zero inputs are clamped from below at 1e-12 before the log is taken.
  """
  is_zero = tf.math.equal(x, 0)
  clamped_log = tf.math.log(tf.math.maximum(1e-12, x))
  return tf.where(is_zero, tf.zeros_like(x), clamped_log)


def take_vector_elements(vectors, indices):
    """
    Pick one component from every row of a batch of vectors.
    Args:
      vectors: a [batch x dims] Tensor.
      indices: an int32 Tensor with `batch` entries; entry i selects the
        column taken from row i.
    Returns:
      A Tensor with `batch` entries, where entry i is vectors[i, indices[i]].
    """
    row_numbers = tf.range(tf.shape(vectors)[0])
    # Pair every row number with its requested column: shape [batch, 2].
    coordinates = tf.stack([row_numbers, indices], axis=1)
    return tf.gather_nd(vectors, coordinates)


# Module-level Keras loss objects.
# `huber_loss` and `sparse_ce` use SUM reduction and are not called by any
# active code visible in this file; `mse_loss` (default reduction) is the
# critic loss used by A3CAgent.replay.
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
mse_loss = tf.keras.losses.MeanSquaredError()


class A3CAgent:
    # Actor-Critic Main Optimization Algorithm
    def __init__(self, env_name):
        """Set up the environment, hyper-parameters, bookkeeping and network.

        Args:
            env_name: label used to build the save/plot file names.
        """
        self.env_name = env_name

        # Environment owned by the agent itself; train() closes it and builds
        # one environment per worker instead.
        godot_binary = "dodge_the_creeps/DodgeCreep.x86_64"
        game_pack = "dodge_the_creeps/DodgeCreep.pck"
        self.env = dodgeCreepEnv(exec_path=godot_binary, env_path=game_pack, turbo_mode=True)

        self.action_size = 5

        # Training length, shared episode counter, and the running-average
        # score that has to be reached before a model counts as a new best.
        self.EPISODES = 2000000
        self.episode = 0
        self.max_average = 50.0

        # Guards the shared network/optimizer and the counters above.
        self.lock = Lock()
        self.lr = 0.0001

        # Observation layout: REM_STEP stacked frames of ROWS x COLS pixels.
        self.ROWS = 80
        self.COLS = 80
        self.REM_STEP = 4

        # Score history used for plotting.
        self.scores = []
        self.episodes = []
        self.average = []

        self.Save_Path = 'Models'
        self.state_size = (self.REM_STEP, self.ROWS, self.COLS)

        if not os.path.exists(self.Save_Path):
            os.makedirs(self.Save_Path)
        self.path = '{}_A3C_{}'.format(self.env_name, self.lr)
        self.model_name = os.path.join(self.Save_Path, self.path)

        # Shared actor-critic network and its optimizer.
        self.ActorCritic = OurModel(input_shape=self.state_size, action_space=self.action_size)

        self.optimizer = tf.keras.optimizers.Adam(self.lr)

    def act(self, state):
        """Sample one action for `state` from the current policy.

        The network's action logits are treated as an unnormalised
        categorical distribution and a single sample is drawn from it.

        Returns:
            The sampled action index for the first (only) batch entry.
        """
        network_output = self.ActorCritic(state, training=False)
        action_logits = network_output[0]
        sampled = tf.random.categorical(action_logits, 1).numpy()

        return sampled[0][0]

    def discount_rewards(self, reward, dones):
        """Compute normalised, gamma-discounted returns for a rollout.

        Walking backwards through the rollout, each entry becomes
        ``reward[i] + gamma * return[i + 1]``. The running return is restarted
        at every step whose `dones` flag is set, so returns never leak across
        an episode boundary. (The previous version restarted on every non-zero
        reward, a rule that only fits Pong, and ignored `dones`.)

        Args:
            reward: sequence of per-step rewards.
            dones: sequence of per-step terminal flags, same length as
                `reward`; ``None`` means the rollout has no boundaries.

        Returns:
            A 1-D float array of returns, standardised to zero mean and unit
            standard deviation unless all returns are identical. Float inputs
            keep their dtype; any other dtype is converted to float32.
        """
        gamma = 0.99    # discount rate

        rewards = np.asarray(reward)
        # Accumulate in floating point: an integer buffer would truncate the
        # discounted values and make the in-place normalisation below fail.
        if not np.issubdtype(rewards.dtype, np.floating):
            rewards = rewards.astype(np.float32)

        discounted_r = np.zeros_like(rewards)
        if discounted_r.size == 0:
            # Nothing to discount or normalise.
            return discounted_r

        running_add = 0.0
        for i in reversed(range(len(rewards))):
            if dones is not None and dones[i]:
                # Step i ends an episode: nothing after it may contribute.
                running_add = 0.0

            running_add = running_add * gamma + rewards[i]
            discounted_r[i] = running_add

        if np.std(discounted_r) != 0:
            discounted_r -= np.mean(discounted_r)  # centre on zero
            discounted_r /= np.std(discounted_r)   # scale to unit variance

        return discounted_r
        
    def replay(self, states, actions, rewards, dones):
        """Run one actor-critic gradient step on a collected rollout.

        Loss = policy-gradient loss + value (MSE) loss. No entropy bonus is
        included.

        Fixes relative to the previous version:
          * the log-probability of each taken action is now taken from a
            softmax over the *action* axis; before, a softmax was applied
            across the batch of already-selected logits;
          * the critic loss compares returns and values element by element;
            before, a (N, 1) tensor was compared with a (N,) array, which
            broadcasts to an (N, N) difference matrix.

        Args:
            states: list of observations, each with a leading batch axis of 1
                (as returned by GetImage).
            actions: list of the integer actions taken.
            rewards: list of per-step rewards.
            dones: list of per-step terminal flags.
        """
        if len(states) == 0:
            # Nothing was collected; np.vstack would raise on an empty list.
            return

        # Stack the per-step observations into one batch.
        states = np.vstack(states)
        actions = np.asarray(actions, dtype=np.int32)

        # Normalised discounted returns serve as the critic target.
        returns = tf.convert_to_tensor(
            np.asarray(self.discount_rewards(rewards, dones), dtype=np.float32))

        with tf.GradientTape() as tape:
            prediction = self.ActorCritic(states, training=True)
            action_logits = prediction[0]
            # (N, 1) -> (N,) so it lines up with `returns`.
            values = tf.squeeze(tf.cast(prediction[1], tf.float32), axis=-1)

            # The advantage only weights the policy gradient; the critic is
            # trained through critic_loss, so no gradient may flow from here.
            advantages = tf.stop_gradient(returns - values)

            # log pi(a_t | s_t) for the actions that were actually taken.
            log_probs = tf.nn.log_softmax(action_logits, axis=-1)
            selected_log_probs = take_vector_elements(log_probs, actions)

            actor_loss = -tf.math.reduce_mean(selected_log_probs * advantages)

            # Both arguments are shaped (N, 1) so Keras averages N squared
            # errors instead of broadcasting.
            critic_loss = mse_loss(tf.expand_dims(returns, -1),
                                   tf.expand_dims(values, -1))

            total_loss = actor_loss + critic_loss

        grads = tape.gradient(total_loss, self.ActorCritic.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.ActorCritic.trainable_variables))
        
    def load(self, model_name):
        """Replace the agent's network with the model stored at `model_name`.

        The model is loaded without compiling it.
        """
        restored_model = load_model(model_name, compile=False)
        self.ActorCritic = restored_model

    def save(self):
        """Write the actor-critic network to ``self.model_name``."""
        destination = self.model_name
        self.ActorCritic.save(destination)

    # Class-body statement: runs once, when the class is defined, and creates
    # the 18x9 figure that PlotModel later draws on.
    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        """Record an episode score and periodically refresh the score plot.

        The score and episode number are appended to the history, together
        with the mean of the last (up to) 50 scores. On every non-zero
        multiple of 100 episodes the raw scores (blue) and the running
        average (red) are plotted and saved to ``self.path + ".png"``.

        Args:
            score: total reward of the finished episode.
            episode: integer index of the finished episode.

        Returns:
            The current running average score.
        """
        self.scores.append(score)
        self.episodes.append(episode)

        recent_scores = self.scores[-50:]
        self.average.append(sum(recent_scores) / len(recent_scores))

        # Same schedule as the old `str(episode)[-2:] == "00"` check: every
        # multiple of 100 except episode 0.
        if episode != 0 and episode % 100 == 0:
            # The full history is re-plotted each time, so clear the previous
            # curves first; otherwise line artists pile up on the figure.
            pylab.clf()
            pylab.plot(self.episodes, self.scores, 'b')
            pylab.plot(self.episodes, self.average, 'r')
            pylab.ylabel('Score', fontsize=18)
            # The x axis holds episode indices, not environment steps.
            pylab.xlabel('Episode', fontsize=18)
            try:
                pylab.savefig(self.path + ".png")
            except OSError:
                # Saving the plot is best effort; training must not stop
                # because the image could not be written.
                pass

        return self.average[-1]
    
    def imshow(self, image, rem_step=0):
        """Show one frame of a stacked observation in an OpenCV window.

        Args:
            image: array indexed by frame slot on its first axis.
            rem_step: which frame slot to display.

        Pressing "q" while the window is polled closes all OpenCV windows.
        """
        window_title = self.model_name + str(rem_step)
        cv2.imshow(window_title, image[rem_step, ...])

        # Poll the GUI for 25 ms; only the low byte holds the key code.
        pressed_key = cv2.waitKey(25) & 0xFF
        if pressed_key == ord("q"):
            cv2.destroyAllWindows()

    def GetImage(self, frame, image_memory):
        """Preprocess `frame` and push it onto the front of the frame stack.

        Args:
            frame: colour image indexed as [row, column, channel].
            image_memory: current frame stack, shaped either ``state_size`` or
                ``(1, *state_size)``.

        Returns:
            A new stack shaped ``(1, *state_size)`` with the processed frame
            in slot 0 and the previous frames shifted back by one slot. The
            array passed in is not modified.
        """
        # Accept a previously returned state by dropping its batch axis.
        if image_memory.shape == (1, *self.state_size):
            image_memory = np.squeeze(image_memory)

        # Try a fixed crop/downsample first (presumably inherited from Atari
        # preprocessing -- TODO confirm). Whenever the result is not the
        # expected size, the full frame is resized instead; that is the case
        # for the 128x128 frames produced by reset()/step().
        small_frame = frame[35:195:2, ::2, :]
        if small_frame.shape[0] != self.COLS or small_frame.shape[1] != self.ROWS:
            small_frame = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)

        # Collapse the three colour channels into one weighted grayscale
        # channel.
        channel_0 = small_frame[:, :, 0]
        channel_1 = small_frame[:, :, 1]
        channel_2 = small_frame[:, :, 2]
        gray = 0.299*channel_0 + 0.587*channel_1 + 0.114*channel_2

        # Increase contrast: dark pixels go to 0, bright pixels to 255, and
        # values in [100, 150) are left as they are.
        gray[gray < 100] = 0
        gray[gray >= 150] = 255

        # Scale to the 0-1 range as float32.
        new_frame = gray.astype(np.float32) / 255.0

        # Debug view of the processed frame; "q" closes the windows.
        cv2.imshow("new_frame: ", new_frame)
        if cv2.waitKey(25) & 0xFF == ord("q"):
            cv2.destroyAllWindows()

        # Shift every stored frame one slot back (np.roll returns a copy) and
        # write the newest frame into slot 0.
        shifted_memory = np.roll(image_memory, 1, axis=0)
        shifted_memory[0, :, :] = new_frame

        return np.expand_dims(shifted_memory, axis=0)

    def reset(self, env):
        """Reset `env` and build the initial stacked observation.

        The first frame is pushed onto the frame stack REM_STEP times so that
        every slot of the initial state holds it. (Previously each push
        started again from the same all-zero buffer and its result was
        discarded, so only slot 0 was ever filled.)

        Args:
            env: environment whose ``reset()`` returns a flat observation that
                can be reshaped to (128, 128, 3).

        Returns:
            The initial state, shaped ``(1, *state_size)``.
        """
        frame = env.reset()
        frame = np.reshape(frame, (128,128,3))
        frame = frame.astype(np.uint8)

        state = np.zeros(self.state_size)
        for _ in range(self.REM_STEP):
            # Feed the stack back in so the frames accumulate; GetImage accepts
            # both the bare and the batched layout.
            state = self.GetImage(frame, state)

        return state
    
    def step(self, action, env, image_memory):
        """Advance `env` by one action and return the updated frame stack.

        Args:
            action: action forwarded to ``env.step``.
            env: environment to step.
            image_memory: current frame stack, passed on to GetImage.

        Returns:
            ``(next_state, reward, done, info)`` where `next_state` is the
            stack with the newly observed frame pushed in, and the other three
            values are returned by the environment unchanged.
        """
        raw_observation, reward, done, info = env.step(action)

        # The environment delivers a flat buffer; restore the image layout.
        pixels = np.reshape(raw_observation, (128,128,3)).astype(np.uint8)

        next_state = self.GetImage(pixels, image_memory)

        return next_state, reward, done, info
    
    def train(self, n_threads):
        """Train with `n_threads` worker threads, one environment each.

        The agent's own environment is closed first. Every worker runs
        train_threading against the shared network; the call returns once all
        workers have finished.
        """
        self.env.close()

        godot_binary = "dodge_the_creeps/DodgeCreep.x86_64"
        game_pack = "dodge_the_creeps/DodgeCreep.pck"

        # One environment per worker.
        envs = []
        for _ in range(n_threads):
            envs.append(dodgeCreepEnv(exec_path=godot_binary, env_path=game_pack, turbo_mode=True))

        # Daemon threads, so they do not keep the process alive on their own.
        workers = []
        for worker_id, worker_env in enumerate(envs):
            worker = threading.Thread(target=self.train_threading, daemon=True,
                                      args=(self, worker_env, worker_id))
            workers.append(worker)

        # Stagger the start-up by one second per worker.
        for worker in workers:
            time.sleep(1)
            worker.start()

        for worker in workers:
            time.sleep(1)
            worker.join()
            
    def train_threading(self, agent, env, thread):
        """Worker loop: play episodes in `env` and train the shared network.

        Each iteration plays one full episode with the current policy, runs
        one replay() update on it, then records the score and advances the
        shared episode counter. The loop ends when the counter reaches
        EPISODES.

        Args:
            agent: agent used to pick actions (train() passes this same
                object).
            env: the environment owned by this worker; it is closed when the
                worker exits, including on an error.
            thread: worker index, used only in log output.
        """
        import traceback

        try:
            while self.episode < self.EPISODES:
                # Reset episode
                score, done = 0, False
                state = self.reset(env)

                states, actions, rewards, dones = [], [], [], []
                while not done:
                    action = agent.act(state)
                    next_state, reward, done, _ = self.step(action, env, state)

                    # The environment returns one-element vectors.
                    reward = reward[0]
                    done = done[0]

                    states.append(state)
                    actions.append(action)
                    rewards.append(reward)
                    dones.append(done)

                    score += reward
                    state = next_state

                # Best effort: a failed update is reported and skipped. The
                # lock is taken with `with` so it is released even when
                # replay() raises; with the old manual acquire/release a
                # failure left it held and this thread then deadlocked on the
                # `with self.lock` below.
                try:
                    with self.lock:
                        self.replay(states, actions, rewards, dones)
                except Exception:
                    print("except")
                    traceback.print_exc()
                    print("number of states: ", len(states))
                    print("actions: ", actions)
                    print("rewards: ", rewards)
                    print("dones: ", dones)

                # Update the shared statistics and the episode count.
                with self.lock:
                    average = self.PlotModel(score, self.episode)
                    # Track the best running average. NOTE: saving is
                    # currently disabled, so "SAVING" only marks a new best.
                    if average >= self.max_average:
                        self.max_average = average
                        #self.save()
                        SAVING = "SAVING"
                    else:
                        SAVING = ""

                    print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING))
                    if self.episode < self.EPISODES:
                        self.episode += 1
        finally:
            env.close()

    def test(self, Actor_name, Critic_name=None):
        """Play 100 evaluation episodes greedily with a saved model.

        Args:
            Actor_name: path of the saved actor-critic model to load.
            Critic_name: unused; kept (now optional) so existing two-argument
                calls keep working. The agent has a single combined network.

        Fixes relative to the previous version: load() is called with the one
        argument it accepts, actions come from ``self.ActorCritic`` (there is
        no ``self.Actor``), the one-element reward/done vectors are unwrapped
        as in training, and the progress line reports the real number of
        evaluation episodes.
        """
        self.load(Actor_name)
        n_episodes = 100
        try:
            for e in range(n_episodes):
                state = self.reset(self.env)
                done = False
                score = 0
                while not done:
                    # NOTE(review): assumes the environment exposes render();
                    # the training path never calls it -- confirm.
                    self.env.render()

                    # Greedy policy: take the action with the largest logit.
                    prediction = self.ActorCritic(state, training=False)
                    action = np.argmax(prediction[0])

                    state, reward, done, _ = self.step(action, self.env, state)
                    # The environment returns one-element vectors.
                    reward = reward[0]
                    done = done[0]

                    score += reward
                    if done:
                        print("episode: {}/{}, score: {}".format(e, n_episodes, score))
                        break
        finally:
            self.env.close()


if __name__ == "__main__":
    env_name = 'DodgeCreep'
    agent = A3CAgent(env_name)
    
    #agent.run() # use as A2C
    agent.train(n_threads=1) # use as A3C
    #agent.test('Models/Pong-v0_A3C_2.5e-05_Actor.h5', '')

2022-11-04 03:16:03.390988: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


Created float32 vector agent_action size = 1
Created int32 vector env_action size = 1
Created uint32 vector observation size = 49152
Created float32 vector reward size = 1
Created int32 vector done size = 1
Terminated
Created float32 vector agent_action size = 1
Created int32 vector env_action size = 1
Created uint32 vector observation size = 49152
Created float32 vector reward size = 1
Created int32 vector done size = 1


2022-11-04 03:16:05.403332: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-11-04 03:16:05.407097: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-11-04 03:16:05.407139: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: kimbring2-ROG-Strix-GA35DX-G35DX
2022-11-04 03:16:05.407146: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: kimbring2-ROG-Strix-GA35DX-G35DX
2022-11-04 03:16:05.407286: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 515.76.0
2022-11-04 03:16:05.407310: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.76.0
2022-11-04 03:16:05.407315: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 515.76.0
2022-11-04 03:16:0

episode: 0/2000000, thread: 0, score: 2.0, average: 2.00 
episode: 1/2000000, thread: 0, score: 0.0, average: 1.00 
episode: 2/2000000, thread: 0, score: 1.0, average: 1.00 
episode: 3/2000000, thread: 0, score: 1.0, average: 1.00 
episode: 4/2000000, thread: 0, score: 2.0, average: 1.20 
episode: 5/2000000, thread: 0, score: 0.0, average: 1.00 
episode: 6/2000000, thread: 0, score: 2.0, average: 1.14 
