##### Para abrir en Colab:

https://colab.research.google.com/drive/1sSu2kvUld4njCbIfFwV9SmqDSHQnOSk3?usp=sharing

In [1]:
# Realizamos los imports necesarios
import numpy as np
%tensorflow_version 2.x
import tensorflow as tf
import matplotlib, cv2
import matplotlib.pyplot as plt
import base64, io, os, time, gym
import IPython, functools
import time
from tqdm import tqdm
import tensorflow_probability as tfp

## Definimos la red como ambiente

In [None]:
class pn_environ:
    """Petri net exposed through a gym-like ``reset`` / ``step`` interface.

    The state is the current marking (one token count per place) and an
    action is the integer index of the transition to fire.  Columns of the
    incidence matrices are indexed by action; their rows line up with the
    entries of the state vector.
    """

    def __init__(self,init_state,action_space,W_pre,W_post):
        """Store the net definition and compute the initially enabled transitions.

        Args:
            init_state: Initial marking; also the marking restored by ``reset``.
            action_space: Sequence of transition labels, one per matrix column.
            W_pre: Pre-incidence matrix (tokens each transition consumes).
            W_post: Post-incidence matrix (tokens each transition produces).
        """
        # States
        self.state = init_state
        self.init_state = init_state
        # Transitions: label -> column index
        self.action_space = action_space
        self.map_T()
        # Incidence matrices; W is the net token change caused by each transition
        self.W_pre = W_pre
        self.W_post = W_post
        self.W = self.W_post - self.W_pre
        # Per-transition flag: does the current marking enable it?
        self.T_shooteable_by_marked = [False] * len(action_space)
        self.update_T_by_marks()
        # Invariant counters (both start at 1)
        self.n_I1 = 1
        self.n_I2 = 1

    def update_T_by_marks(self):
        """Recompute, for every transition, whether the current marking enables it.

        A transition is enabled when subtracting its ``W_pre`` column from the
        marking leaves no negative entry.
        """
        for action_idx in range(len(self.action_space)):
            marks_after_W_pre = self.state - self.W_pre[:, action_idx]
            self.T_shooteable_by_marked[action_idx] = not any(
                n_tok < 0 for n_tok in marks_after_W_pre
            )

    def map_T(self):
        """Rebuild ``T_mapped``, the transition-label -> action-index dictionary."""
        # enumerate() yields (index, label); iterating the labels directly would
        # try to unpack each label string character by character.
        self.T_mapped = {T: n for n, T in enumerate(self.action_space)}

    def can_shoot(self,action):
        """Return True when transition ``action`` is enabled by the current marking."""
        return self.T_shooteable_by_marked[action]

    def shoot_pn(self,action):
        """Fire transition ``action`` unconditionally and return the new marking.

        The marking is rebound to a new object (``state + W[:, action]``), so the
        object stored as ``init_state`` is never modified in place.
        """
        self.state = self.state + self.W[:, action]
        self.update_T_by_marks()
        return self.state

    def step(self,action):
        """Fire ``action`` and return ``(state, reward, done, info)``.

        * Disabled transition: reward ``-100.0`` and ``done=True``.
        * Enabled transition: reward ``2`` plus the resource and invariant
          bonuses, ``done=False``.

        ``state`` is the resulting marking wrapped in ``tf.constant``; ``info``
        is always ``None``.
        """
        # Enablement must be read before firing: shoot_pn refreshes the flags.
        enabled = self.can_shoot(action)

        # NOTE(review): the transition is fired even when it is disabled, so the
        # marking returned with done=True can contain negative token counts.
        # Kept as-is to preserve existing behaviour; confirm this is intended.
        self.shoot_pn(action)
        state = tf.constant(self.state)

        if not enabled:
            return state, -100.0, True, None

        reward = 2 + self.reward_by_resourse_used(action) + self.reward_by_invariant_completed(action)
        return state, reward, False, None

    def reset(self):
        """Restore the initial marking and counters; return the marking as a tensor."""
        self.state = self.init_state
        self.update_T_by_marks()
        self.n_I1 = 1
        self.n_I2 = 1
        return tf.constant(self.state)

    def set(self, state, n_I1=1,n_I2=1):
        """Overwrite the marking and the invariant counters.

        The counters fall back to 1 when not given.
        """
        self.state = state
        self.update_T_by_marks()
        self.n_I1 = n_I1
        self.n_I2 = n_I2

    def reward_by_resourse_used(self, action):
        """Return 5 when ``action`` is 0 or 3 (indices hard-coded), otherwise 0."""
        if action == 0 or action == 3:
            return 5
        return 0

    def reward_by_invariant_completed(self, action):
        """Reward firing action 2 or 5 and update the invariant counters.

        Action 2 increments ``n_I1`` and action 5 increments ``n_I2``; either
        one earns 20, plus 100 more when both counters are equal after the
        increment.  Any other action earns 0 and leaves the counters untouched.
        """
        if not (action == 2 or action == 5):
            return 0

        if action == 2:
            self.n_I1 += 1
        else:
            self.n_I2 += 1

        reward = 20
        if self.n_I1 == self.n_I2:
            reward += 100
        return reward

##### Red simple con mutex

In [None]:
# Net under study: simple Petri net with a mutex.
# Matrix columns follow action_space (T0..T5); rows follow the entries of init_state.
action_space = [f"T{idx}" for idx in range(6)]
init_state = [1, 0, 0, 1, 0, 0, 1]

# Tokens each transition needs/consumes from every row.
W_pre = np.array([
    [1, 0, 0, 0, 0, 0],  # row 0: consumed by T0
    [0, 1, 0, 0, 0, 0],  # row 1: consumed by T1
    [0, 0, 1, 0, 0, 0],  # row 2: consumed by T2
    [0, 0, 0, 1, 0, 0],  # row 3: consumed by T3
    [0, 0, 0, 0, 1, 0],  # row 4: consumed by T4
    [0, 0, 0, 0, 0, 1],  # row 5: consumed by T5
    [1, 0, 0, 1, 0, 0],  # row 6: consumed by both T0 and T3
])

# Tokens each transition deposits into every row.
W_post = np.array([
    [0, 0, 1, 0, 0, 0],  # row 0: produced by T2
    [1, 0, 0, 0, 0, 0],  # row 1: produced by T0
    [0, 1, 0, 0, 0, 0],  # row 2: produced by T1
    [0, 0, 0, 0, 0, 1],  # row 3: produced by T5
    [0, 0, 0, 1, 0, 0],  # row 4: produced by T3
    [0, 0, 0, 0, 1, 0],  # row 5: produced by T4
    [0, 1, 0, 0, 1, 0],  # row 6: produced by both T1 and T4
])

### Definimos la memoria del agente

In [None]:
class Memory:
    """Three parallel lists holding the agent's observations, actions and rewards."""

    def __init__(self):
        # Start with empty buffers
        self.clear()

    def clear(self):
        """Drop everything stored so far by rebinding each buffer to a new list."""
        self.observations, self.actions, self.rewards = [], [], []

    def add_to_memory(self,new_observation,new_action,new_reward):
        """Append one (observation, action, reward) record to the buffers."""
        records = (
            (self.observations, new_observation),
            (self.actions, new_action),
            (self.rewards, new_reward),
        )
        for buffer, item in records:
            buffer.append(item)

## Empezamos el entrenamiento

In [None]:
# Learning parameters
learning_rate = 0.001
n_episodes_of_training = 20000
n_max_steps = 120

# Adam optimizer configured with the learning rate above
optimizer = tf.keras.optimizers.Adam(learning_rate)

# Environment for the net definition, and the size of its action set
environ = pn_environ(init_state, action_space, W_pre, W_post)
n_actions = len(environ.action_space)

# Agent memory plus bookkeeping containers/counters
memory = Memory()
total_rewards = []
n_tests = 0

##### Definimos el cubo (tensor)

In [None]:
# Zero-filled n_actions x n_actions x n_actions table, converted to a NumPy array
# so individual cells can be assigned later.
cubo = tf.zeros((n_actions, n_actions, n_actions)).numpy()


##### Entrenamiento

In [None]:
# Sweep the cube: cubo[i, j, k] receives the summed reward of taking
# action i, then j, then k, starting from the reset environment.

for i_step in range(n_actions):
    # First action, always taken from the freshly reset environment
    observation = environ.reset()
    action = i_step
    i_new_observation, i_reward, i_done, info = environ.step(action)

    for j_step in range(n_actions):
        # Second action: contributes 0 when the first one ended the episode
        j_reward = 0
        if not i_done:
            # Put the environment back in the state reached after i_step
            # (set() also restores the invariant counters to their defaults)
            environ.set(i_new_observation)

            action = j_step
            j_new_observation, j_reward, j_done, info = environ.step(action)

        for k_step in range(n_actions):
            # Third action: contributes 0 when either earlier action ended the episode.
            # `or` short-circuits, so j_done is only read when the second action ran.
            k_reward = 0
            if not (i_done or j_done):
                # Put the environment back in the state reached after j_step
                environ.set(j_new_observation)

                action = k_step
                k_new_observation, k_reward, done, info = environ.step(action)

            cubo[i_step, j_step, k_step] = i_reward + j_reward + k_reward

### Imprimimos los resultados

In [None]:
# Highest total reward in the cube and every index triple that reaches it.
# Stored as max_reward (not `max`) so the builtin max() stays usable in later cells.
max_reward = np.max(cubo)
paths = np.where(cubo == max_reward)

print("Mayor resultado: ", max_reward)
print("Siguiendo los caminos: ")

# np.where returns one index array per axis; zip(*paths) regroups them into
# one (i, j, k) tuple per matching cell.
string = "\t\t\t"
for path in zip(*paths):
    string += " - ".join(str(step) for step in path) + "\n\t\t\t"

print(string)

Mayor resultado:  31.0
Siguiendo los caminos: 
			0 - 1 - 2
			3 - 4 - 5
			


In [None]:
# Dump the whole reward cube; cubo[i][j][k] holds the summed reward of actions i, j, k in that order.
print(cubo)

[[[ -93.  -93.  -93.  -93.  -93.  -93.]
  [ -91.  -91.   31.   16.  -91.  -91.]
  [ -93.  -93.  -93.  -93.  -93.  -93.]
  [ -93.  -93.  -93.  -93.  -93.  -93.]
  [ -93.  -93.  -93.  -93.  -93.  -93.]
  [ -93.  -93.  -93.  -93.  -93.  -93.]]

 [[-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]]

 [[-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]
  [-100. -100. -100. -100. -100. -100.]]

 [[ -93.  -93.  -93.  -93.  -93.  -93.]
  [ -93.  -93.  -93.  -93.  -93.  -93.]
  [ -93.  -93.  -93.  -93.  -93.  -93.]
  [ -93.  -93.  -93.  -93.  -93.  -93.]
  [  16.  -91.  -91.  -91.  -91.   31.]
  [ -93.  -93.  -93.  -93.  -93.  -93.]]

 [[-100. -100. -100. -100. -100.