# RL Agent Test
---

## Imports

In [1]:
from math import pi
import numpy as np
from World import *
from Agent import *

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import *
from tensorflow.keras import Model
print(f'TensorFlow version: {tf.__version__}')

2024-04-01 18:52:49.192763: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-01 18:52:49.213336: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.15.0


## Instantiate Neural Networks for Policy and Q

For now, the network architectures below are fairly uninformed choices,
made just to get this running as soon as possible. That said, we may want to
keep the networks small even when we do this for real.

In [2]:
class Policy(Model):
    """Gaussian policy network that maps a 12-feature state to a scalar action.

    Calling the model returns a pair ``(normal, sample)``:

    * ``normal`` -- a ``tfp.distributions.Normal`` whose mean and standard
      deviation are produced by two separate one-unit heads.
    * ``sample`` -- one draw from ``normal`` per batch row, clipped to
      ``[Policy.min_action, Policy.max_action]``.
    """

    # Bounds applied to every sampled action.
    min_action = -pi
    max_action = pi
    # Floor added to the standard deviation so that it is strictly positive
    # even if softplus underflows to 0 in float32.
    min_sigma = 1e-6

    def __init__(self):
        super().__init__()
        self.dense1 = Dense(16, activation='relu', input_shape=(12,))
        self.dense2 = Dense(8, activation='relu')
        self.dropout = Dropout(0.1)
        # The mean head is linear so that the mean can be negative as well as
        # positive; the action range is symmetric around zero.
        self.mu = Dense(1)
        # The scale head uses softplus because Normal requires a positive
        # scale; a linear head could emit zero or negative values.
        self.sigma = Dense(1, activation='softplus')

    def call(self, x, training=None):
        """Run the network on a batch of states.

        Args:
            x: batch of states; the first layer is declared with 12 input
                features.
            training: optional flag forwarded to the dropout layer. ``None``
                leaves the decision to Keras.

        Returns:
            Tuple ``(normal, sample)`` as described in the class docstring.
        """
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dropout(x, training=training)
        mu = self.mu(x)
        sigma = self.sigma(x) + Policy.min_sigma
        normal = tfp.distributions.Normal(mu, sigma)
        # sample() with the default empty sample shape already has the batch
        # shape of the distribution, so no squeeze is needed.
        sample = normal.sample()
        sample = tf.clip_by_value(sample, Policy.min_action, Policy.max_action)
        return normal, sample

# Build the policy and push one all-ones dummy state (batch of 1, 12 features)
# through it so the weights are created before the summary is printed.
policy = Policy()
print(policy(np.ones((1, 12))))
policy.summary()

# Q network, assembled layer by layer. It takes 13 inputs -- presumably the
# 12 state features plus the scalar action; confirm against Agent.
Q = tf.keras.models.Sequential(name='Q(s,a)')
Q.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(13,)))
Q.add(tf.keras.layers.Dense(8, activation='relu'))
Q.add(tf.keras.layers.Dropout(0.1))
Q.add(tf.keras.layers.Dense(1))
Q.summary()

(<tfp.distributions.Normal 'Normal' batch_shape=[1, 1] event_shape=[] dtype=float32>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[2.254279]], dtype=float32)>)
Model: "policy"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               multiple                  208       
                                                                 
 dense_1 (Dense)             multiple                  136       
                                                                 
 dropout (Dropout)           multiple                  0         
                                                                 
 dense_2 (Dense)             multiple                  9         
                                                                 
 dense_3 (Dense)             multiple                  9         
                                                                 
Total params: 362 (1.4

## Instantiate World

In [3]:
# Environment instance handed to the agent below. SimpleTestWorld is not
# imported by name in this notebook, so it presumably comes from the
# `from World import *` in the imports cell -- verify.
world = SimpleTestWorld()

## Instantiate Agent

In [4]:
# Bundle the environment, the policy network and the Q network into one agent.
agent = Agent(world, policy, Q)

## Train Agent

In [5]:
%reload_ext autoreload
%autoreload 2

from Agent import *


expand_policy = lambda t: (t[0], t[1], t[1].numpy()[0][0])
apply_policy = lambda x: expand_policy(policy(np.expand_dims(x, axis=0)))

s = (1,2,3,4,5,6,7,8,9,10,11,12)
preds = apply_policy(s)
print(preds)

agent.training_step(20)

preds = apply_policy(s)
print(preds)

(<tfp.distributions.Normal 'Normal' batch_shape=[1, 1] event_shape=[] dtype=float32>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[3.1415927]], dtype=float32)>, 3.1415927)
(<tfp.distributions.Normal 'Normal' batch_shape=[1, 1] event_shape=[] dtype=float32>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[3.1415927]], dtype=float32)>, 3.1415927)


In [6]:
# NOTE(review): the three positional arguments are undocumented magic numbers;
# confirm their meaning against Agent.train and consider passing them by keyword.
# The recorded run printed a mean reward per epoch and was stopped by hand
# (KeyboardInterrupt) after epoch 6.
agent.train(20, 1000, 20)

Output()

Output()

epoch 0: Mean Reward = 0.47833556257834475


Output()

epoch 1: Mean Reward = 0.4621175091908298


Output()

epoch 2: Mean Reward = 0.4430592390285617


Output()

epoch 3: Mean Reward = 0.4773394758636126


Output()

epoch 4: Mean Reward = 0.442131214655378


Output()

epoch 5: Mean Reward = 0.4389284500754632


Output()

epoch 6: Mean Reward = 0.42782327275568466


KeyboardInterrupt: 