# RL Agent Test
---

## Imports

In [1]:
from math import pi
import numpy as np
from World import *
from Agent import *

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import *
from tensorflow.keras import Model
print(f'TensorFlow version: {tf.__version__}')

2024-04-01 22:15:36.114405: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-01 22:15:36.134942: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.15.0


## Instantiate Neural Networks for Policy and Q

So for now I am using pretty uninformed choices for neural network
architecture just to get this running asap, but we might want to 
keep the networks small even when we do this for real.

In [2]:
class Policy(Model):
    min_action = -pi
    max_action = pi
    
    def __init__(self):
        super().__init__()
        self.dense1 = Dense(16, activation='relu', input_shape=(12,))
        self.dense2 = Dense(8, activation='relu')
        self.dropout = Dropout(0.1)
        self.mu = Dense(1, activation='softplus')
        self.sigma = Dense(1)    

    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dropout(x)
        mu = self.mu(x)
        sigma = self.sigma(x)
        normal = tfp.distributions.Normal(mu, sigma)
        sample = tf.squeeze(normal.sample(1), axis=0)
        sample_clipped = tf.clip_by_value(sample, Policy.min_action, Policy.max_action)
        return normal, sample, sample_clipped

# Create an instance of the model
policy = Policy()
print(policy(np.expand_dims(np.ones(12), axis=0)))
policy.summary()

Q = tf.keras.models.Sequential([
  tf.keras.layers.Dense(16, activation='relu', input_shape=(13,)),
  tf.keras.layers.Dense(8, activation='relu'),
  tf.keras.layers.Dropout(0.1),
  tf.keras.layers.Dense(1)
], name='Q(s,a)')
Q.summary()

(<tfp.distributions.Normal 'Normal' batch_shape=[1, 1] event_shape=[] dtype=float32>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.9202921]], dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.9202921]], dtype=float32)>)
Model: "policy"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               multiple                  208       
                                                                 
 dense_1 (Dense)             multiple                  136       
                                                                 
 dropout (Dropout)           multiple                  0         
                                                                 
 dense_2 (Dense)             multiple                  9         
                                                                 
 dense_3 (Dense)             multiple                  9         
  

## Instantiate World

In [3]:
world = SimpleTestWorld()

## Instantiate Agent

In [4]:
agent = Agent(world, policy, Q)

## Train Agent

In [5]:
%reload_ext autoreload
%autoreload 2

from Agent import *


expand_policy = lambda t: (t[0], t[1], t[1].numpy()[0][0])
apply_policy = lambda x: expand_policy(policy(np.expand_dims(x, axis=0)))

s = (1,2,3,4,5,6,7,8,9,10,11,12)
preds = apply_policy(s)
print(preds)

agent.training_step(20)

preds = apply_policy(s)
print(preds)

(<tfp.distributions.Normal 'Normal' batch_shape=[1, 1] event_shape=[] dtype=float32>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[4.341426]], dtype=float32)>, 4.341426)
(<tfp.distributions.Normal 'Normal' batch_shape=[1, 1] event_shape=[] dtype=float32>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-1.5272835]], dtype=float32)>, -1.5272835)


In [6]:
agent.train(20, 100, 100)

Output()

Output()

epoch 0: Mean Reward = 0.1872727039611038


Output()

epoch 1: Mean Reward = 0.3192500181483407


Output()

epoch 2: Mean Reward = 1.0757513173676163


Output()

epoch 3: Mean Reward = 0.22758664664697523


Output()

epoch 4: Mean Reward = 0.5226149260547138


Output()

epoch 5: Mean Reward = 0.288306919993178


Output()

epoch 6: Mean Reward = 0.3919843766448063


Output()

epoch 7: Mean Reward = 0.24233286646486643


Output()

epoch 8: Mean Reward = 1.1202954668940133


Output()

epoch 9: Mean Reward = 1.6616117353253639


Output()

epoch 10: Mean Reward = 0.23529135006201968


Output()

epoch 11: Mean Reward = 0.20452294280430103


Output()

Output()

epoch 13: Mean Reward = 0.4884358476043552


Output()

epoch 14: Mean Reward = 0.7368961623879791


Output()

epoch 15: Mean Reward = 4.683053083238141


Output()

epoch 16: Mean Reward = 0.2573649844229137


Output()

epoch 17: Mean Reward = 0.3499049509097668


Output()

epoch 18: Mean Reward = 0.36773962790148756


Output()

epoch 19: Mean Reward = 0.8244389297490795


Output()

epoch 20: Mean Reward = 0.45385201285954563


Output()

epoch 21: Mean Reward = 0.23130501522213098


Output()

epoch 22: Mean Reward = 0.33726460299960836


Output()

epoch 23: Mean Reward = 1.2915757592162223


Output()

epoch 24: Mean Reward = 1.553187066599335


Output()

epoch 25: Mean Reward = 12.119276779392806


Output()

epoch 26: Mean Reward = 0.20225066605915815


Output()

epoch 27: Mean Reward = 0.3512906872839078


Output()

epoch 28: Mean Reward = 0.7252776853139399


Output()

epoch 29: Mean Reward = 200.0


Output()

epoch 30: Mean Reward = 200.0


Output()

epoch 31: Mean Reward = 200.0


Output()

epoch 32: Mean Reward = 200.0


Output()

epoch 33: Mean Reward = 200.0


Output()

epoch 34: Mean Reward = 200.0


Output()

epoch 35: Mean Reward = 200.0


Output()

epoch 36: Mean Reward = 200.0


Output()

epoch 37: Mean Reward = 200.0


Output()

epoch 38: Mean Reward = 200.0


Output()

epoch 39: Mean Reward = 200.0


Output()

epoch 40: Mean Reward = 200.0


Output()

epoch 41: Mean Reward = 200.0


Output()

epoch 42: Mean Reward = 200.0


Output()

epoch 43: Mean Reward = 200.0


Output()

epoch 44: Mean Reward = 200.0


Output()

epoch 45: Mean Reward = 200.0


Output()

epoch 46: Mean Reward = 200.0


Output()

epoch 47: Mean Reward = 200.0


Output()

epoch 48: Mean Reward = 200.0


Output()

epoch 49: Mean Reward = 200.0


Output()

epoch 50: Mean Reward = 200.0


Output()

epoch 51: Mean Reward = 200.0


Output()

epoch 52: Mean Reward = 200.0


Output()

epoch 53: Mean Reward = 200.0


Output()

epoch 54: Mean Reward = 200.0


Output()

epoch 55: Mean Reward = 200.0


Output()

epoch 56: Mean Reward = 200.0


Output()

epoch 57: Mean Reward = 200.0


Output()

epoch 58: Mean Reward = 200.0


Output()

epoch 59: Mean Reward = 200.0


Output()

epoch 60: Mean Reward = 200.0


Output()

epoch 61: Mean Reward = 200.0


Output()

epoch 62: Mean Reward = 200.0


Output()

epoch 63: Mean Reward = 200.0


Output()

epoch 64: Mean Reward = 200.0


Output()

epoch 65: Mean Reward = 200.0


Output()

epoch 66: Mean Reward = 200.0


Output()

epoch 67: Mean Reward = 200.0


Output()

epoch 68: Mean Reward = 200.0


Output()

epoch 69: Mean Reward = 200.0


Output()

epoch 70: Mean Reward = 200.0


Output()

epoch 71: Mean Reward = 200.0


Output()

Output()

epoch 73: Mean Reward = 200.0


Output()

epoch 74: Mean Reward = 200.0


Output()

epoch 75: Mean Reward = 200.0


Output()

epoch 76: Mean Reward = 200.0


Output()

epoch 77: Mean Reward = 200.0


Output()

epoch 78: Mean Reward = 200.0


Output()

epoch 79: Mean Reward = 200.0


Output()

epoch 80: Mean Reward = 200.0


Output()

epoch 81: Mean Reward = 200.0


Output()

epoch 82: Mean Reward = 200.0


Output()

epoch 83: Mean Reward = 200.0


Output()

epoch 84: Mean Reward = 200.0


Output()

epoch 85: Mean Reward = 200.0


Output()

epoch 86: Mean Reward = 200.0


Output()

epoch 87: Mean Reward = 200.0


Output()

epoch 88: Mean Reward = 200.0


Output()

epoch 89: Mean Reward = 200.0


Output()

epoch 90: Mean Reward = 200.0


Output()

epoch 91: Mean Reward = 200.0


Output()

epoch 92: Mean Reward = 200.0


Output()

epoch 93: Mean Reward = 200.0


Output()

epoch 94: Mean Reward = 200.0


Output()

epoch 95: Mean Reward = 200.0


Output()

epoch 96: Mean Reward = 200.0


Output()

epoch 97: Mean Reward = 200.0


Output()

epoch 98: Mean Reward = 200.0


epoch 99: Mean Reward = 200.0
