# RL Agent Test
---

## Imports

In [1]:
# `pi` is used for the action bounds declared on the Policy class.
from math import pi
import numpy as np
# Project-local modules, star-imported. SimpleTestWorld and AsyncAgent are used
# unqualified further down; presumably they come from these two modules
# (which module provides which name is not visible from this notebook).
from World import *
from Agent import *

import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_probability as tfp
# Star import: Dense and Dropout are used unqualified by the Policy class.
from tensorflow.keras.layers import *
from tensorflow.keras import Model
# Record the TensorFlow version next to the results produced with it.
print(f'TensorFlow version: {tf.__version__}')

2024-04-17 00:38:19.782332: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-17 00:38:19.801471: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.15.0


## Instantiate Neural Networks for Policy and Q

The network architectures below are largely arbitrary choices, made just to
get this running as soon as possible. That said, we may want to keep the
networks small even in the final version.

In [2]:
class Policy(Model):
    """Gaussian policy network producing a mean and a standard deviation.

    The input passes through three dense layers (64 relu, 32 relu, 16 sigmoid)
    and a dropout layer, and then feeds two separate one-unit heads:

    * ``mu``    - linear output.
    * ``sigma`` - softplus output, so it is never negative.

    The first dense layer declares ``input_shape=(12,)``, i.e. 12 input
    features per row.
    """

    # Action bounds. They are not referenced anywhere inside this class;
    # presumably callers use them to clip sampled actions - TODO confirm.
    min_action = -pi
    max_action = pi

    def __init__(self):
        super().__init__()
        self.dense1 = Dense(64, activation='relu', input_shape=(12,))
        self.dense2 = Dense(32, activation='relu')
        self.dense3 = Dense(16, activation='sigmoid')
        self.dropout = Dropout(0.1)
        self.mu = Dense(1)
        # NOTE(review): softplus can get arbitrarily close to 0 for very
        # negative pre-activations; consider a small floor on sigma if
        # log-probabilities ever blow up.
        self.sigma = Dense(1, activation='softplus')

    def call(self, x, training=None):
        """Run the forward pass.

        Args:
            x: batch of input rows to feed through the network.
            training: optional flag forwarded to the dropout layer. When left
                as ``None`` Keras resolves the mode from its own call context,
                so existing ``policy(x)`` calls behave as before.

        Returns:
            A ``(mu, sigma)`` tuple, each the output of a one-unit dense head.
        """
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        # Forward the flag explicitly so the dropout mode can be controlled
        # by the caller rather than only inferred by Keras.
        x = self.dropout(x, training=training)
        mu = self.mu(x)
        sigma = self.sigma(x)
        return mu, sigma

# Create an instance of the model and smoke-test it on random inputs.
policy = Policy()
# 12 rows of 12 features each, matching the input_shape declared on dense1.
inp = np.random.random((12,12))
print(f'input: \n{inp}\n')
# The model returns a (mu, sigma) tuple.
out = policy(inp)
print(f'out: \n{out}\n')

# Unpack (mu, sigma) into Normal's first two positional arguments.
dists = tfp.distributions.Normal(*out)
print(f'dists: \n{dists}\n')

samples = dists.sample()
print(f'samples: \n{samples}\n')

# log_prob accepts the sampled tensor directly; no NumPy round-trip is needed.
print(f'log probs: {dists.log_prob(samples)}\n')

policy.summary()

# Q network: same hidden stack as Policy but with a single linear output.
# It declares 13 input features - presumably the 12 state features plus the
# 1-d action (TODO confirm against how the agent builds its Q input).
Q = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, activation='relu', input_shape=(13,)),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dense(16, activation='sigmoid'),
  tf.keras.layers.Dropout(0.1),
  tf.keras.layers.Dense(1)
], name='Q(s,a)')
Q.summary()

input: 
[[0.36129589 0.21196218 0.09800635 0.62517637 0.9061955  0.74316705
  0.99695096 0.64575369 0.78209964 0.63553303 0.61627736 0.74154376]
 [0.56070461 0.22728301 0.52226786 0.32516431 0.28811749 0.92944179
  0.55452018 0.44609479 0.27453395 0.41676156 0.34825226 0.47948878]
 [0.94257536 0.92231718 0.19043734 0.86160115 0.95783438 0.44966425
  0.09691994 0.63922375 0.52316104 0.05382008 0.2527328  0.88431183]
 [0.87339255 0.56726674 0.06242077 0.97101095 0.41687611 0.22923319
  0.09300735 0.86579889 0.82354029 0.28388687 0.90975726 0.92125685]
 [0.28235728 0.39066901 0.36340597 0.11749306 0.58462195 0.919555
  0.55846282 0.31709875 0.01086222 0.07128453 0.72817997 0.12258632]
 [0.89355771 0.34787875 0.00756957 0.65466996 0.45415302 0.79846217
  0.29583362 0.75044401 0.82702918 0.5829731  0.48103281 0.99199877]
 [0.81779105 0.5404805  0.48934541 0.93154911 0.4282745  0.40616205
  0.70587333 0.16778909 0.64104378 0.92300443 0.85854886 0.00314637]
 [0.94814068 0.80837778 0.81744709 

In [3]:
# Sanity check of how tf.GradientTape reduces the gradient of a non-scalar
# target: y = a * x broadcasts the (1, 3) tensor `a` over the 10 rows of `x`.
x = np.random.random((10,3))
a = tf.ones((1,3))
with tf.GradientTape() as tape:
    # `a` is a plain tensor, not a tf.Variable, so it must be watched explicitly.
    tape.watch(a)
    y = a * x
print(y)
# Gradient of y with respect to a; it has the same (1, 3) shape as `a`.
grad = tape.gradient(y, a)
# Column sums of x, printed for comparison with the gradient below (the label
# says 'sum grad', but the value is computed from x, not from grad).
print(f'sum grad: {tf.reduce_sum(x, axis=0)}')
# len(grad) is the size of the leading axis of the (1, 3) gradient, i.e. 1.
print(f'len: {len(grad)} \n {grad}')

tf.Tensor(
[[0.3780488  0.30356765 0.02631802]
 [0.03748018 0.11745517 0.46502805]
 [0.36419058 0.5761398  0.07235633]
 [0.24416576 0.9348069  0.25601307]
 [0.26073918 0.2119578  0.8737608 ]
 [0.3732553  0.7675999  0.3952509 ]
 [0.2583051  0.47000706 0.5914942 ]
 [0.10158319 0.6448231  0.04319067]
 [0.07854597 0.620553   0.7146234 ]
 [0.28416157 0.17866756 0.76604205]], shape=(10, 3), dtype=float32)
sum grad: [2.38047564 4.82557787 4.20407754]
len: 1 
 [[2.3804758 4.8255777 4.2040777]]


## Instantiate World

In [4]:
# Environment handed to the agent below. SimpleTestWorld is not defined in this
# notebook; presumably it comes from the star-imported World module.
world = SimpleTestWorld()

## Instantiate Agent

In [5]:
# Wire the environment, the policy network and the Q network into one agent.
# The two learning rates are hand-picked literals (1e-5 for the policy,
# 5.6e-4 for Q).
agent = AsyncAgent(
    world,
    policy,
    Q,
    learning_rate_policy=1e-5,
    learning_rate_Q=5.6e-4,
)

## Train Agent

In [7]:
%reload_ext autoreload
%autoreload 2

from Agent import *

s = (1,2,3,4,5,6,7,8,9,10,11,12)

preds = agent._sample_policy(s)
print(preds)

agent.train(100, 5, 1)

preds = agent._sample_policy(s)
print(preds)

0.72524583
Task 0 Complete!
4
Task 1 Complete!
3
Task 2 Complete!
2
Task 3 Complete!
1
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
NOT ACTIVE
Task 4 Complete!
0
JOINED
JOINED
JOINED
JOINED
JOINED
mean reward: -20725.713456625755
mean reward: -23140.763411558943
mean reward: -25693.145370801783
mean reward: -20441.536392841368
mean reward: -22336.293012437243
0.83673644


In [None]:
# Run agent.training_step once and plot the first component of every state it
# returns.
r_mean, step_states = agent.training_step(100)

# Keep only the first component of each returned state. `states` stays bound to
# this tuple afterwards, so any later cell reading `states` sees the same value
# as before.
states = tuple(state[0] for state in step_states)

# Size the x-axis from the data instead of repeating the literal 100, so the
# plot cannot fail with a length mismatch if the number of returned states
# differs from the argument passed to training_step.
plt.plot(range(len(states)), states)
plt.xlabel('state index')
plt.ylabel('state[0]');

In [12]:
# Second training run on the same agent. The first two positional arguments
# match the earlier agent.train(100, 5, 1) call; only the third changes
# (1 -> 100).
# NOTE(review): the meaning of the three positional arguments is defined by the
# agent's train method, which is not visible in this notebook - consider
# passing them as keywords.
agent.train(100, 5, 100)

JOINED
JOINED
JOINED
JOINED
JOINED
mean reward: -27090.16821320598
mean reward: -43452.63353015997
mean reward: -26447.28121447016
mean reward: -39456.08928110679
mean reward: -31332.708278650123
