# RL Agent Test
---

## Imports

In [1]:
import os
import sys
current = os.getcwd()
if (os.path.basename(current) == 'rl_agent'):
    top_level_dir = os.path.dirname(os.getcwd())
else:
    top_level_dir = current
sys.path.append(os.path.abspath(top_level_dir))
os.chdir(top_level_dir)

%reload_ext autoreload
%autoreload 2

from math import pi
import numpy as np
from rl_agent.World import *
from rl_agent.Agent import *

import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import *
from tensorflow.keras import Model
print(f'TensorFlow version: {tf.__version__}')

2024-05-28 11:55:12.370879: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-28 11:55:12.483188: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.15.0


## Instantiate Neural Networks for Policy and Q

So for now I am using pretty uninformed choices for neural network
architecture just to get this running asap, but we might want to 
keep the networks small even when we do this for real.

In [2]:
class Policy(Model):
    min_action = -pi
    max_action = pi
    
    def __init__(self):
        super().__init__()
        self.dense1 = Dense(64, activation='relu', input_shape=(12,))
        self.dense2 = Dense(32, activation='relu')
        self.dense3 = Dense(16, activation='sigmoid')
        self.dropout = Dropout(0.1)
        self.mu = Dense(2)
        self.sigma = Dense(2, activation='softplus')    

    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        x = self.dropout(x)
        mu = self.mu(x)
        sigma = self.sigma(x)
        return mu, sigma 

# Create an instance of the model
policy = Policy()
inp = np.random.random((12,12))
print(f'input: \n{inp}\n')
out = policy(inp)
print(f'out: \n{out}\n')

dists = tfp.distributions.Normal(*out)
print(f'dists: \n:{dists}\n')

samples = dists.sample()
print(f'samples: \n{samples}\n')

print(f'log probs: {dists.log_prob(samples.numpy())}\n')

policy.summary()

Q = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, activation='relu', input_shape=(14,)),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dense(16, activation='sigmoid'),
  tf.keras.layers.Dropout(0.1),
  tf.keras.layers.Dense(1)
], name='Q(s,a)')
Q.summary()

input: 
[[0.89089983 0.95298144 0.9059315  0.11932926 0.8050881  0.22540096
  0.98777867 0.36924338 0.83736882 0.0030349  0.67462625 0.84088234]
 [0.27208392 0.86886455 0.2120592  0.37161984 0.33431432 0.91715517
  0.39761219 0.4084835  0.86157028 0.62362673 0.10065921 0.64206402]
 [0.43522057 0.58149465 0.07220851 0.68564384 0.9369819  0.37389691
  0.63706199 0.24267845 0.40860661 0.92381217 0.32863031 0.10431835]
 [0.68220894 0.77571316 0.2820934  0.99055826 0.25983036 0.2889713
  0.86186534 0.74197025 0.05819626 0.11379321 0.09623603 0.14810192]
 [0.67717048 0.28667798 0.89050913 0.23011771 0.00768776 0.41337324
  0.28942213 0.9501767  0.30222991 0.85409321 0.0983092  0.8192714 ]
 [0.87115856 0.078061   0.90358854 0.71710551 0.70862932 0.35343858
  0.22799231 0.67016844 0.22101572 0.55708077 0.20279878 0.93289569]
 [0.88494369 0.1934783  0.18960846 0.56395427 0.93767722 0.57615125
  0.94360343 0.42277964 0.24298693 0.32973549 0.22916943 0.30790077]
 [0.28867142 0.21392562 0.72615103

In [3]:
x = np.random.random((10,3))
a = tf.ones((1,3))
with tf.GradientTape() as tape:
    tape.watch(a)
    y = a * x
print(y)
grad = tape.gradient(y, a)
print(f'sum grad: {tf.reduce_sum(x, axis=0)}')
print(f'len: {len(grad)} \n {grad}')

tf.Tensor(
[[0.01290222 0.57355696 0.22830121]
 [0.60535055 0.97910416 0.5047759 ]
 [0.8916844  0.6320375  0.42519212]
 [0.26451167 0.04029564 0.01113232]
 [0.8684255  0.21857104 0.75444806]
 [0.43278542 0.3537271  0.04956343]
 [0.8051149  0.8991891  0.7634457 ]
 [0.96536773 0.7531593  0.5429096 ]
 [0.9140347  0.00581806 0.77771956]
 [0.7628635  0.14116357 0.5663116 ]], shape=(10, 3), dtype=float32)
sum grad: [6.52304062 4.59662251 4.62379947]
len: 1 
 [[6.523041  4.5966215 4.6237993]]


## Instantiate World

In [4]:
world = ParallelTrackNEO()

## Instantiate Agent

In [5]:
agent = ParallelAgent(world, policy, Q, learning_rate_policy=0.00001, learning_rate_Q=0.00056)

## Train Agent

In [12]:
s = np.array((1,2,3,4,5,6,7,8,9,10,11,12))
s = np.expand_dims(s, axis=0)

preds = policy(s)
print(preds)

agent.train(100, 5, 1)

preds = policy(s)
print(preds)

Output()

(<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[1.4530818 , 0.15883997]], dtype=float32)>, <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[1.3200785 , 0.10828248]], dtype=float32)>)


Output()

epoch 0: Mean Reward = [9.80700812e-17]


Output()

epoch 1: Mean Reward = [9.80698539e-17]


Output()

epoch 2: Mean Reward = [9.80698534e-17]


Output()

epoch 3: Mean Reward = [9.80698534e-17]


epoch 4: Mean Reward = [9.80698534e-17]
(<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[1.4530818 , 0.15883997]], dtype=float32)>, <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[1.5243785e-08, 7.2114921e-07]], dtype=float32)>)


In [7]:
r_mean, states = agent.training_step(100)
# print(states)
states = zip(*(state[:3] for state in states))
states = list(states)[0]
# print(states)

plt.plot([x for x in range(100)], states)

(50, 2)


ValueError: operands could not be broadcast together with shapes (50,2,3) (50,1,2) 

In [23]:
agent.train(100, 5, 100)