# RL Agent Test
---

## Imports

In [1]:
import os
import sys
current = os.getcwd()
if (os.path.basename(current) == 'rl_agent'):
    top_level_dir = os.path.dirname(os.getcwd())
else:
    top_level_dir = current
sys.path.append(os.path.abspath(top_level_dir))
os.chdir(top_level_dir)

%reload_ext autoreload
%autoreload 2

from math import pi
import numpy as np
from rl_agent.World import *
from rl_agent.Agent import *

import matplotlib.pyplot as plt

import tensorflow_probability as tfp
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import Model
print(f'TensorFlow version: {tf.__version__}')

2024-07-10 00:00:14.319395: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-10 00:00:14.635631: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-10 00:00:15.994405: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.16.1


## Instantiate Neural Networks for Policy and Q

So for now I am using pretty uninformed choices for neural network
architecture just to get this running asap, but we might want to 
keep the networks small even when we do this for real.

In [2]:
class Policy(Model):
    min_action = -pi
    max_action = pi
    
    def __init__(self):
        super().__init__()
        self.dense1 = Dense(512, activation='relu', input_shape=(12,))
        self.dense2 = Dense(256, activation='relu')
        self.dense3 = Dense(64, activation='sigmoid')
        self.dropout = Dropout(0.1)
        self.mu = Dense(2)
        self.sigma = Dense(2)    

    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        x = self.dropout(x)
        mu = self.mu(x)
        sigma = self.sigma(x)
        sigma = tf.math.softplus(sigma)
        sigma = tf.clip_by_value(sigma, 1e-2, 2*3.14)
        return mu, sigma 

# Create an instance of the model
policy = Policy()
inp = np.random.random((12,12))
print(f'input: \n{inp}\n')
mu, sigma = policy(inp)
print(f'mu: \n{mu}\nsigma: \n{sigma}\n')

dists = tfp.distributions.Normal(mu, sigma)

print(f'dists: \n:{dists}\n')

samples = dists.sample()
print(f'samples: \n{samples}\n')

print(f'log probs: {dists.log_prob(samples.numpy())}\n')

policy.summary()

Q = tf.keras.models.Sequential([
  tf.keras.layers.Dense(512, activation='relu', input_shape=(14,)),
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='sigmoid'),
  tf.keras.layers.Dropout(0.1),
  tf.keras.layers.Dense(1)
], name='Q(s,a)')
Q.summary()

input: 
[[0.21574525 0.10988273 0.05850682 0.03009997 0.81732478 0.80933166
  0.43208174 0.6822275  0.91312252 0.8399191  0.14135623 0.65051274]
 [0.53796858 0.25278412 0.04968544 0.12968618 0.05940228 0.86892154
  0.93777385 0.79703539 0.57650463 0.63977048 0.38018108 0.93927684]
 [0.80637813 0.81155471 0.21025867 0.25732067 0.41228665 0.51004588
  0.84040104 0.97844523 0.24849833 0.38439895 0.19444772 0.05307451]
 [0.71901133 0.37372816 0.81050098 0.09599546 0.56807678 0.79502995
  0.46275102 0.27385331 0.38542714 0.42736086 0.5696032  0.80254705]
 [0.54163532 0.40328563 0.60870626 0.13830386 0.12660007 0.31350151
  0.71776937 0.77945998 0.07465509 0.84562134 0.68192122 0.42129766]
 [0.09602168 0.49753659 0.65842885 0.96184159 0.41004064 0.26683698
  0.54215604 0.56716694 0.9940816  0.55559756 0.51498029 0.52906143]
 [0.81690111 0.71194521 0.06734113 0.06921543 0.4719599  0.83898403
  0.68170425 0.62833499 0.3320922  0.81385474 0.42534433 0.36528612]
 [0.93688932 0.98775873 0.1621199

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


mu: 
[[ 0.11291885 -0.51043475]
 [ 0.13734558 -0.5519241 ]
 [ 0.1246672  -0.5393718 ]
 [ 0.13641906 -0.54620826]
 [ 0.1343405  -0.5476348 ]
 [ 0.12599376 -0.52566004]
 [ 0.12380768 -0.5385176 ]
 [ 0.1301713  -0.5449206 ]
 [ 0.141285   -0.545404  ]
 [ 0.12114954 -0.5324259 ]
 [ 0.11972442 -0.5221084 ]
 [ 0.13851593 -0.5386995 ]]
sigma: 
[[1.5506083  0.51370573]
 [1.5480157  0.5161439 ]
 [1.5298654  0.51399255]
 [1.5359479  0.5175876 ]
 [1.5325989  0.5094009 ]
 [1.5553108  0.5094363 ]
 [1.5334777  0.51139534]
 [1.5395983  0.51065886]
 [1.530325   0.5129611 ]
 [1.5398266  0.5141945 ]
 [1.5354098  0.50917065]
 [1.5405512  0.51356226]]

dists: 
:tfp.distributions.Normal("Normal", batch_shape=[12, 2], event_shape=[], dtype=float32)

samples: 
[[-0.72375166 -0.7973194 ]
 [-1.8974857  -0.28456476]
 [ 1.1087118  -0.574577  ]
 [ 0.35302937  0.00859398]
 [-0.44551274 -1.2633338 ]
 [-1.219359   -0.9503228 ]
 [ 1.839913   -0.4107876 ]
 [-3.443985   -0.7227398 ]
 [ 0.4300957  -0.36488736]
 [-1.24905

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [5]:
x = np.random.random((10,3))
a = tf.ones((1,3))
with tf.GradientTape() as tape:
    tape.watch(a)
    y = a * x
print(y)
grad = tape.gradient(y, a)
print(f'sum grad: {tf.reduce_sum(x, axis=0)}')
print(f'len: {len(grad)} \n {grad}')

tf.Tensor(
[[0.57365286 0.99998564 0.45503005]
 [0.5470483  0.51108766 0.11713102]
 [0.06066374 0.3374873  0.43010595]
 [0.8523678  0.24602309 0.83591974]
 [0.6146412  0.55648106 0.9979505 ]
 [0.19782154 0.13640805 0.5214073 ]
 [0.60928446 0.4146341  0.2102699 ]
 [0.35024825 0.4078744  0.85510784]
 [0.84011066 0.84098536 0.18403797]
 [0.14271797 0.96552235 0.79604787]], shape=(10, 3), dtype=float32)
sum grad: [4.78855674 5.41648906 5.40300817]
len: 1 
 [[4.7885566 5.416489  5.4030085]]


## Instantiate World

In [6]:
world = ParallelTrackNEO(num_sails=8192)

## Instantiate Agent

In [7]:
agent = ParallelAgent(world, policy, Q, learning_rate_policy=0.00001, learning_rate_Q=0.00056)

## Train Agent

In [7]:
policy.load_weights('./checkpoints/policy.weights.h5')
Q.load_weights('./checkpoints/Q.weights.h5')

In [None]:
s = np.array((1,2,3,4,5,6,7,8,9,10,11,12))
s = np.expand_dims(s, axis=0)

preds = policy(s)
print(preds)

EPOCHS = 300
EPISODES = 5

def u(epoch, episode):
    prog_rem = 1 - (epoch * EPISODES + episode) / (EPOCHS * EPISODES - 1)
    f = lambda x: x
    maxval = 1.5
    endpt = 0.667
    return max((f(prog_rem) - f(1 - endpt)) / (f(1) - f(1 - endpt)), 0) * maxval

i_0 = 74

u_shifted = lambda epoch, episode: u(epoch + i_0, episode)

agent.train(300, EPISODES, EPOCHS-i_0, added_uncertainty=u)

preds = policy(s)
print(preds)

Output()

(<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 0.48627782, -0.26656562]], dtype=float32)>, <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[1.6943283, 0.4989843]], dtype=float32)>)


Output()

epoch 0: Mean Reward = -0.00010907821941455283


Output()

epoch 1: Mean Reward = -0.0020204012194123184


Output()

epoch 2: Mean Reward = -0.004701796356813348


Output()

epoch 3: Mean Reward = -0.004498408128258717


Output()

epoch 4: Mean Reward = -0.0031481099013101637


Output()

epoch 5: Mean Reward = -0.0025487167518063814


Output()

epoch 6: Mean Reward = -0.0022124001655933227


Output()

epoch 7: Mean Reward = -0.0032195566587914894


Output()

epoch 8: Mean Reward = -0.007196343728766033


Output()

epoch 9: Mean Reward = -0.005709761810797997


Output()

epoch 10: Mean Reward = 0.0059316610290162325


Output()

epoch 11: Mean Reward = -0.0029227613675247444


Output()

epoch 12: Mean Reward = -0.005125254015388032


Output()

epoch 13: Mean Reward = 0.003558603776420655


Output()

epoch 14: Mean Reward = 0.08397456438324444


Output()

epoch 15: Mean Reward = -0.002437997572648117


Output()

epoch 16: Mean Reward = -0.002742799685951118


Output()

epoch 17: Mean Reward = -0.0024641408715940184


Output()

epoch 18: Mean Reward = -0.004637494846304165


Output()

epoch 19: Mean Reward = -0.0043237315316112844


Output()

epoch 20: Mean Reward = -0.005183147988646252


Output()

epoch 21: Mean Reward = -0.005855364780353138


Output()

epoch 22: Mean Reward = -0.005978990933746555


Output()

epoch 23: Mean Reward = -0.005462800494682395


Output()

epoch 24: Mean Reward = 0.0031132510401293487


Output()

epoch 25: Mean Reward = -0.005035681055305026


Output()

epoch 26: Mean Reward = -0.0041276876117349644


Output()

epoch 27: Mean Reward = -0.0029284876942005803


Output()

epoch 28: Mean Reward = -0.0016769181947672931


Output()

epoch 29: Mean Reward = -0.004822565679381987


Output()

epoch 30: Mean Reward = -0.0028301566494639734


Output()

epoch 31: Mean Reward = 0.0032085411532121605


Output()

epoch 32: Mean Reward = -0.006299344331298751


Output()

epoch 33: Mean Reward = 0.0006151211533127434


Output()

epoch 34: Mean Reward = -0.002566701096988297


Output()

epoch 35: Mean Reward = -0.004596892947696407


Output()

epoch 36: Mean Reward = -0.0021122560865614965


Output()

epoch 37: Mean Reward = -0.004739807636812201


Output()

epoch 38: Mean Reward = -0.0027370967902059966


Output()

epoch 39: Mean Reward = -0.006470349194927193


Output()

epoch 40: Mean Reward = -0.006702523121852399


Output()

epoch 41: Mean Reward = 0.027180485556293037


Output()

epoch 42: Mean Reward = -0.006815565678375633


Output()

epoch 43: Mean Reward = -0.005897128018638037


Output()

epoch 44: Mean Reward = -0.002258428269313495


Output()

epoch 45: Mean Reward = -0.004840455375187588


Output()

epoch 46: Mean Reward = -0.004358064964872379


Output()

epoch 47: Mean Reward = -0.005273000438562385


Output()

epoch 48: Mean Reward = -0.002852952998140576


Output()

epoch 49: Mean Reward = -0.004848294490550583


Output()

epoch 50: Mean Reward = -0.0016634236984564153


Output()

epoch 51: Mean Reward = -0.0035448245458981845


Output()

epoch 52: Mean Reward = -0.0031343356893237476


Output()

epoch 53: Mean Reward = -0.003726491691288957


Output()

epoch 54: Mean Reward = -0.002962773853946346


Output()

epoch 55: Mean Reward = -0.007306951950939339


Output()

epoch 56: Mean Reward = 0.00010569710815645116


Output()

epoch 57: Mean Reward = -0.003647788266026358


Output()

epoch 58: Mean Reward = -0.0032430134384192357


Output()

epoch 59: Mean Reward = -0.005768719994733046


Output()

epoch 60: Mean Reward = -0.0023666577639362037


Output()

epoch 61: Mean Reward = 0.04244581973230818


Output()

epoch 62: Mean Reward = -0.0022681318098076025


Output()

epoch 63: Mean Reward = 0.07095665599393952


Output()

epoch 64: Mean Reward = 0.015901069869721444


Output()

epoch 65: Mean Reward = -0.007741067697595135


Output()

epoch 66: Mean Reward = -0.003462916190645774


Output()

epoch 67: Mean Reward = -0.00584494017910271


Output()

epoch 68: Mean Reward = -0.003354297744812246


Output()

epoch 69: Mean Reward = -0.007682459318462894


Output()

epoch 70: Mean Reward = -0.002255659713398886


Output()

epoch 71: Mean Reward = -0.005214138051269229


Output()

epoch 72: Mean Reward = 0.0020011129013737215


Output()

epoch 73: Mean Reward = -0.004687000996144795


Output()

epoch 74: Mean Reward = -0.0021573810822433414


In [None]:
r_mean, states = agent.training_step(100)
# print(states)
states = zip(*(state[:3] for state in states))
states = list(states)[0]
# print(states)

plt.plot([x for x in range(100)], states)

In [None]:
agent.train(100, 5, 100)

In [None]:
print(world.V)