# RL Agent Test
---

## Imports

In [1]:
import os
import sys
# Make the project root both importable and the working directory, whether the
# notebook is launched from the root itself or from its `rl_agent` subfolder.
current = os.getcwd()
top_level_dir = (
    os.path.dirname(current)
    if os.path.basename(current) == 'rl_agent'
    else current
)
sys.path.append(os.path.abspath(top_level_dir))
os.chdir(top_level_dir)

%reload_ext autoreload
%autoreload 2

from math import pi
import numpy as np
from rl_agent.World import *
from rl_agent.Agent import *

import matplotlib.pyplot as plt

import tensorflow_probability as tfp
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import Model
print(f'TensorFlow version: {tf.__version__}')

2024-06-28 00:26:16.313616: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-28 00:26:16.592489: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-28 00:26:17.613584: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.16.1


## Instantiate Neural Networks for Policy and Q

For now the network architectures are fairly arbitrary choices, made
only to get things running quickly. Even for the real experiments we
may still want to keep the networks small.

In [2]:
class Policy(Model):
    """Gaussian policy head: maps a batch of states to per-action (mu, sigma).

    The network is a three-layer MLP (512 relu -> 256 relu -> 64 sigmoid)
    followed by dropout and two parallel linear heads of width 2, one for the
    mean and one for the (pre-activation) standard deviation.

    The layers are built lazily on the first call, so no input shape is
    declared here. The notebook feeds it 12-feature state vectors.
    """

    # Bounds of the action range. Not referenced by the methods of this class.
    min_action = -pi
    max_action = pi

    # Clipping range for the predicted standard deviation: a small positive
    # floor, and a ceiling equal to the full width of the action range (2*pi).
    min_sigma = 1e-2
    max_sigma = 2 * pi

    def __init__(self):
        super().__init__()
        # No `input_shape` kwarg: Keras warns when it is passed to a layer,
        # and a subclassed model infers the shape on its first call anyway.
        self.dense1 = Dense(512, activation='relu')
        self.dense2 = Dense(256, activation='relu')
        self.dense3 = Dense(64, activation='sigmoid')
        self.dropout = Dropout(0.1)
        self.mu = Dense(2)
        self.sigma = Dense(2)

    def call(self, x):
        """Run the forward pass.

        Args:
            x: batch of state vectors, one row per state.

        Returns:
            Tuple ``(mu, sigma)`` of tensors with one row per input state and
            two columns each. ``sigma`` is strictly positive and clipped to
            ``[min_sigma, max_sigma]``.
        """
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        x = self.dropout(x)
        mu = self.mu(x)
        sigma = self.sigma(x)
        # softplus keeps sigma positive; the clip keeps it in a usable range.
        sigma = tf.math.softplus(sigma)
        sigma = tf.clip_by_value(sigma, self.min_sigma, self.max_sigma)
        return mu, sigma

# Create an instance of the model and smoke-test it on a random batch
# (12 rows of 12 features each).
policy = Policy()
inp = np.random.random((12,12))
print(f'input: \n{inp}\n')
mu, sigma = policy(inp)
print(f'mu: \n{mu}\nsigma: \n{sigma}\n')

# One independent Normal per (row, action dimension) pair.
dists = tfp.distributions.Normal(mu, sigma)

print(f'dists: \n:{dists}\n')

samples = dists.sample()
print(f'samples: \n{samples}\n')

# `log_prob` accepts the sampled tensor directly; no NumPy round-trip needed.
print(f'log probs: {dists.log_prob(samples)}\n')

policy.summary()

# Q network: 14 input features -> scalar value.
# NOTE(review): 14 is presumably 12 state features + 2 action components; confirm.
# The input shape is declared with an explicit `Input` object rather than an
# `input_shape=` kwarg on the first Dense layer, which Keras warns about.
Q = tf.keras.models.Sequential([
  tf.keras.Input(shape=(14,)),
  tf.keras.layers.Dense(512, activation='relu'),
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='sigmoid'),
  tf.keras.layers.Dropout(0.1),
  tf.keras.layers.Dense(1)
], name='Q(s,a)')
Q.summary()

input: 
[[0.42748666 0.9458392  0.9460415  0.35670117 0.83005764 0.91816991
  0.58487537 0.51552538 0.68525477 0.72309771 0.34377917 0.72340027]
 [0.47212723 0.13208062 0.73505553 0.95417377 0.28065273 0.91705073
  0.2803358  0.04114788 0.76907146 0.84822475 0.36851814 0.26537574]
 [0.54229446 0.38571328 0.99404726 0.04492365 0.30922848 0.62649696
  0.25752389 0.25275324 0.60796529 0.01884727 0.59945147 0.81747342]
 [0.32671238 0.37290311 0.53306459 0.42351307 0.83436283 0.85074474
  0.50593595 0.99254072 0.20639378 0.5568514  0.69033099 0.50587156]
 [0.42972094 0.57500695 0.77652682 0.35926912 0.19726382 0.93156316
  0.701689   0.13531798 0.23869297 0.80900829 0.56743514 0.80692068]
 [0.59600368 0.93452305 0.31340575 0.23751725 0.7854374  0.08155778
  0.1305707  0.8668369  0.27749344 0.64670944 0.81267529 0.0551046 ]
 [0.43809052 0.11886712 0.14210065 0.56710102 0.57381682 0.75078019
  0.55218433 0.03191328 0.87484657 0.4805679  0.2564551  0.50024585]
 [0.4854429  0.07305427 0.9475330

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


dists: 
:tfp.distributions.Normal("Normal", batch_shape=[12, 2], event_shape=[], dtype=float32)

samples: 
[[-0.8365582   0.06716502]
 [-0.5356643  -0.6736687 ]
 [-0.68408936 -0.7206426 ]
 [-0.11661416 -0.21892415]
 [-0.9122709  -0.6640986 ]
 [-0.34905568  0.14937052]
 [ 0.00490189 -0.7393435 ]
 [-0.6371808  -0.41522956]
 [-1.1422186  -0.339051  ]
 [ 0.27249408 -0.3461929 ]
 [-1.0238682  -0.646448  ]
 [-0.24048889 -0.06649816]]

log probs: [[-0.38674778 -0.97476983]
 [-0.32057834 -0.12450704]
 [-0.2984279  -0.15991469]
 [-0.7006123  -0.16079015]
 [-0.45702535 -0.08347452]
 [-0.39894927 -1.1866605 ]
 [-1.0028068  -0.2189225 ]
 [-0.29203245  0.05061031]
 [-0.8273389  -0.00885453]
 [-1.6923132  -0.00317501]
 [-0.6256533  -0.08977203]
 [-0.5178704  -0.5208465 ]]



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [3]:
# Sanity check of how GradientTape reduces over a broadcast dimension:
# `a` has shape (1, 3) and is broadcast against the (10, 3) array `x`.
x = np.random.random(size=(10, 3))
a = tf.ones(shape=(1, 3))

with tf.GradientTape() as tape:
    # `a` is a plain tensor, not a Variable, so it must be watched explicitly.
    tape.watch(a)
    y = a * x

print(y)

grad = tape.gradient(target=y, sources=a)

# Column sums of `x`, printed next to the gradient for comparison.
print(f'sum grad: {tf.reduce_sum(x, axis=0)}')
print(f'len: {len(grad)} \n {grad}')

tf.Tensor(
[[0.1722055  0.5455282  0.8676614 ]
 [0.33518815 0.15592375 0.32361728]
 [0.03432227 0.36234856 0.8238042 ]
 [0.34115583 0.24849282 0.3662879 ]
 [0.5952342  0.35049495 0.06810068]
 [0.5452762  0.93510777 0.655711  ]
 [0.19297367 0.46223673 0.12422057]
 [0.8556645  0.53312844 0.64928627]
 [0.6593245  0.2836185  0.3427189 ]
 [0.1626291  0.23043735 0.6069645 ]], shape=(10, 3), dtype=float32)
sum grad: [3.89397396 4.10731712 4.82837269]
len: 1 
 [[3.8939738 4.107317  4.828373 ]]


## Instantiate World

In [4]:
# Build the simulation world with 8192 sails.
# NOTE(review): presumably the sails are simulated in parallel; confirm in rl_agent.World.
world = ParallelTrackNEO(num_sails=8192)

## Instantiate Agent

In [5]:
# Wire the world, the policy network and the Q network into one agent.
# The policy and Q are given separate learning rates.
agent = ParallelAgent(
    world,
    policy,
    Q,
    learning_rate_policy=1e-5,
    learning_rate_Q=5.6e-4,
)

## Train Agent

In [6]:
# Restore previously saved weights for both networks so training resumes
# instead of starting from scratch. The paths are relative to the current
# working directory, which the imports cell changes to the project root.
policy.load_weights('./checkpoints/policy.weights.h5')
Q.load_weights('./checkpoints/Q.weights.h5')

In [None]:
# Fixed probe state (batch of one, 12 features) used to compare the policy's
# output before and after training.
s = np.array((1,2,3,4,5,6,7,8,9,10,11,12))
s = np.expand_dims(s, axis=0)

preds = policy(s)
print(preds)

EPOCHS = 750
EPISODES = 5

def u(epoch, episode):
    """Added-uncertainty schedule over the full EPOCHS x EPISODES run.

    Returns 1.5 while at least half of the run remains, then falls linearly,
    reaching 0 when the run is complete.
    """
    progress_remaining = 1 - (epoch * EPISODES + episode) / (EPOCHS * EPISODES)
    return min(progress_remaining / 0.5, 1) * 1.5

# Epoch offset for resuming.
# NOTE(review): presumably the sum of epochs completed in earlier runs; confirm.
i_0 = 55 + 88 + 75 + 23 + 74

def u_shifted(epoch, episode):
    """Schedule `u` evaluated as if training had already run for `i_0` epochs."""
    return u(epoch + i_0, episode)

# Only EPOCHS - i_0 epochs are run here, so the schedule must be the shifted
# one. The unshifted `u` would restart from the beginning and never reach 0.
agent.train(300, EPISODES, EPOCHS-i_0, added_uncertainty=u_shifted)

preds = policy(s)
print(preds)

Output()

(<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 1.6609491, -0.6109071]], dtype=float32)>, <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.01, 0.01]], dtype=float32)>)


Output()

epoch 0: Mean Reward = 0.0011144552022213441


Output()

epoch 1: Mean Reward = 0.0004018601537040011


Output()

epoch 2: Mean Reward = 0.002318058661872676


Output()

epoch 3: Mean Reward = 0.0004939402980151933


Output()

epoch 4: Mean Reward = 0.0005460370048455229


Output()

epoch 5: Mean Reward = 0.0001933205564154689


Output()

epoch 6: Mean Reward = 0.00014909616326115458


Output()

epoch 7: Mean Reward = 0.0005122492243877097


Output()

epoch 8: Mean Reward = 0.00016549200661707271


Output()

epoch 9: Mean Reward = 0.0004397604895474565


Output()

epoch 10: Mean Reward = 0.00016415304973364968


Output()

epoch 11: Mean Reward = 0.012213882328643861


Output()

epoch 12: Mean Reward = 0.0002267242029293181


Output()

epoch 13: Mean Reward = 0.00017382692932503148


Output()

epoch 14: Mean Reward = 0.0014605647518941864


Output()

epoch 15: Mean Reward = 0.01542167920366226


Output()

epoch 16: Mean Reward = 0.000404498396710215


Output()

epoch 17: Mean Reward = 0.00015803852686185882


Output()

epoch 18: Mean Reward = 0.0004822588906294383


Output()

epoch 19: Mean Reward = 0.0001855040805860687


Output()

epoch 20: Mean Reward = 0.00017032513130360713


Output()

epoch 21: Mean Reward = 0.004064281052282268


Output()

epoch 22: Mean Reward = 0.0002540271766003742


Output()

epoch 23: Mean Reward = 0.0010206720997776183


Output()

epoch 24: Mean Reward = 0.0001529391039640306


Output()

epoch 25: Mean Reward = 0.00048956983208898


Output()

epoch 26: Mean Reward = 0.0001480361152422026


Output()

epoch 27: Mean Reward = 0.002445408721471945


Output()

epoch 28: Mean Reward = 0.00014851900370245607


Output()

epoch 29: Mean Reward = 0.0001632653513802653


Output()

epoch 30: Mean Reward = 0.0007952271005177004


Output()

epoch 31: Mean Reward = 0.0004003165047621063


Output()

epoch 32: Mean Reward = 0.00022856939749967437


Output()

epoch 33: Mean Reward = 0.0008421872588711878


Output()

epoch 34: Mean Reward = 0.0017841565312970338


Output()

epoch 35: Mean Reward = 0.00040981356138222204


Output()

epoch 36: Mean Reward = 0.00020911859609739822


Output()

epoch 37: Mean Reward = 0.00023860294690428434


Output()

epoch 38: Mean Reward = 0.0007277211275928394


Output()

epoch 39: Mean Reward = 0.005981318148623255


Output()

epoch 40: Mean Reward = 0.00015698242886784234


Output()

epoch 41: Mean Reward = 0.0014086421177015554


Output()

epoch 42: Mean Reward = 0.0003893986460464894


Output()

epoch 43: Mean Reward = 0.00016976353483972673


Output()

epoch 44: Mean Reward = 0.00015638863942825365


Output()

epoch 45: Mean Reward = 0.0005537945626564206


Output()

epoch 46: Mean Reward = 0.0009658497655471684


Output()

epoch 47: Mean Reward = 0.00018063498029459668


Output()

epoch 48: Mean Reward = 0.00016216128243043242


Output()

epoch 49: Mean Reward = 0.0002469615175048239


Output()

epoch 50: Mean Reward = 0.00018681314213538062


Output()

epoch 51: Mean Reward = 0.00021725295536580123


Output()

epoch 52: Mean Reward = 0.0010669672262243303


Output()

epoch 53: Mean Reward = 0.0002730515198259236


Output()

epoch 54: Mean Reward = 0.00015699163524634754


Output()

epoch 55: Mean Reward = 0.00016308372100696166


Output()

epoch 56: Mean Reward = 0.00016866313235048983


Output()

epoch 57: Mean Reward = 0.0002794785805847715


Output()

epoch 58: Mean Reward = 0.0003065279612699352


Output()

epoch 59: Mean Reward = 0.0002219640709376352


Output()

epoch 60: Mean Reward = 0.002697836588330651


Output()

epoch 61: Mean Reward = 0.004323074977424551


Output()

epoch 62: Mean Reward = 0.00033277105377501834


Output()

epoch 63: Mean Reward = 0.00040681442842288834


Output()

epoch 64: Mean Reward = 0.0001815418251311925


Output()

epoch 65: Mean Reward = 0.0008549653006105519


Output()

epoch 66: Mean Reward = 0.0001581278697342644


Output()

epoch 67: Mean Reward = 0.0002477190644629926


Output()

epoch 68: Mean Reward = 0.0002864789670154868


Output()

epoch 69: Mean Reward = 0.0007343393497859718


Output()

epoch 70: Mean Reward = 0.000177963012070185


Output()

epoch 71: Mean Reward = 0.0022729028866780776


Output()

epoch 72: Mean Reward = 0.001659601114554007


Output()

epoch 73: Mean Reward = 0.002647647246634838


Output()

epoch 74: Mean Reward = 0.00035077478204915833


In [None]:
r_mean, states = agent.training_step(100)

# Take the first three entries of every state and transpose them, so that
# row k holds entry k of each state. Only row 0 is kept for plotting.
states = list(zip(*[state[:3] for state in states]))
states = states[0]

# x axis: step indices 0..99, matching the 100 passed to training_step.
plt.plot(list(range(100)), states)

In [None]:
# Further training run with literal arguments and no `added_uncertainty`,
# so the agent's default for that argument applies.
# NOTE(review): the positional arguments presumably follow the same order as
# in the earlier `agent.train(300, EPISODES, EPOCHS-i_0, ...)` call; confirm.
agent.train(100, 5, 100)

In [None]:
# Inspect the world's `V` attribute. Its meaning is defined in rl_agent.World
# and is not visible from this notebook.
print(world.V)