In [None]:
import sys
import os


os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
sys.path.append("..")
from model_cpp.model_env_cpp import CellEnvironment, transform_densities
import cppCellModel
from DDPG_DENSNET.OUActionNoise import OUActionNoise
from DDPG_DENSNET.algorithm import get_actor, get_critic, policy, update_target, learn
import tensorflow as tf
from DDPG_DENSNET.Buffer import Buffer
# from model.cell_environment import CellEnvironment
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model
import tensorflow as tf
import json
import os
from sklearn.preprocessing import MinMaxScaler

# --- Experiment setup -------------------------------------------------------
# Everything in this section is a notebook-level global that the training loop
# and the scratch cells further down read directly.

# Environment instance available outside the training loop (the loop builds
# its own fresh instance per episode with the same arguments).
# NOTE(review): the meaning of the positional arguments is defined by
# CellEnvironment in model_cpp.model_env_cpp — confirm there.
env = CellEnvironment('segmentation', False, 'dose', 'AC', True)

# Online actor / critic networks that are trained directly.
actor_model = get_actor()
critic_model = get_critic()
# Uncomment to resume from the checkpoints written at the end of each episode.
#actor_model = load_model('tmp/actor_DENSNET.h5')
#critic_model = load_model('tmp/critic_model_DENSNET.h5')

actor_model.summary()
critic_model.summary()
# Target networks: separate instances of the same architectures, nudged
# towards the online networks by update_target() during training.
target_actor = get_actor()
target_critic = get_critic()

# Making the weights equal initially
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

# Learning rate for actor-critic models
critic_lr = 0.00006
actor_lr  = 0.00004

critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

# Number of training episodes to run.
total_episodes =3
# Discount factor for future rewards
gamma = 0.99
# Used to update target networks
tau = 0.0001

# Replay buffer shared by all episodes.
# NOTE(review): arguments are presumably (capacity, batch size) — confirm
# against DDPG_DENSNET.Buffer.
buffer = Buffer(50000, 64)

# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []

# Shape of one observation; the training loop reshapes states to this shape
# using the literal (50, 50, 3) rather than this variable.
observation_dimensions = (50, 50, 3)
# Per-episode statistics, keyed by episode number, dumped to JSON after each
# episode and again after training.
mean_reward, terminals, episodes, mean_dose, mean_time = {}, {}, {}, {}, {}

# Takes about 4 min to train


def _build_observation(environment):
    """Return the current environment state as a (50, 50, 3) tensor.

    The three channels are, in order, ``environment.observe()``, the glucose
    map and the oxygen map, each multiplied by the same scale factor the
    training code has always used.
    """
    channel_0 = np.array(environment.observe()).squeeze() * (255.0)
    channel_1 = cppCellModel.observeGlucose(environment.controller_capsule) * (255 / 5300)
    channel_2 = cppCellModel.observeOxygen(environment.controller_capsule) * (255 / 170000)
    # Stack on a new trailing axis so that [i, j, c] is map c at pixel (i, j).
    # np.array([...]).reshape((50, 50, 3)) only reinterprets the (3, 50, 50)
    # buffer and therefore interleaves the three maps instead of making them
    # channels. NOTE(review): any evaluation code that feeds these networks
    # must build its observations the same way.
    stacked = np.stack([channel_0, channel_1, channel_2], axis=-1)
    # The reshape is a no-op for 50x50 maps and fails loudly for anything else.
    return tf.convert_to_tensor(stacked.reshape((50, 50, 3)))


# Settings that do not change between steps or episodes.
cond = True        # forwarded unchanged as the 4th argument of policy()
std_dev1 = 0.001   # noise std-dev for the first action component
std_dev2 = 0.001   # noise std-dev for the second action component
patience = 50      # steps without a better moving-average reward before giving up

for ep in range(total_episodes):
    env = CellEnvironment('segmentation', False, 'dose', 'AC', True)
    _ = env.reset(-1)
    prev_state = _build_observation(env)

    # One noise process per action component, created once per episode.
    # Re-creating them on every step (as before) throws away whatever state
    # the process keeps between samples.
    # NOTE(review): assumes OUActionNoise is stateful, as in the Keras DDPG
    # example — confirm in DDPG_DENSNET.OUActionNoise.
    ou_noise = [OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev1) * np.ones(1)),
                OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev2) * np.ones(1))]

    episodic_reward = 0
    step_count = 0  # number of env.act() calls made in this episode
    sum_dose = 0
    sum_time = 0

    # early stopping (tracked on a moving average of per-step rewards)
    best_reward = -np.inf
    no_improvement_count = 0
    ep_list = []

    while True:
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

        action, saction = policy(actor_model, tf_prev_state, ou_noise, cond)
        # Receive reward and treatment statistics from the environment.
        reward, dose, time, KH = env.act(action)

        # Account for this step immediately so that the terminal step is
        # included in the per-episode means as well.
        step_count += 1
        sum_dose += dose
        sum_time += time

        state = _build_observation(env)
        done, which_terminal = env.inTerminalState()

        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward
        print(
            "Reward : {:.4f}  *  dose : {:.1f}  *  time : {:2}  *  epiReward : {:.4f}  *  Sampled Action : [{:.4f}, {:.4f}]  *  hcell_killed : {:5.2f}".format(
                reward, dose, time, episodic_reward, saction[0], saction[1], KH
            )
        )

        # Start learning only once the buffer holds more than 64 transitions.
        if buffer.buffer_counter > 64:
            learn(buffer,
                  target_actor, target_critic, critic_model, actor_model,
                  critic_optimizer, actor_optimizer,
                  gamma)
            update_target(target_actor.variables, actor_model.variables, tau)
            update_target(target_critic.variables, critic_model.variables, tau)

        # End this episode when `done` is True
        if done:
            terminals[ep] = which_terminal
            episodes[ep] = step_count
            break

        prev_state = state

        # Early stopping: compare the mean reward of the last 30 steps with
        # the best such mean seen so far in this episode.
        ep_list.append(reward)
        avg_ep = np.mean(ep_list[-30:])

        if avg_ep > best_reward:
            # actor_model.save('tmp/best_actor_DENSNET.h5')
            # critic_model.save('tmp/best_critic_model_DENSNET.h5')
            best_reward = avg_ep
            no_improvement_count = 0
        else:
            no_improvement_count += 1
            if no_improvement_count >= patience:
                print(f"No improvement for {patience} steps. Stopping early.")
                break

    ep_reward_list.append(episodic_reward)

    # Mean of last 5 episodes
    avg_reward = np.mean(ep_reward_list[-5:])
    print("Episode * {:3} * Avg Reward is ==> {:.4f}  *  Last Reward is ==> {:.4f}".format(ep, avg_reward, ep_reward_list[-1]))
    avg_reward_list.append(avg_reward)
    mean_reward[ep + 1] = avg_reward
    mean_dose[ep + 1] = sum_dose / step_count
    mean_time[ep + 1] = sum_time / step_count

    # Checkpoint the online networks after every episode.
    path = "./tmp"
    os.makedirs(path, exist_ok=True)

    actor_model.save('tmp/actor_DENSNET.h5')
    critic_model.save('tmp/critic_model_DENSNET.h5')

    # save mean reward, terminals, num of iterations per episode, mean dose, mean time
    # The statistics go to a different directory than the checkpoints, so it
    # has to be created as well.
    stats_dir = '../DDPG_DENSNET/tmp'
    os.makedirs(stats_dir, exist_ok=True)
    for stem, stats in (
        ('mean_reward', mean_reward),
        ('episodes', episodes),
        ('terminals', terminals),
        ('mean_dose', mean_dose),
        ('mean_time', mean_time),
    ):
        with open(f'{stats_dir}/{stem}_realtime.txt', 'w') as stats_file:
            stats_file.write(json.dumps(stats))

# Final checkpoint of the online networks once all episodes have finished.
path = "./tmp"
os.makedirs(path, exist_ok=True)  # no exists()/makedirs() race

actor_model.save('tmp/actor_DENSNET.h5')
critic_model.save('tmp/critic_model_DENSNET.h5')

# save mean reward, terminals, num of iterations per episode, mean dose, mean time
# These files live in a different directory than the checkpoints, so make sure
# it exists before opening anything for writing.
final_stats_dir = '../DDPG_DENSNET/tmp'
os.makedirs(final_stats_dir, exist_ok=True)
for stem, stats in (
    ('mean_reward', mean_reward),
    ('episodes', episodes),
    ('terminals', terminals),
    ('mean_dose', mean_dose),
    ('mean_time', mean_time),
):
    with open(f'{final_stats_dir}/{stem}.txt', 'w') as stats_file:
        stats_file.write(json.dumps(stats))

# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()


In [None]:
import gc
# Run a full garbage-collection pass; as the last expression of the cell, the
# number of unreachable objects found is displayed.
gc.collect()

In [None]:
# Query the actor on the last state batch left behind by the training cell,
# this time with cond=False, and display both returned values.
# NOTE(review): cond=False presumably changes how noise is applied — confirm
# in DDPG_DENSNET.algorithm.policy.
action ,saction = policy(actor_model, tf_prev_state, ou_noise, cond=False)
action, saction

In [None]:
# Mean of the first weight tensor of the actor's second-to-last layer.
actor_model.layers[-2].weights[0].numpy().mean()

In [None]:
# Sanity check: clipping a value that already lies in [0, 1] leaves it as is.
np.clip(9.9970794e-01, 0.0, 1.0)

In [None]:
from tqdm import tqdm
# Collect the raw (unscaled) observation maps of 100 freshly reset
# environments into X, one map per channel of the last axis.
env = CellEnvironment('segmentation', False, 'dose', 'AC', True)
X = np.zeros((100, 50, 50, 3))
for i in tqdm(range(100)):
    _ = env.reset(-1)
    obs_dim1 = np.array(env.observe()).squeeze()
    obs_dim2 = cppCellModel.observeGlucose(env.controller_capsule)
    obs_dim3 = cppCellModel.observeOxygen(env.controller_capsule)
    # Channel order: observe(), glucose, oxygen.
    for channel, plane in enumerate((obs_dim1, obs_dim2, obs_dim3)):
        X[i, :, :, channel] = plane

In [None]:
# Quick look at the running average reward recorded per episode.
plt.plot(avg_reward_list)

In [None]:
# Apply the last computed action once. env.act() returns four values (the
# training loop unpacks reward, dose, time and KH), so unpacking only three
# raises "ValueError: too many values to unpack".
reward, dose, time, KH = env.act(action)

In [None]:
# Display the current value of `reward`.
reward

In [None]:
# Largest value found in each of the three channels of X.
tuple(X[..., channel].max() for channel in range(3))

In [None]:
# Largest value found in each of the three channels of X.
tuple(X[..., channel].max() for channel in range(3))

In [None]:
# Largest value found in each of the three channels of X.
tuple(X[..., channel].max() for channel in range(3))

In [None]:
# Histogram of every value in channel 0 of X, across all samples and pixels.
plt.hist(X[..., 0].ravel())

In [None]:
# Display the current value of `action`.
action

In [None]:
# Manually checkpoint the current actor and critic to the same files the
# training cell writes (the training cell is what creates ./tmp).
actor_model.save('tmp/actor_DENSNET.h5')
critic_model.save('tmp/critic_model_DENSNET.h5')

In [None]:
import sys
import numpy as np
# Make the parent directory importable so the DDPG_DENSNET package resolves.
sys.path.append("..")
from DDPG_DENSNET.OUActionNoise import OUActionNoise
# Scratch experiment with a much larger std-dev than the 0.001 used in training.
std_dev = 0.2
# NOTE: this rebinds the global `ou_noise` to a single noise object, whereas
# the training cell passes policy() a list of two. Re-run the training cell
# before calling policy() with `ou_noise` again.
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))

In [None]:
# |sin(a)| scaled by 0.1 and by a linear decay that is 1 at a == 0 and 0 at
# a == max(a). NOTE: `a` is only assigned in the following cell
# (a = np.arange(50)), so that cell has to be run first.
(np.abs(np.sin(a))*0.1*(1-(a/np.max(a))))

In [None]:
import matplotlib.pyplot as plt
# Plot |sin(a)| and |cos(a)| for a = 0..49, each scaled by 0.2 and by a linear
# decay that falls from 1 at a == 0 to 0 at a == max(a).
a = np.arange(50)
linear_decay = 1 - (a / np.max(a))
plt.figure()
for wave in (np.sin, np.cos):
    plt.plot(np.abs(wave(a)) * 0.2 * linear_decay)
plt.show()