# imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gymnasium as gym
import random
import torch
import warnings
import pytz
from datetime import datetime, time
from zoneinfo import ZoneInfo
from collections import deque
import os

from stable_baselines3 import PPO
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor

from gymnasium.envs.registration import register
from simglucose.simulation.scenario import CustomScenario

# Single seed shared by every random number generator used in this notebook.
SEED = 1

# Seed Python's, NumPy's and PyTorch's global generators with the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
del _seed_rng  # keep the notebook namespace free of the loop helper

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

from src.wrappers import ActionclipWrapper, FeatureWrapper, StackObsWrapper
from src.make_environment import make_env, make_env_stacked, make_env_stacked_for_patient
from src.reward_function import improved_reward
from src.utils import * 
print("import done")

  import pkg_resources


import done


# Register SimGlucose Gymnasium Environment

In [2]:
# The simulated day starts at 06:00 (naive local time) on the date this cell runs.
start_time = datetime.combine(datetime.today(), time(hour=6, minute=0))

# Deterministic meal plan: (hours after start_time, carbohydrates in grams).
scenario = CustomScenario(
    start_time=start_time,
    scenario=[
        (1, 45),   # 07:00 breakfast
        (6, 70),   # 12:00 lunch
        (10, 15),  # 16:00 snack
        (12, 80),  # 18:00 dinner
        (17, 10),  # 23:00 late snack
    ],
)


register(
    id="simglucose/adult1-debug-v0",
    # NOTE(review): "Gymnaisum" appears to be the spelling simglucose itself
    # exports for this class, so it must not be "corrected" here.
    entry_point="simglucose.envs:T1DSimGymnaisumEnv",
    # 480 steps at 3 minutes per step = 1440 minutes, i.e. a 24-hour episode.
    max_episode_steps=480,
    kwargs=dict(
        patient_name="adult#001",      # train on adult #1 only
        reward_fun=improved_reward,    # project-specific reward function
        custom_scenario=scenario,      # fixed meal plan defined above
    ),
)

# Train Baseline PPO 

In [None]:
def make_env_seeded(rank):
    """Return a zero-argument factory for one baseline environment.

    The environment is not created here; the returned callable builds it on
    demand (the form ``DummyVecEnv`` expects), seeded with ``SEED + rank`` so
    that each rank receives a distinct seed.

    Args:
        rank: Index of the environment within the vectorized set.

    Returns:
        A callable taking no arguments that returns ``make_env(seed=SEED + rank)``.
    """
    def build_env():
        env_seed = SEED + rank
        return make_env(seed=env_seed)

    return build_env

# Number of environment copies in the vectorized training set.
n_envs = 4

# One differently-seeded environment per rank, wrapped in a monitor that
# writes its log to the given CSV file.
vec_env = VecMonitor(
    DummyVecEnv([make_env_seeded(rank) for rank in range(n_envs)]),
    filename="ppo_train_monitor_4envs.csv",
)


# ---- Train PPO ----
# The default MlpPolicy network is used; to change the hidden layers, pass
# e.g. policy_kwargs=dict(net_arch=[128, 128]).
model = PPO(
    policy="MlpPolicy",
    env=vec_env,
    # rollout collection
    n_steps=1024,
    gamma=0.99,
    # optimisation
    learning_rate=3e-4,
    batch_size=64,
    n_epochs=10,
    ent_coef=0.1,
    # reproducibility and logging
    seed=SEED,
    verbose=1,
    tensorboard_log="./logs/",
)

model.learn(total_timesteps=100_000, tb_log_name="vanilla_ppo")

# # ---- Save ppo model ----
# la = pytz.timezone("America/Los_Angeles")
# timestamp = datetime.now(la).strftime("%Y-%m-%d_%H-%M")
# model.save(f"vanilla_ppo_{timestamp}")


# Train Stacked PPO 

In [None]:
def make_env_stacked_seeded(rank, k=4):
    """Return a zero-argument factory for one stacked-observation environment.

    The environment is not created here; the returned callable builds it on
    demand (the form ``DummyVecEnv`` expects), seeded with ``SEED + rank`` so
    that each rank receives a distinct seed.

    Args:
        rank: Index of the environment within the vectorized set.
        k: Forwarded unchanged to ``make_env_stacked`` (presumably the number
            of stacked observations -- confirm against ``src.make_environment``).
            Defaults to 4.

    Returns:
        A callable taking no arguments that returns
        ``make_env_stacked(seed=SEED + rank, k=k)``.
    """
    def _init():
        # Forward the caller's k; it was previously hard-coded to 4 here,
        # which silently ignored the argument.
        return make_env_stacked(seed=SEED + rank, k=k)
    return _init

# Number of stacked-observation environment copies in the vectorized set.
n_envs_stacked = 4

# One differently-seeded stacked environment per rank (default k of
# make_env_stacked_seeded), wrapped in a monitor that writes its log to the
# given CSV file.
vec_env_stacked = VecMonitor(
    DummyVecEnv(
        [make_env_stacked_seeded(rank) for rank in range(n_envs_stacked)]
    ),
    filename="ppo_train_monitor_stacked_4envs.csv",
)


# ---- Train PPO with stacked observations ----
# NOTE(review): gamma (0.9) and ent_coef (0.3) differ from the baseline run
# (0.99 and 0.1) -- confirm this is intentional before comparing results.
model_stacked = PPO(
    policy="MlpPolicy",
    env=vec_env_stacked,
    # rollout collection
    n_steps=1024,
    gamma=0.9,
    # optimisation
    learning_rate=3e-4,
    batch_size=64,
    n_epochs=10,
    ent_coef=0.3,
    # reproducibility and logging
    seed=SEED,
    verbose=1,
    tensorboard_log="./logs/",
)

model_stacked.learn(total_timesteps=100_000, tb_log_name="ppo_stacked")


# ---- Save stacked ppo model ----
# Tag the checkpoint with the current Los Angeles wall-clock time (minute
# resolution) so successive runs get distinct file names.
la = pytz.timezone("America/Los_Angeles")
timestamp = datetime.now(tz=la).strftime("%Y-%m-%d_%H-%M")
model_stacked.save("ppo_stacked_" + timestamp)
