In [1]:
import sys
import os
import numpy as np
from pathlib import Path
import datetime 
import matplotlib
import matplotlib.pyplot as plt

import torch
from torch import nn
from torchvision import transforms as T
from PIL import Image

from models.RLModelWrapper import TDWrapper
from models.Builders import build_fc_dqnn
from loggers import MetricLogger

# Gym is an OpenAI toolkit for RL
import gymnasium as gym
from gymnasium.spaces import Box
from gymnasium.wrappers import FrameStack, GrayScaleObservation

# NES Emulator for OpenAI Gym
# from nes_py.wrappers import JoypadSpace

# Super Mario environment for OpenAI Gym
# import gym_super_mario_bros

# Bare expression: its value is discarded here because it is not the cell's last statement.
gym.envs.registration.registry.keys()

# TODO: Make into reproducibility function call
# Seed the torch and numpy global RNGs with the same fixed value.
seed = 0
torch.manual_seed(seed)
np.random.seed(seed)
# torch.backends.cudnn.benchmark = False
# This is going to be necessary for perfect reproducibility
# print(os.environ["CUBLAS_WORKSPACE_CONFIG"])
# torch.use_deterministic_algorithms(True)


# `display` is only imported when matplotlib reports an inline backend;
# plot_values checks the same `is_ipython` flag before touching it.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch matplotlib to interactive mode so figures refresh while training runs.
plt.ion()

<contextlib.ExitStack at 0x7fc080546090>

In [2]:
def decay_to_halflife(decay: float) -> float:
    """Convert a per-step multiplicative decay into its half-life in steps.

    Solves ``p * decay**halflife == 0.5 * p`` for ``halflife``.

    Args:
        decay: Factor the quantity is multiplied by on every step.

    Returns:
        The number of steps after which the quantity has halved.
    """
    log_half = np.log(.5)
    log_decay = np.log(decay)
    return log_half / log_decay

def halflife_to_decay(halflife: float) -> float:
    """Convert a half-life in steps into the matching per-step decay factor.

    Solves ``p * decay**halflife == 0.5 * p`` for ``decay``.

    Args:
        halflife: Number of steps after which the quantity should have halved.

    Returns:
        The multiplicative factor to apply on every step.
    """
    exponent = 1.0 / halflife
    return np.power(.5, exponent)

# Sanity check: half-life, in steps, of a per-step decay of 0.9999986.
print(decay_to_halflife(.9999986))
# NOTE(review): this result is computed and dropped -- it is neither printed
# nor the last expression of the cell, so nothing is displayed for it.
halflife_to_decay(1000)

def plot_means(values_t: torch.Tensor, meanlen: int):
    """Overlay the ``meanlen``-point moving average of ``values_t`` on the current plot.

    Args:
        values_t: 1-D tensor of per-episode values.
        meanlen: Window length of the moving average.

    Nothing is drawn when ``values_t`` holds fewer than ``meanlen`` entries.
    """
    if len(values_t) < meanlen:
        return
    # unfold(0, meanlen, 1) produces every length-`meanlen` sliding window.
    means = values_t.unfold(0, meanlen, 1).mean(1).view(-1)
    # Left-pad with zeros so the curve lines up index-for-index with the raw series.
    means = torch.cat((torch.zeros(meanlen - 1), means))
    plt.plot(means.numpy())
    
def plot_values(values, show_result=False, stop_episode=None):
    """Redraw figure 1 with the raw per-episode values and their 100-point moving average.

    Args:
        values: Sequence of per-episode numbers to plot.
        show_result: When true the figure is titled 'Result' and is not
            cleared first; otherwise it is cleared and titled 'Training...'.
        stop_episode: Optional x position at which a red vertical line is drawn.
    """
    plt.figure(1)
    series = torch.tensor(values, dtype=torch.float)

    if not show_result:
        # Live-training view: start from a blank figure on every refresh.
        plt.clf()
    plt.title('Result' if show_result else 'Training...')

    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(series.numpy())
    plot_means(series, 100)

    if stop_episode is not None:
        plt.axvline(stop_episode, c = 'r')

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        display.display(plt.gcf())
        if not show_result:
            # Let the next refresh replace this output instead of stacking below it.
            display.clear_output(wait=True)

495104.7823834813


In [3]:

# Create the CartPole environment shared by the later cells, then take one
# step to inspect what the API hands back.
env = gym.make("CartPole-v1", apply_api_compatibility=False)

# reset() is unpacked as a 2-tuple; the second element is ignored here.
obs, _ = env.reset()
# step() is unpacked into five values; `trunc` is captured but not printed below.
next_state, reward, done, trunc, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")
print("============================")

(4,),
 1.0,
 False,
 {}


In [4]:
# Model Definition 
# Single configuration mapping handed to both build_fc_dqnn and TDWrapper below.
model_info = {
    # Network shape
    "state_dim": 4,
    "action_dim": env.action_space.n,
    "save_dir": Path("checkpoints") / "CartPolev1",
    "batch_size": 128,
    "hidden_layers": 1,
    "layer_size": 128,
    # Optimisation
    "learn_every": 1,
    "learning_rate": 0.0001,
    # Exploration schedule
    "exploration_rate": 0.9,
    "exploration_rate_decay": 0.999,
    "exploration_rate_min": 0.0,
    "discount": 0.99,
    # Replay memory
    "replay_sample_strategy": "priority",
    "memory_capacity": 10000,
    "burnin": 128,
    # Target-network synchronisation
    "sync_every": 1,
    "sync_strategy": "soft",
    "sync_rate": 0.005,
}

# increase learning rate, decrease exploration_rate_decay
# increase memory size
# shrink network -- slow training could be due to network size (which is certainly too large)

# Build the network and wrap it; both receive the full configuration mapping.
net = build_fc_dqnn(model_info["state_dim"], model_info["action_dim"], model_info)
model = TDWrapper(net, model_info)

# Resuming from a checkpoint is best-effort: any ordinary failure falls back to
# a fresh model. Catch Exception rather than using a bare `except:` so that
# KeyboardInterrupt / SystemExit still propagate, and report the cause so a
# corrupt checkpoint is distinguishable from a missing one.
try:
    model.get_checkpoint()
except Exception as err:
    print("Failed to get model checkpoint. Starting from scratch")
    print(f"  reason: {err!r}")

logger = MetricLogger(model_info["save_dir"], rolling_avg_len = 10, record_every = 10, print_every = 100)

Failed to get model checkpoint. Starting from scratch


In [None]:
# Train for `n_episodes` more episodes, continuing from the model's own episode counter.
n_episodes = 500
start_ep = model.episodes
end_ep = start_ep + n_episodes
end_burnin = None  # episode index at which the step count first exceeded the burn-in
for e in range(start_ep, end_ep):
    # reset() returns (observation, info); keep only the observation so the
    # first state of an episode has the same form as every later `next_state`.
    state, _ = env.reset()
    while True:
        action = model.act(state)
        next_state, reward, terminated, truncated, info = env.step(action)
        # An episode ends either on failure (terminated) or on the time limit (truncated).
        done = terminated or truncated
        model.cache(state = state, next_state = next_state,
                    action = action, reward = reward, done = done)
        q, loss = model.learn()
        if model.curr_step > model.burnin and end_burnin is None:
            end_burnin = e

        # Logging
        logger.log_step(reward = reward, loss = loss, q = q)

        # Update state
        state = next_state

        # Check if end of game
        if done: #or info["flag_get"]:
            stats = model.end_episode()
            break

    logger.log_episode(stats)
#     plot_values(logger.ep_rewards, stop_episode = end_burnin)

# Final summary using the stats of the last finished episode.
logger.record(stats, do_print = True)
# plot_values(logger.ep_rewards, show_result = True, stop_episode = end_burnin)

Episode 0 - Step 21 - Epsilon 0.8812878083682346 - Mean Reward 21.0 - Mean Length 21.0 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 0.63 - Time 2023-05-04T14:46:49
Episode 100 - Step 10045 - Epsilon 3.8866169417937074e-05 - Mean Reward 265.3 - Mean Length 265.3 - Mean Loss 3.856 - Mean Q Value 31.833 - Time Delta 10.237 - Time 2023-05-04T14:47:30
Episode 200 - Step 32739 - Epsilon 5.3550254585970225e-15 - Mean Reward 160.3 - Mean Length 160.3 - Mean Loss 4.991 - Mean Q Value 61.299 - Time Delta 5.449 - Time 2023-05-04T14:48:46
Episode 300 - Step 74562 - Epsilon 3.59894835973496e-33 - Mean Reward 500.0 - Mean Length 500.0 - Mean Loss 20.301 - Mean Q Value 68.098 - Time Delta 16.528 - Time 2023-05-04T14:51:05
Episode 400 - Step 123927 - Epsilon 1.2779147730968398e-54 - Mean Reward 491.8 - Mean Length 491.8 - Mean Loss 13.978 - Mean Q Value 61.729 - Time Delta 16.428 - Time 2023-05-04T14:53:50


In [None]:
if True:  # manual switch: set to False to skip the on-screen rollout
    # Separate environment instance so the rollout is rendered in a window.
    render_env = gym.make("CartPole-v1", render_mode="human", apply_api_compatibility=False)

    try:
        # reset() returns (observation, info); only the observation is the state.
        state, _ = render_env.reset()
        total_reward = 0
        while True:
            action = model.act(state)#, exploit_only = True)
            next_state, reward, terminated, truncated, info = render_env.step(action)
            total_reward += reward
            done = terminated or truncated
            state = next_state
            # Check if end of game
            if done: #or info["flag_get"]:
                break
    finally:
        # Always release the render window, even if the rollout raises.
        render_env.close()

In [None]:
# Build some tests.
# This was to see if memory worked the same in my module and the working one. 
# Then to see if the td computations were equivalent. 

# Record up to 20 consecutive transitions from a fresh episode, stopping early
# if the environment reports `done`. Each entry is ordered
# (state, next_state, action, reward, done).
transitions = []
state, _ = env.reset()
while len(transitions) < 20:
    action = model.act(state)
    next_state, reward, done, trunc, info = env.step(action)
    transitions.append((state, next_state, action, reward, done))
    state = next_state
    # Check if end of game
    if done: #or info["flag_get"]:
        break
    

In [None]:
# Show each recorded transition's starting observation next to its `done` flag.
for start_obs, _next_obs, _action, _reward, done_flag in transitions:
    print(start_obs, done_flag)

In [None]:
mem_cap = 20
# Replay Memory
from collections import namedtuple, deque
from typing import List
Transition = namedtuple("Transition", ("state", "action", "next_state", "reward", "done"))


class ReplayMemory:
    """Bounded buffer of ``Transition`` records; the oldest entry is dropped once full."""

    def __init__(self, memory_capacity: int):
        """Create an empty buffer holding at most ``memory_capacity`` transitions."""
        self.mem_cap = memory_capacity
        self.reset_memory()

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def reset_memory(self):
        """Replace the storage with a new empty deque of the configured capacity."""
        self.memory = deque(maxlen=self.mem_cap)

    def append(self, *args) -> None:
        """Store one transition built from ``args`` in ``Transition`` field order."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batchsize: int, inds: List[int] = None) -> List[Transition]:
        """Return the transitions at ``inds``.

        When ``inds`` is omitted, ``batchsize`` positions are drawn with
        ``np.random.choice`` over the current buffer length.
        """
        if inds is None:
            inds = np.random.choice(len(self.memory), batchsize)
        picked = []
        for ind in inds:
            picked.append(self.memory[ind])
        return picked

    def clear(self):
        """Drop every stored transition."""
        self.reset_memory()


memory = ReplayMemory(memory_capacity = mem_cap)

In [None]:
import copy
# Independent copy of the training configuration, shrunk so the whole replay
# buffer and a sampled batch can be inspected by eye.
model_info_test = copy.deepcopy(model_info)
model_info_test.update(memory_capacity=mem_cap, batch_size=4)
print(model_info_test)
# NOTE: this rebinds `net` and `model`, replacing the model built earlier in the notebook.
net = build_fc_dqnn(model_info_test["state_dim"], model_info_test["action_dim"], model_info_test)
model = TDWrapper(net, model_info_test)

In [None]:
# Feed the same recorded transitions to the model's memory and to the reference
# ReplayMemory so their contents can be compared afterwards.
# Fall back to the CPU when no GPU is present instead of hard-coding "cuda".
# NOTE(review): assumes the model places its own tensors on the same device -- confirm.
mem_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for t in transitions:
    model.cache(*t)
    state, next_state, action, reward, done = t
    # Give every field a leading batch dimension of 1 so torch.cat can stack them later.
    state = torch.tensor(state, dtype=torch.float32, device=mem_device).unsqueeze(0)
    next_state = torch.tensor(next_state, dtype=torch.float32, device=mem_device).unsqueeze(0)
    action = torch.tensor([[action]], device=mem_device)

    reward = torch.tensor([[reward]], device=mem_device)
    done = torch.tensor([[done]], device=mem_device)
    # Store the transition in memory
    memory.append(state, action, next_state, reward, done)

print("======= Model Memory ===========")
print(len(model.memory))
print("======== Memory ===========")
print(len(memory.memory))

In [None]:
print("========= Model Batch =========")
# The first item from recall() is kept as the sampled indices (reused below to
# pull the same rows from the reference memory); the rest are the batch fields.
recalled = model.recall()
batch_inds, *model_batch = recalled
print(batch_inds)
for batch_field in model_batch:
    print(batch_field)

In [None]:
print("========= Memory Batch ==============")
# Pull exactly the rows the model sampled, then transpose the list of
# transitions into one Transition whose fields are tuples of per-row tensors.
memory_trans = memory.sample(model_info_test["batch_size"], inds = batch_inds)
memory_columns = Transition(*zip(*memory_trans))

print(memory_columns.action)
# Concatenate each field along the batch dimension.
state_batch, next_state_batch, action_batch, reward_batch, done_batch = (
    torch.cat(getattr(memory_columns, field_name))
    for field_name in ("state", "next_state", "action", "reward", "done")
)
memory_batch = [state_batch, next_state_batch, action_batch, reward_batch, done_batch]
for batch_field in memory_batch:
    print(batch_field)

In [None]:
# Element-wise comparison of the model's batch with the reference batch;
# any differing field is printed together with its mismatch mask.
equal = True
for from_model, from_memory in zip(model_batch, memory_batch):
    mismatch = from_model != from_memory
    if torch.any(mismatch):
        print(from_model)
        print(from_memory)
        print(mismatch)
        equal = False
if equal:
    print("Memory Equivalent")

In [None]:
# DQN Notebook 
# Reference TD computation written out by hand, for comparison with the model's own.
GAMMA = model.discount

# Q(s, a): online-network value of the action that was actually taken.
online_Q = model.net(state_batch, "online")
online_values = online_Q.gather(1, action_batch)

# a' = argmax_a Q_online(s', a): the next action is chosen with the online network ...
online_next_Q = model.net(next_state_batch, "online")
best_action = online_next_Q.argmax(dim=1, keepdim=True)

# ... and evaluated with the target network.
bootstrap_Q = model.net(next_state_batch, "target")
bootstrap_values = bootstrap_Q.gather(1, best_action)

# Finished transitions contribute no bootstrap term.
not_done = 1 - done_batch.float()
td_est = state_action_values = online_values
td_target = expected_state_action_values = (bootstrap_values * not_done * GAMMA) + reward_batch

criterion = nn.SmoothL1Loss()
loss = criterion(td_est, td_target)

print(f"{td_est}")
print(f"{td_target}")
print(f"{loss}")

In [None]:
# Model TDs
# Same quantities, this time produced by the model's own methods.
td_est_model = model.td_estimate(state_batch, action_batch)
td_target_model = model.td_target(reward_batch, next_state_batch, done_batch)
# Feed the model's own estimate/target into its loss so that `loss_model`
# reflects the model's full pipeline rather than the hand-written values.
loss_model = model.loss_fn(td_est_model, td_target_model)
print(f"{td_est_model}")
print(f"{td_target_model}")
print(f"{loss_model}")

In [None]:
# (hand-written value, model value) for each quantity being cross-checked.
pairs = [
    (td_est, td_est_model),
    (td_target, td_target_model),
    (loss, loss_model),
]
equal = True
for notebook_value, model_value in pairs:
    mismatch = notebook_value != model_value
    if torch.any(mismatch):
        # Print both sides and the mask of differing entries.
        print(notebook_value)
        print(model_value)
        print(mismatch)
        equal = False
if equal:
    print("Values Equivalent")
    
# Check to see if syncing is incorrect. -- it was not. 
# Check if target network is being updated in optimize step 