## Train TRPO

In [None]:
import json
from gym_env.environments import create_halfcheetah_env
from models.value import ValueNet
from models.policy import GaussianPolicyNet
from models.trpo import TRPO

# Network and training hyperparameters.
hidden_sizes = (50,) * 3
epochs = 2000

# Build the environment and ask it for its dimensions.
env = create_halfcheetah_env()
env_dim = env.get_dim()

# Only the "HalfCheetah-v5" section of the JSON config is used.
with open("config.json") as config_file:
    config = json.load(config_file)["HalfCheetah-v5"]

# The policy net receives the whole dimension mapping; the value net and TRPO
# receive its "states" / "actions" entries respectively.
policy_net = GaussianPolicyNet(env_dim, hidden_sizes)
value_net = ValueNet(env_dim["states"], hidden_sizes)
trpo = TRPO(env_dim["actions"], policy_net, value_net, config)

trpo.train_model(env, epochs)

## Load TRPO Policy

In [None]:
from models.old_policy import CreatePolicyNet
from utils_file import *
from gym_env.environments import create_halfcheetah_env

# Build a policy network around a fresh environment, then restore the saved
# TRPO policy weights into it.
temp_env = create_halfcheetah_env()
loaded_trpo_policy = CreatePolicyNet(temp_env)

trpo_weights_file = TRPO_WEIGHTS_PATH / 'policy_net_weights.pth'
trpo_state_dict = torch.load(trpo_weights_file.as_posix())
loaded_trpo_policy.load_state_dict(trpo_state_dict)

## Generate Expert Data from TRPO policy

In [None]:
import time
# Rollout-collection settings.
# NOTE(review): `seed` is assigned but never passed to torch or the env in
# this cell - confirm whether seeding was intended.
seed = 1
max_num_steps = 200
num_episodes = 50000

env = create_halfcheetah_env()

# Rollouts that pass the return filter at the end of each episode.
trajectories = []

with tqdm(total=num_episodes) as pbar:
    for episode_idx in range(num_episodes):
        # The first element of reset()'s result is used as the initial state.
        state = to_tensor(env.reset()[0])
        done, trunc = False, False
        sum_rewards = 0
        num_steps = 0
        trajectory = []

        # Step until the env reports done/trunc or the step cap is reached.
        while not (done or trunc or num_steps >= max_num_steps):
            # NOTE(review): actions come from `policy_net` (the network built
            # in the training cell), not from `loaded_trpo_policy` - confirm
            # this is the intended expert.
            action = policy_net.take_action(state)
            next_state, reward, done, trunc = env.take_step(state, action)
            next_state = to_tensor(next_state)

            trajectory.append((state, action, reward, next_state))
            sum_rewards += reward
            state = next_state
            num_steps += 1

        # Transpose the list of transitions into one sequence per field.
        state_seq, action_seq, reward_seq, next_state_seq = zip(*trajectory)

        states = torch.stack(state_seq).squeeze(1)
        next_states = torch.stack(next_state_seq).squeeze(1)
        actions = torch.stack(action_seq).squeeze(1)
        rewards = torch.as_tensor(reward_seq, dtype=torch.float32).unsqueeze(1)

        # Keep only episodes whose summed reward reaches the threshold.
        if sum_rewards >= 300:
            trajectories.append(Rollout(states, actions, rewards, next_states))

        pbar.update(1)

In [None]:
def _concat_rollout_field(field_name):
    """Concatenate one attribute of every kept rollout along dim 0, cast with .float()."""
    return torch.cat([getattr(r, field_name) for r in trajectories], dim=0).float()


# Flatten the per-rollout tensors into one tensor per field.
states = _concat_rollout_field("states")
actions = _concat_rollout_field("actions")
rewards = _concat_rollout_field("rewards")
next_states = _concat_rollout_field("next_states")

# Write each field to its own file.
for expert_tensor, expert_path in (
    (states, EXPERT_DATA_STATES_PATH),
    (actions, EXPERT_DATA_ACTIONS_PATH),
    (rewards, EXPERT_DATA_REWARDS_PATH),
    (next_states, EXPERT_DATA_NEXT_STATES_PATH),
):
    torch.save(expert_tensor, expert_path.as_posix())

## Load Trajectories

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Read the four expert tensors back from disk, in the same order they are
# unpacked: states, actions, rewards, next states.
states, actions, rewards, next_states = (
    torch.load(expert_path.as_posix())
    for expert_path in (
        EXPERT_DATA_STATES_PATH,
        EXPERT_DATA_ACTIONS_PATH,
        EXPERT_DATA_REWARDS_PATH,
        EXPERT_DATA_NEXT_STATES_PATH,
    )
)

## GAIL + Diffusion

In [None]:
from models.old_policy import CreatePolicyNet
from utils_file import *
from gym_env.environments import create_halfcheetah_env

# Build a policy network around a fresh environment, then restore the saved
# TRPO policy weights into it; the GAIL training cell passes this policy to
# `model.train`.
temp_env = create_halfcheetah_env()
loaded_trpo_policy = CreatePolicyNet(temp_env)

trpo_weights_file = TRPO_WEIGHTS_PATH / 'policy_net_weights.pth'
trpo_state_dict = torch.load(trpo_weights_file.as_posix())
loaded_trpo_policy.load_state_dict(trpo_state_dict)

In [None]:
from models.GAIL import GAIL
import json
from datetime import datetime

# Timestamp (day_month_hour_minute_second) embedded in the plot file names
# built by the plotting cells.
# NOTE(review): this binds the global name `time` to a string, while other
# cells run `import time`; whichever cell ran last decides what `time` is.
time = f"{datetime.now():%d_%m_%H_%M_%S}"

seed = 42
torch.manual_seed(seed)

# Only the "HalfCheetah-v5" section of the JSON config is used.
with open("config.json") as config_file:
    config = json.load(config_file)["HalfCheetah-v5"]

env = create_halfcheetah_env(render=False, forward_reward_weight=1)

diffusion = True
batch_size = 10  # number of rollouts
model = GAIL(env, batch_size, diffusion, config).to(device)

# The returned logs are indexed by key in the plotting cells.
logs = model.train(loaded_trpo_policy)

# Suffix that marks diffusion runs in the saved plot file names.
if diffusion:
    diffusion_label = "_diff"
else:
    diffusion_label = ""

In [None]:
from torch import FloatTensor

# Map every logged discriminator value x to exp(-x).
# NOTE(review): presumably the logs hold negative log-probabilities, so this
# recovers probabilities - confirm against the GAIL logging code.
real_logprob = FloatTensor(logs["discriminator/real_logprob"]).neg().exp()
fake_logprob = FloatTensor(logs["discriminator/fake_logprob"]).neg().exp()

import matplotlib.pyplot as plt
import numpy as np

def moving_average(data, window_size=1):
    """Return the moving mean of ``data`` over ``window_size`` samples.

    A 'valid' convolution is used, so for ``window_size <= len(data)`` the
    result has ``len(data) - window_size + 1`` entries and entry ``k`` is the
    mean of ``data[k:k + window_size]``.
    """
    return np.convolve(data, np.ones(window_size)/window_size, mode='valid')


def moving_std(data, window_size=1):
    """Return the moving (population) standard deviation of ``data``.

    Entry ``k`` is the standard deviation of ``data[k:k + window_size]``, so
    the result lines up element-for-element with ``moving_average``.
    Expects ``1 <= window_size <= len(data)``.
    """
    values = np.asarray(data, dtype=float)
    mean = moving_average(values, window_size)
    mean_of_squares = moving_average(values ** 2, window_size)
    # Var[x] = E[x^2] - E[x]^2; rounding can push it marginally below zero.
    variance = np.clip(mean_of_squares - mean ** 2, 0.0, None)
    return np.sqrt(variance)


def plot_with_std(data, label, window_size=1, color='b'):
    """Plot the moving average of ``data`` with a +/- one-std shaded band.

    Args:
        data: 1-D sequence of numbers (list or array).
        label: Legend label for the averaged curve.
        window_size: Number of samples per window; clamped to ``len(data)``
            so short series can still be plotted.
        color: Matplotlib color used for both the line and the band.

    Raises:
        ValueError: If ``data`` is empty.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        raise ValueError("plot_with_std() needs at least one data point")

    # A window longer than the series has no 'valid' positions; shrink it.
    window_size = max(1, min(window_size, values.size))

    average = moving_average(values, window_size)
    # True per-window standard deviation, aligned with `average` and free of
    # the zero-padding that would otherwise shrink the band at both ends.
    rolling_std = moving_std(values, window_size)

    plt.plot(average, label=label, color=color, linewidth=1)
    plt.fill_between(np.arange(len(average)), average - rolling_std, average + rolling_std, alpha=0.2, color=color)

from pathlib import Path

# Smoothed discriminator curves built from `real_logprob` / `fake_logprob`.
plt.figure(figsize=(10, 6))  # Set the figure size

plot_with_std(real_logprob.numpy(), label='Exp Data as Fake', window_size=50, color='b')
plot_with_std(fake_logprob.numpy(), label='Gen Data as Real', window_size=50, color='g')

plt.xlabel('Epochs')  # X-axis label
plt.ylabel('Probability')  # Y-axis label
plt.title('Discriminator Probability vs Epochs')  # Title of the plot
plt.grid(True, linestyle='--', alpha=0.6)  # Add a grid for better readability
plt.legend()  # Show legend

# NOTE(review): `time` must still be the timestamp string set in the GAIL
# training cell; running a cell that does `import time` rebinds it.
path = f"gan_plots/loss_{time}{diffusion_label}.png"
# savefig() does not create missing directories, so create the target first.
Path(path).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(path, dpi=300, bbox_inches="tight")
plt.show()


In [None]:
from pathlib import Path

# Smoothed policy reward curve, with the mean expert reward as a reference line.
plt.figure(figsize=(10, 6))  # Set the figure size

plot_with_std(logs["policy/rewards"], label='Policy Rewards', window_size=50, color='b')

# .item() hands matplotlib a plain Python float instead of a 0-d tensor.
expert_mean_reward = FloatTensor(logs["expert/rewards"]).mean().item()
plt.axhline(y=expert_mean_reward, color='r', linestyle='--', label='Expert Rewards')

plt.xlabel('Epochs')  # X-axis label
plt.ylabel('Reward')  # Y-axis label
plt.title('Rewards vs Epochs')  # Title of the plot
plt.grid(True, linestyle='--', alpha=0.6)  # Add a grid for better readability
plt.legend()  # Show legend

# NOTE(review): `time` must still be the timestamp string set in the GAIL
# training cell; running a cell that does `import time` rebinds it.
path = f"gan_plots/reward_{time}{diffusion_label}.png"
# savefig() does not create missing directories, so create the target first.
Path(path).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(path, dpi=300, bbox_inches="tight")
plt.show()

In [None]:
import time

def try_policy(policy, render=True, max_num_steps=200, time_delay=0.1):
    """Run one episode with ``policy`` and print the summed reward.

    Args:
        policy: Object providing ``eval()`` and ``take_np_action(observation)``.
        render: Passed to ``create_halfcheetah_env``; when true,
            ``env.render()`` is also called before every step.
        max_num_steps: Upper bound on the number of environment steps.
        time_delay: Seconds to sleep after every step.
    """
    # Imported locally under a private alias: another cell binds the global
    # name `time` to a timestamp string, which would make a global
    # `time.sleep` lookup fail with AttributeError.
    import time as _time

    policy.eval()
    # NOTE(review): the second positional argument is presumably
    # `forward_reward_weight` (other cells pass it by keyword) - confirm.
    env = create_halfcheetah_env(render, 1)
    ob = env.reset()[0]
    steps = 0
    ep_rwds = []
    done = False
    while not done and steps < max_num_steps:
        act = policy.take_np_action(ob)
        if render:
            env.render()
        # NOTE(review): under the Gymnasium API the 4th value returned by
        # step() is the `truncated` flag and the 5th is the info dict; here
        # the 4th is bound to `info` and never ends the loop - confirm what
        # this environment wrapper returns.
        ob, rwd, done, info, _ = env.step(act)

        ep_rwds.append(rwd)

        steps += 1
        _time.sleep(time_delay)

    print(np.sum(ep_rwds))

In [None]:
# Run one episode with the current `model`'s policy network; try_policy
# prints the summed episode reward.
try_policy(model.policy_net)

## BC GAN

In [None]:
from models.old_policy import CreatePolicyNet
from utils_file import *
from gym_env.environments import create_halfcheetah_env

# Build a policy network around a fresh environment, then restore the saved
# TRPO policy weights into it; the BCGAN training cell passes this policy to
# `model.train`.
temp_env = create_halfcheetah_env()
loaded_trpo_policy = CreatePolicyNet(temp_env)

trpo_weights_file = TRPO_WEIGHTS_PATH / 'policy_net_weights.pth'
trpo_state_dict = torch.load(trpo_weights_file.as_posix())
loaded_trpo_policy.load_state_dict(trpo_state_dict)

In [None]:
from models.gan import BCGAN
import json

# Seed torch before the model is constructed.
seed = 42
torch.manual_seed(seed)

# Only the "HalfCheetah-v5" section of the JSON config is used.
with open("config.json") as config_file:
    config = json.load(config_file)["HalfCheetah-v5"]

env = create_halfcheetah_env(render=False, forward_reward_weight=1)

diffusion = True
batch_size = 10  # number of rollouts
model = BCGAN(env, batch_size, diffusion, config).to(device)

# Unlike the GAIL cell, the value returned by train() is not kept here.
model.train(loaded_trpo_policy)


In [None]:
# Run one episode with the current `model`'s policy network; try_policy
# prints the summed episode reward.
try_policy(model.policy_net)