In [1]:
import pickle as pkl
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import gym 

from decision_transformer.models.encoder import get_encoder
from decision_transformer.envs.carracing import CarRacing
from decision_transformer.models.gcsl_idm_org import GCSL_IDM
# dotdict is a dictionary that allows to access its keys as attributes
class dotdict(dict):
    """A ``dict`` whose keys can also be read as attributes (``d.key``).

    Only attribute *reads* are redirected to the mapping; assignment and
    deletion keep the normal ``dict`` / object behaviour.
    """

    def __getattr__(self, name):
        # __getattr__ is only consulted after normal attribute lookup fails.
        # A missing key must surface as AttributeError (not KeyError) so that
        # hasattr(), getattr(obj, name, default) and protocols that probe for
        # optional dunder attributes keep working.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Settings object handed to GCSL_IDM; fields are read as attributes (args.device, args.K).
args = dotdict(device='cuda', K=20)

In [3]:
# Read the pickled CarRacing trajectories from data/carracing-medium-v2.pkl.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('data/carracing-medium-v2.pkl', 'rb') as dataset_file:
    data = pkl.load(dataset_file)

# Show which fields each trajectory record provides.
print(data[0].keys())

dict_keys(['observations', 'actions', 'rewards', 'dones'])


In [4]:
# Observation statistics: compute the mean / std over time inside each
# trajectory, then average those per-trajectory vectors (every trajectory gets
# equal weight, regardless of its length).
# NOTE(review): the averaged per-trajectory std is not the std of the pooled
# dataset -- confirm this is the intended normalisation.
state_mean = np.asarray(
    [np.mean(traj['observations'], axis=0) for traj in data.values()]
).mean(axis=0)
state_std = np.asarray(
    [np.std(traj['observations'], axis=0) for traj in data.values()]
).mean(axis=0)


In [5]:
# Load the saved normalisation statistics. np.load returns a lazily-read
# NpzFile that keeps the archive open, so use it as a context manager to close
# the file handle once both arrays have been materialised.
# NOTE(review): unpacking relies on the archive storing the mean first and the
# std second -- confirm against the code that wrote the file.
with np.load('carracing_v1_ME_state_mean_std.npz') as stats:
    state_mean_cp, state_std_cp = stats.values()

In [6]:
# Sanity check: array shapes of every field in the first trajectory.
data[0]['observations'].shape, data[0]['actions'].shape, data[0]['rewards'].shape, data[0]['dones'].shape

((763, 64), (763, 2), (763,), (763,))

In [7]:
class Predictor(nn.Module):
    """MLP that maps a (state, scalar reward/return) pair to an output vector.

    The state and the scalar are embedded separately into 128 features each,
    concatenated (256 features) and passed through a 400-300 ReLU MLP.

    Args:
        input_dim: size of the state vector.
        output_dim: size of the predicted vector.
        use_tanh: when True, the output is squashed with ``tanh``.
    """

    def __init__(self, input_dim, output_dim, use_tanh=False):
        super(Predictor, self).__init__()
        self.state_embed = nn.Linear(input_dim, 128)
        self.reward_embed = nn.Linear(1, 128)
        self.use_tanh = use_tanh

        self.predict_state = nn.Sequential(nn.Linear(256, 400),
                                            nn.ReLU(),
                                            nn.Linear(400, 300),
                                            nn.ReLU(),
                                            nn.Linear(300, output_dim))

    def forward(self, state, reward):
        """Predict the output for a batch.

        Args:
            state: tensor of shape ``(batch, input_dim)``.
            reward: tensor of shape ``(batch,)``; a feature axis is added here.

        Returns:
            Tensor of shape ``(batch, output_dim)``, tanh-squashed if
            ``use_tanh`` is set.
        """
        state_features = self.state_embed(state)
        reward_features = self.reward_embed(reward.unsqueeze(1))
        prediction = self.predict_state(
            torch.cat((state_features, reward_features), dim=1))
        if self.use_tanh:
            return torch.tanh(prediction)
        return prediction

    def get_action(self, states, actions, rewards, target_returns, dones):
        """Predict the output for the most recent step of a single episode.

        Only the last row of ``states`` and the last column of
        ``target_returns`` are used; ``actions``, ``rewards`` and ``dones`` are
        accepted for call-signature compatibility and ignored.

        The prediction goes through ``forward`` so that ``use_tanh`` is honoured
        here exactly as it is for batched calls.

        Returns:
            Tensor of shape ``(output_dim,)``.
        """
        latest_state = states[-1].unsqueeze(0)   # (1, input_dim)
        latest_return = target_returns[:, -1]    # (1,) for a single episode
        return self.forward(latest_state, latest_return)[0]

In [15]:
# Restore the trained models. Every checkpoint is mapped onto the CPU so the
# notebook also runs on machines without CUDA (the rollout is executed with
# device='cpu').
state_predictor = Predictor(64, 64)
state_predictor.load_state_dict(
    torch.load('state_predictor_carracer.pt', map_location=torch.device('cpu')))
action_predictor = Predictor(64, 2, use_tanh=True)
action_predictor.load_state_dict(
    torch.load('action_predictor_carracer.pt', map_location=torch.device('cpu')))
encoder = get_encoder('decision_transformer/models/conv_net_500_v10.pt')

gcsl_predictor = GCSL_IDM(64, 2, 1000, args)
# Forward slashes resolve on Windows as well as POSIX, unlike '\\' separators.
gcsl_predictor.load_state_dict(
    torch.load(
        'tmp/carracing/idm_gcsl_gcsl+/2023_05_12_16_29_53_291165/tensorboard/iter_100.pt',
        map_location=torch.device('cpu'),
    )
)

<All keys matched successfully>

In [9]:
def _encode_observation(observation, encoder, device):
    """Return the observation the policy should see.

    Without an encoder the observation is passed through unchanged; otherwise
    it is batched, run through ``encoder`` and returned as a NumPy array.
    """
    if encoder is None:
        return observation
    batch = torch.tensor(observation, device=device, dtype=torch.float32).unsqueeze(0)
    return encoder(batch).squeeze(0).detach().cpu().numpy()


def evaluate_episode_rtg(
        env,
        state_dim,
        act_dim,
        model,
        max_ep_len=1000,
        scale=1000.,
        state_mean=0.,
        state_std=1.,
        device='cuda',
        encoder=None,
        target_return=None,
        mode='normal',
    ):
    """Roll out ``model`` for one return-conditioned episode in ``env``.

    Args:
        env: environment exposing ``reset()``, ``render()`` and a ``step()``
            that returns ``(state, reward, done, info)``.
        state_dim: size of the (encoded) state vector.
        act_dim: size of the action vector.
        model: module exposing ``get_action(states, actions, rewards,
            target_returns, timesteps)``; it is put in eval mode and moved to
            ``device``.
        max_ep_len: maximum number of environment steps.
        scale: divisor applied to rewards before they are subtracted from the
            return-to-go.
        state_mean, state_std: scalars or arrays used to normalise the states
            given to the model.
        device: torch device for the rollout history.
        encoder: optional callable applied to every raw observation.
        target_return: initial (already scaled) return-to-go. Required.
        mode: ``'noise'`` adds N(0, 0.1) noise to the initial state only;
            ``'delayed'`` keeps the return-to-go constant; any other value
            decrements it by ``reward / scale`` after each step.

    Returns:
        ``(episode_return, episode_length, data)`` where ``data`` holds the
        ``states``, ``actions`` and ``timesteps`` tensors plus a
        ``target_return`` tensor filled with the achieved return divided by
        ``scale``.

    Raises:
        ValueError: if ``target_return`` is not provided.
    """
    if target_return is None:
        raise ValueError('target_return must be provided')

    model.eval()
    model.to(device=device)

    # as_tensor accepts the scalar defaults as well as NumPy arrays, and the
    # float32 cast keeps the normalised states in the model's dtype.
    state_mean = torch.as_tensor(state_mean, dtype=torch.float32, device=device)
    state_std = torch.as_tensor(state_std, dtype=torch.float32, device=device)

    state = _encode_observation(env.reset(), encoder, device)
    if mode == 'noise':
        state = state + np.random.normal(0, 0.1, size=state.shape)

    # we keep all the histories on the device
    # note that the latest action and reward will be "padding"
    states = torch.as_tensor(state, dtype=torch.float32, device=device).reshape(1, state_dim)
    actions = torch.zeros((0, act_dim), device=device, dtype=torch.float32)
    rewards = torch.zeros(0, device=device, dtype=torch.float32)

    target_return = torch.tensor(target_return, device=device, dtype=torch.float32).reshape(1, 1)
    timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)

    episode_return, episode_length = 0, 0
    # Evaluation only: without no_grad, writing the predicted action into the
    # history would keep an autograd graph alive for the whole episode.
    with torch.no_grad():
        for t in range(max_ep_len):
            # add padding for the step that is about to be taken
            actions = torch.cat([actions, torch.zeros((1, act_dim), device=device)], dim=0)
            rewards = torch.cat([rewards, torch.zeros(1, device=device)])

            action = model.get_action(
                (states.to(dtype=torch.float32) - state_mean) / state_std,
                actions.to(dtype=torch.float32),
                rewards.to(dtype=torch.float32),
                target_return.to(dtype=torch.float32),
                timesteps.to(dtype=torch.long),
            )
            actions[-1] = action
            action = action.detach().cpu().numpy()

            env.render()
            state, reward, done, _ = env.step(action)
            state = _encode_observation(state, encoder, device)

            cur_state = torch.as_tensor(
                state, dtype=torch.float32, device=device).reshape(1, state_dim)
            states = torch.cat([states, cur_state], dim=0)
            rewards[-1] = reward

            if mode != 'delayed':
                pred_return = target_return[0, -1] - (reward / scale)
            else:
                pred_return = target_return[0, -1]
            target_return = torch.cat(
                [target_return, pred_return.reshape(1, 1)], dim=1)
            timesteps = torch.cat(
                [timesteps,
                 torch.ones((1, 1), device=device, dtype=torch.long) * (t + 1)], dim=1)

            episode_return += reward
            episode_length += 1

            if done:
                break

    rollout = {
        'states': states,
        'actions': actions,
        'target_return': torch.full_like(target_return, episode_return / scale),  # actual return
        'timesteps': timesteps,
    }

    return episode_return, episode_length, rollout

In [10]:
# Instantiate the project's CarRacing environment that the rollout is run in.
env = CarRacing()

In [17]:
# Roll out the GCSL policy on the CPU with the checkpointed normalisation
# statistics. mode='delayed' keeps the return-to-go fixed for the whole episode.
# (To disable normalisation, pass np.zeros_like(state_mean_cp) /
# np.ones_like(state_std_cp) instead.)
# NOTE(review): `data` rebinds the name that held the loaded dataset, so
# earlier cells that read the dataset cannot be re-run after this one.
episode_return, episode_length, data = evaluate_episode_rtg(
    env,
    64,
    2,
    gcsl_predictor,
    scale=100.,
    state_mean=state_mean_cp,
    state_std=state_std_cp,
    device='cpu',
    encoder=encoder,
    target_return=275/100.,
    mode='delayed',
)

Episode 4
Reward 873.5612167300249
Track generation: 1115..1398 -> 283-tiles track


In [12]:
# Display the undiscounted return of the most recent rollout.
# NOTE(review): this cell's execution count ([12]) is lower than the rollout
# cell's ([17]), so the stored output may come from an earlier run.
episode_return

30.2000000000003

In [None]:
class RewardEngine(nn.Module):
    """Reward model scoring how far a predicted next position is from the real one.

    Args:
        grid_size: number of leading columns of the state predictor's output
            that encode the x position; the remaining columns encode y.
        dim: state size passed to ``GCSL_IDM``.
        state_predictor: callable mapping a batch of (scaled) states to
            per-cell scores; required by ``get_reward``.
        checkpoint_path: GCSL_IDM checkpoint to load; defaults to the run used
            elsewhere in this notebook.
    """

    # Forward slashes resolve on Windows as well as POSIX.
    DEFAULT_CHECKPOINT = (
        'tmp/carracing/idm_gcsl_gcsl+/2023_05_12_16_29_53_291165/tensorboard/iter_100.pt'
    )

    def __init__(self, grid_size=10, dim=64, state_predictor=None, checkpoint_path=None):
        super().__init__()
        self.grid_size = grid_size
        self.dim = dim
        # get_reward reads self.state_predictor, so the attribute must exist;
        # it stays None until the caller supplies a predictor.
        self.state_predictor = state_predictor
        args = dotdict({
            'device': 'cuda',
            'K': 20,
        })
        self.gcsl_predictor = GCSL_IDM(dim, 2, 1000, args)
        if checkpoint_path is None:
            checkpoint_path = self.DEFAULT_CHECKPOINT
        self.gcsl_predictor.load_state_dict(
            torch.load(checkpoint_path, map_location=torch.device('cpu')))

    def get_reward(self, state, next_state, dones):
        """Return the negative squared error between predicted and actual position.

        Args:
            state: batch of states; divided by 10 before prediction.
            next_state: batch of next states; only the first two columns are
                compared, unscaled.
            dones: boolean mask; masked entries receive zero reward.

        Returns:
            Tensor of shape ``(batch,)`` with non-positive rewards.

        Raises:
            RuntimeError: if no ``state_predictor`` has been configured.
        """
        if self.state_predictor is None:
            raise RuntimeError(
                'RewardEngine.get_reward requires a state_predictor; '
                'pass one to the constructor or assign self.state_predictor.')

        state = torch.as_tensor(state, dtype=torch.float32) / 10  # important to normalize
        # important to not normalize or make sure is same as preds
        next_state = torch.as_tensor(next_state, dtype=torch.float32)

        predicted_state = self.state_predictor(state)
        pred_x = predicted_state[:, :self.grid_size].argmax(dim=1)
        pred_y = predicted_state[:, self.grid_size:].argmax(dim=1)
        pred_state = torch.stack((pred_x, pred_y), dim=1).float()

        reward = F.mse_loss(pred_state, next_state[:, :2], reduction='none').sum(dim=1)
        # Coerce to a boolean mask so 0/1 integer flags are not read as indices.
        reward[torch.as_tensor(dones, dtype=torch.bool)] = 0  # don't penalize for terminal states

        return -1 * reward