### Working 

In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gym_sokoban

# Define the neural network for the latent dynamics model
class LatentDynamicsModel(nn.Module):
    """Encoder, latent transition and reward heads bundled into one module.

    Each head is a two-layer perceptron with a 128-unit ReLU hidden layer:

    * ``encoder``: ``state_dim`` -> ``latent_dim``
    * ``transition``: ``latent_dim + 1`` -> ``latent_dim`` (the extra input
      is the action, supplied as one scalar feature)
    * ``reward``: ``latent_dim`` -> 1

    ``action_dim`` is accepted for interface compatibility, but none of the
    layers built here depend on it.
    """

    def __init__(self, state_dim, latent_dim, action_dim):
        super().__init__()
        hidden_units = 128
        # Heads are created in encoder -> transition -> reward order so that
        # parameter initialisation consumes the RNG in a fixed sequence.
        self.encoder = self._two_layer_mlp(state_dim, hidden_units, latent_dim)
        self.transition = self._two_layer_mlp(
            latent_dim + 1, hidden_units, latent_dim
        )
        self.reward = self._two_layer_mlp(latent_dim, hidden_units, 1)

    @staticmethod
    def _two_layer_mlp(in_features, hidden_features, out_features):
        """Return Linear -> ReLU -> Linear wrapped in an ``nn.Sequential``."""
        return nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, state, action):
        """Predict the next latent state and its reward.

        Args:
            state: Tensor whose last dimension has size ``state_dim``.
            action: Tensor concatenated to the latent along the last
                dimension; the transition head expects it to add exactly
                one feature.

        Returns:
            Tuple ``(next_latent_state, reward)``, where the reward is
            computed from the predicted next latent state.
        """
        encoded = self.encoder(state)
        predicted_latent = self.transition(torch.cat((encoded, action), dim=-1))
        return predicted_latent, self.reward(predicted_latent)

# Initialize the environment and the model
# NOTE(review): 'Sokoban-small-v1' is presumably registered with gym by the
# gym_sokoban import at the top of the file — confirm.
env = gym.make('Sokoban-small-v1')
# Total number of scalars in one observation (product of every axis of the
# observation shape); this is the encoder's input width.
state_dim = np.prod(env.observation_space.shape)
# Number of actions as reported by the action space's `n` attribute.
action_dim = env.action_space.n
latent_dim = 64  # Example latent dimension

# NOTE(review): action_dim is passed in here, but LatentDynamicsModel does not
# use it when building its layers.
model = LatentDynamicsModel(state_dim, latent_dim, action_dim)
# Adam over every parameter of the model, learning rate 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Define the rollout method for decision-time planning
def rollout(env, model, state, action, depth=5):
    """Estimate the return of repeating ``action`` by imagining it in latent space.

    The observation is encoded once, then the model's transition and reward
    heads are applied ``depth`` times and the predicted rewards are summed.
    The result stays attached to the autograd graph, so gradients reach the
    model's parameters when the caller backpropagates through it.

    The real environment is never stepped here: planning must not advance
    (or terminate) the episode the caller is still acting in.

    Args:
        env: Unused. Kept so existing ``rollout(env, model, ...)`` calls
            keep working.
        model: Object exposing ``encoder``, ``transition`` and ``reward``
            callables with the same layout as ``LatentDynamicsModel``.
        state: Array-like observation; flattened to shape ``(1, -1)``.
        action: Action index, fed to the transition head as a single float
            feature.
        depth: Number of imagined steps. The model has no termination
            predictor, so all ``depth`` steps are always rolled out.

    Returns:
        A 0-dim float tensor holding the summed predicted reward. For
        ``depth <= 0`` this is a zero tensor that still requires grad, so
        calling ``backward()`` on it remains valid.
    """
    if depth <= 0:
        return torch.zeros((), requires_grad=True)

    # as_tensor + reshape copes with non-float and non-contiguous
    # observations, which the legacy FloatTensor(...).view(...) path does not
    # reliably handle.
    observation = torch.as_tensor(state, dtype=torch.float32).reshape(1, -1)
    action_feature = torch.tensor([[float(action)]])

    latent = model.encoder(observation)
    predicted_rewards = []
    for _ in range(depth):
        latent = model.transition(torch.cat([latent, action_feature], dim=-1))
        predicted_rewards.append(model.reward(latent).reshape(()))
    return torch.stack(predicted_rewards).sum()

# Training loop: 1000 episodes, one uniformly sampled action and one optimizer
# update per iteration of the inner loop.
for episode in range(1000):
    state = env.reset()
    done = False
    episode_reward = 0  # Running sum of the rewards returned by env.step below
    while not done:
        action = env.action_space.sample()  # Random action for simplicity
        # rollout() is handed the live `env`, not a copy.
        # NOTE(review): if rollout() steps `env`, the env.step call further
        # down acts on an already-advanced (possibly finished) episode —
        # confirm against rollout's implementation.
        total_reward = rollout(env, model, state, action)
        # Optimize the model
        optimizer.zero_grad()
        # Negated so that minimizing the loss maximizes the rollout's value.
        loss = -total_reward  # Example loss, can be replaced with actual objective
        # NOTE(review): this only changes the model if total_reward is
        # connected to the model's parameters in the autograd graph — verify.
        loss.backward()
        optimizer.step()
        # Apply the sampled action for real. The 4-value unpacking expects
        # step() to return (observation, reward, done, info).
        state, reward, done, _ = env.step(action)
        episode_reward += reward
    print(f"Episode {episode + 1}: Total Reward = {episode_reward}")

# Save the model after training (parameters only, via state_dict, to a path
# relative to the current working directory).
torch.save(model.state_dict(), 'latent_dynamics_model.pth')
print("Model saved as latent_dynamics_model.pth")
