In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

## Simple feature-based reward function

In [2]:
class RewardFunction(nn.Module):
    def __init__(self, state_size):
        super(RewardFunction, self).__init__()
        self.fc = nn.Linear(state_size, 1)

    def forward(self, x):
        return self.fc(x)

In [3]:
# Expert trajectory (states and corresponding actions)
expert_states = np.random.randn(1000, 10)  # 1000 states with 10 features
expert_actions = np.random.randn(1000, 2)  # 1000 actions

# Convert to PyTorch tensors
states = torch.tensor(expert_states, dtype=torch.float32)
actions = torch.tensor(expert_actions, dtype=torch.float32)

# Initialize reward function and optimizer
reward_fn = RewardFunction(state_size=10)
optimizer = optim.Adam(reward_fn.parameters(), lr=0.001)

## Train loop

In [4]:
for epoch in range(1000):
    optimizer.zero_grad()
    
    # Compute reward for each state
    rewards = reward_fn(states)
    
    # Max-Entropy IRL Loss: we want the expert's actions to be the most likely
    # In practice, this would involve a more complex RL-based optimization, 
    # but here we demonstrate a simple approach.
    loss = torch.mean(-rewards)  # This is a simplified version for demonstration purposes
    
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

# Now, the reward function has been trained to predict the expert's behavior.

Epoch 0, Loss: 0.19827845692634583
Epoch 100, Loss: 0.07856760174036026
Epoch 200, Loss: -0.04114299267530441
Epoch 300, Loss: -0.16085349023342133
Epoch 400, Loss: -0.28056421875953674
Epoch 500, Loss: -0.4002741575241089
Epoch 600, Loss: -0.5199833512306213
Epoch 700, Loss: -0.639693558216095
Epoch 800, Loss: -0.7594028115272522
Epoch 900, Loss: -0.8791123032569885


---