In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

In [None]:
# •	User Profiles: Gather user demographic information, preferences, and behavior data.
# •	Item Features: Collect detailed attributes of the items (e.g., products, articles) you want to recommend.
# •	Interaction History: Compile historical interaction data between users and items, including clicks, purchases, ratings, etc.

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Number of users and items
num_users = 100
num_items = 50

# Feature dimensions
user_feature_dim = 5
item_feature_dim = 5

# Generate user profiles
user_ids = np.arange(num_users)
user_features_array = np.random.randn(num_users, user_feature_dim)
user_profiles_df = pd.DataFrame(user_features_array, columns=[f'user_feature_{i}' for i in range(user_feature_dim)])
user_profiles_df.insert(0, 'user_id', user_ids)

# Adjust item IDs to start from 1 to handle padding index 0
item_ids = np.arange(1, num_items + 1)  # Now item_ids from 1 to num_items
num_items = num_items  # No need to adjust num_items
item_features_array = np.random.randn(num_items, item_feature_dim)
# Add padding row at index 0
padding_item_feature = np.zeros((1, item_feature_dim))
item_features_array = np.vstack([padding_item_feature, item_features_array])
item_features_df = pd.DataFrame(item_features_array, columns=[f'item_feature_{i}' for i in range(item_feature_dim)])
item_ids_with_padding = np.arange(0, num_items + 1)  # From 0 to num_items, where 0 is padding index
item_features_df.insert(0, 'item_id', item_ids_with_padding)

# Generate interaction history with is_click interaction
interaction_records = []
for user_id in user_ids:
    interacted_items = np.random.choice(item_ids[1:], size=np.random.randint(1, 10), replace=False)
    is_clicks = np.random.choice([0, 1], size=len(interacted_items))  # Random is_click values
    for item_id, is_click in zip(interacted_items, is_clicks):
        interaction_records.append({'user_id': user_id, 'item_id': item_id, 'interaction': 1, 'is_click': is_click})
interaction_history_df = pd.DataFrame(interaction_records)

# Convert user profiles to tensors
user_ids_tensor = torch.tensor(user_profiles_df['user_id'].values, dtype=torch.long)
user_features_tensor = torch.tensor(user_profiles_df.drop('user_id', axis=1).values, dtype=torch.float)

# Convert item features to tensors
item_ids_tensor = torch.tensor(item_features_df['item_id'].values, dtype=torch.long)
item_features_tensor = torch.tensor(item_features_df.drop('item_id', axis=1).values, dtype=torch.float)

In [4]:
# print("User Profiles:")
# print(user_profiles_df.head())

# print("\nItem Features:")
# print(item_features_df.head())

print("\nInteraction History:")
print(interaction_history_df.head())


Interaction History:
   user_id  item_id  interaction  is_click
0        0       14            1         1
1        0        5            1         1
2        0       24            1         1
3        0       44            1         1
4        0       31            1         0


In [5]:
# Convert user profiles to tensors
user_ids_tensor = torch.tensor(user_profiles_df['user_id'].values, dtype=torch.long)
user_features_tensor = torch.tensor(user_profiles_df.drop('user_id', axis=1).values, dtype=torch.float)

# Convert item features to tensors
item_ids_tensor = torch.tensor(item_features_df['item_id'].values, dtype=torch.long)
item_features_tensor = torch.tensor(item_features_df.drop('item_id', axis=1).values, dtype=torch.float)

In [6]:
user_features_tensor[1]

tensor([-0.2341,  1.5792,  0.7674, -0.4695,  0.5426])

In [7]:
# Define the environment
class DataEnvironment:
    def __init__(self, user_features, item_features, interaction_history, slate_size, device):
        self.user_features = user_features  # Tensor of shape (num_users, user_feature_dim)
        self.item_features = item_features  # Tensor of shape (num_items + 1, item_feature_dim) including padding
        self.interaction_history = interaction_history
        self.slate_size = slate_size
        self.device = device

        self.num_users = user_features.shape[0]
        self.num_items = item_features.shape[0] - 1  # Exclude padding index
        self.user_feature_dim = user_features.shape[1]
        self.item_feature_dim = item_features.shape[1]
        self.candidate_items = torch.arange(1, self.num_items + 1).to(device)  # From 1 to num_items

    def reset(self, batch_size):
        # Randomly select users
        user_indices = np.random.choice(self.num_users, batch_size, replace=False)
        user_profiles = self.user_features[user_indices].to(self.device)

        # Prepare user history
        user_histories = []
        history_lengths = []
        max_history_length = 10  # Set a maximum history length
        for user_id in user_indices:
            # Get user's interaction history
            user_history = self.interaction_history[self.interaction_history['user_id'] == user_id]
            # Get item_ids and is_click
            history_items = user_history['item_id'].values
            is_clicks = user_history['is_click'].values
            # Limit to max_history_length
            if len(history_items) > max_history_length:
                history_items = history_items[:max_history_length]
                is_clicks = is_clicks[:max_history_length]
            history_length = len(history_items)
            history_lengths.append(history_length)
            # Pad history to max_history_length with padding index 0
            padded_history_items = np.pad(history_items, (0, max_history_length - history_length), 'constant', constant_values=0)
            padded_is_clicks = np.pad(is_clicks, (0, max_history_length - history_length), 'constant', constant_values=0)
            # Store as tensors
            user_histories.append((padded_history_items, padded_is_clicks))

        # Convert user_histories to tensors
        history_items_tensor = torch.tensor([h[0] for h in user_histories], dtype=torch.long).to(self.device)
        is_clicks_tensor = torch.tensor([h[1] for h in user_histories], dtype=torch.long).to(self.device)
        history_lengths_tensor = torch.tensor(history_lengths, dtype=torch.long).to(self.device)

        observation = {
            'user_profile': {
                'user_id': torch.tensor(user_indices, dtype=torch.long).to(self.device),
                'uf_embedding': user_profiles,
            },
            'user_history': {
                'history_items': history_items_tensor,  # Shape: (batch_size, max_history_length)
                'is_clicks': is_clicks_tensor,          # Shape: (batch_size, max_history_length)
                'history_length': history_lengths_tensor,
            },
        }
        return observation

    def get_candidate_info(self):
        candidates = {
            'item_id': self.candidate_items.view(1, -1),
            'item_embedding': self.item_features[1:].view(1, self.num_items, -1).to(self.device),
        }
        return candidates

    def step(self, user_ids, action):
        # Simulate user feedback based on interaction history
        batch_size = action.shape[0]
        reward = torch.zeros(batch_size).to(self.device)
        for i in range(batch_size):
            user_id = user_ids[i].item()
            recommended_items = action[i].cpu().numpy()
            # Get items the user has interacted with
            user_history = self.interaction_history[self.interaction_history['user_id'] == user_id]
            interacted_items = user_history['item_id'].values
            # Reward is the fraction of recommended items the user has interacted with
            if len(interacted_items) > 0:
                reward[i] = np.isin(recommended_items, interacted_items).mean()
            else:
                reward[i] = 0.0  # No interactions for this user
        user_feedback = {
            'reward': reward,
            'immediate_response': (reward.unsqueeze(1).repeat(1, self.slate_size) > 0).unsqueeze(-1).float(),
        }
        return user_feedback


In [None]:
# •	Daily Recommendations: In a real application, a “step” could correspond to generating recommendations for users each day.
# •	Session-Based Interactions: If users interact with the system multiple times in a session, each “step” could simulate a new interaction within the session.
# •	Time Slots: For systems that update recommendations periodically (e.g., every hour), each “step” could represent a time slot.
# Time --->

# | Step 1 | Step 2 | Step 3 | ... | Step N |

# At each step:

# - Select users
# - Generate recommendations
# - Simulate user feedback
# - Collect and analyze data

In [None]:
# 2. Simulating User Behavior
# We’ll create a function that simulates user interactions with the environment over multiple steps.
def simulate_user_interactions(env, num_steps, batch_size):
    # Initialize lists to store simulation results
    all_user_ids = []
    all_recommended_slates = []
    all_rewards = []
    all_user_histories = []

    for step in range(num_steps):
        # Reset environment and get observation
        observation = env.reset(batch_size)
        user_ids = observation['user_profile']['user_id']
        user_profiles = observation['user_profile']['uf_embedding']
        user_history = observation['user_history']

        # Get candidate items
        candidates = env.get_candidate_info()

        # For simplicity, we'll generate random recommendations (in a real scenario, we'd use a policy)
        # In Practice: A trained policy (e.g., SlateGFN_DB) would generate these recommendations based on user profiles and histories.

        recommended_slates = torch.randint(1, env.num_items + 1, (batch_size, env.slate_size)).to(env.device)

        # Simulate user feedback
        # env.step(user_ids, recommended_slates) simulates how users react to the recommendations
        user_feedback = env.step(user_ids, recommended_slates)
        rewards = user_feedback['reward']

        # Collect results
        all_user_ids.extend(user_ids.cpu().numpy())
        all_recommended_slates.extend(recommended_slates.cpu().numpy())
        all_rewards.extend(rewards.cpu().numpy())
        all_user_histories.append(user_history)

        # Print step results
        print(f"Step {step + 1}/{num_steps}")
        for i in range(batch_size):
            print(f"User ID: {user_ids[i].item()}")
            print(f"Recommended Slate: {recommended_slates[i].cpu().numpy()}")
            print(f"Reward: {rewards[i].item():.2f}")
            print(f"User History Items: {user_history['history_items'][i].cpu().numpy()}")
            print(f"User History Clicks: {user_history['is_clicks'][i].cpu().numpy()}")
            print("-" * 30)
        print("=" * 50)

    return {
        'user_ids': all_user_ids,
        'recommended_slates': all_recommended_slates,
        'rewards': all_rewards,
        'user_histories': all_user_histories
    }

#  3. Running the Simulation
# Simulation parameters
num_steps = 3
batch_size = 2  # For simplicity, we'll use a small batch size
slate_size = 5
# Initialize the environment
device = 'cpu'
env = DataEnvironment(user_features_tensor, item_features_tensor, interaction_history_df, slate_size, device)

simulation_results = simulate_user_interactions(env, num_steps, batch_size)

Step 1/3
User ID: 88
Recommended Slate: [43 18 27 15 27]
Reward: 0.20
User History Items: [22 43 29 13 24 14  3 46  7  0]
User History Clicks: [0 0 0 0 0 0 1 1 0 0]
------------------------------
User ID: 78
Recommended Slate: [36 21 25  1 14]
Reward: 0.20
User History Items: [14 22 35  3  9 13 15  0  0  0]
User History Clicks: [0 0 0 1 0 0 0 0 0 0]
------------------------------
Step 2/3
User ID: 23
Recommended Slate: [29 15 11  5 32]
Reward: 0.00
User History Items: [48  7 14  2  0  0  0  0  0  0]
User History Clicks: [1 1 1 1 0 0 0 0 0 0]
------------------------------
User ID: 7
Recommended Slate: [23 16 46 18  7]
Reward: 0.00
User History Items: [31 24  0  0  0  0  0  0  0  0]
User History Clicks: [1 1 0 0 0 0 0 0 0 0]
------------------------------
Step 3/3
User ID: 30
Recommended Slate: [50 27 24 12 50]
Reward: 0.20
User History Items: [37  6 47 12 46 34  7 44 38  0]
User History Clicks: [0 0 0 0 1 1 1 1 1 0]
------------------------------
User ID: 18
Recommended Slate: [14 42 2

  history_items_tensor = torch.tensor([h[0] for h in user_histories], dtype=torch.long).to(self.device)


In [9]:
#  Simple Policy for Recommendations

def simple_policy(user_history, num_items, slate_size):
    # Recommend items that the user has interacted with but not clicked
    batch_size = user_history['history_items'].shape[0]
    recommended_slates = torch.zeros(batch_size, slate_size, dtype=torch.long)
    for i in range(batch_size):
        history_items = user_history['history_items'][i].cpu().numpy()
        is_clicks = user_history['is_clicks'][i].cpu().numpy()
        # Get items the user interacted with but did not click
        candidate_items = history_items[(history_items != 0) & (is_clicks == 0)]
        if len(candidate_items) >= slate_size:
            recommended_items = candidate_items[:slate_size]
        else:
            # Fill the rest with random items
            num_random_items = slate_size - len(candidate_items)
            random_items = np.random.choice(np.setdiff1d(np.arange(1, num_items + 1), history_items), num_random_items, replace=False)
            recommended_items = np.concatenate([candidate_items, random_items])
        recommended_slates[i] = torch.tensor(recommended_items)
    return recommended_slates.to(user_history['history_items'].device)

def simulate_with_policy(env, num_steps, batch_size):
    for step in range(num_steps):
        # Reset environment and get observation
        observation = env.reset(batch_size)
        user_ids = observation['user_profile']['user_id']
        user_profiles = observation['user_profile']['uf_embedding']
        user_history = observation['user_history']

        # Get candidate items
        candidates = env.get_candidate_info()

        # Generate recommendations using the simple policy
        recommended_slates = simple_policy(user_history, env.num_items, env.slate_size)

        # Simulate user feedback
        user_feedback = env.step(user_ids, recommended_slates)
        rewards = user_feedback['reward']

        # Print step results
        print(f"Step {step + 1}/{num_steps}")
        for i in range(batch_size):
            print(f"User ID: {user_ids[i].item()}")
            print(f"Recommended Slate: {recommended_slates[i].cpu().numpy()}")
            print(f"Reward: {rewards[i].item():.2f}")
            print(f"User History Items: {user_history['history_items'][i].cpu().numpy()}")
            print(f"User History Clicks: {user_history['is_clicks'][i].cpu().numpy()}")
            print("-" * 30)
        print("=" * 50)

# •	The simple_policy function recommends items that the user has interacted with but did not click.
# •	If there are not enough such items, it fills the slate with random items not in the user’s history.
# •	This policy aims to re-engage users with items they showed interest in but did not click on.
simulate_with_policy(env, num_steps=2, batch_size=2)        

Step 1/2
User ID: 4
Recommended Slate: [38  8 37 34  5]
Reward: 1.00
User History Items: [38  8 37 34 45 47  5 11 21  0]
User History Clicks: [0 0 0 0 1 1 0 0 1 0]
------------------------------
User ID: 73
Recommended Slate: [48  8  4 23 27]
Reward: 0.40
User History Items: [48  8 26 39  0  0  0  0  0  0]
User History Clicks: [0 0 1 1 0 0 0 0 0 0]
------------------------------
Step 2/2
User ID: 69
Recommended Slate: [22  6 28 42 33]
Reward: 1.00
User History Items: [22  6 28 42 33  0  0  0  0  0]
User History Clicks: [0 0 0 0 0 0 0 0 0 0]
------------------------------
User ID: 59
Recommended Slate: [26  3 33  7 10]
Reward: 1.00
User History Items: [26  3 33  7 18 10  0  0  0  0]
User History Clicks: [0 0 0 0 1 0 0 0 0 0]
------------------------------


In [20]:
# Define the policy
class SlateGFN_DB(nn.Module):
    def __init__(self, state_dim, enc_dim, slate_size, num_items, device):
        super(SlateGFN_DB, self).__init__()
        self.enc_dim = enc_dim
        self.slate_size = slate_size
        self.num_items = num_items
        self.device = device

        # Item embedding layer (including padding index 0)
        self.item_embedding_layer = nn.Embedding(num_items + 1, enc_dim, padding_idx=0)

        # Adjust state_dim to include history embedding
        self.state_dim = state_dim + enc_dim

        # Forward probability network
        self.pForwardEncoder = nn.Sequential(
            nn.Linear(self.state_dim + self.enc_dim * self.slate_size, 128),
            nn.ReLU(),
            nn.Linear(128, self.enc_dim),
            nn.LayerNorm(self.enc_dim)
        )

        # Flow network
        self.logFlow = nn.Sequential(
            nn.Linear(self.state_dim + self.enc_dim * self.slate_size, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

        self.gfn_forward_offset = 1.0
        self.gfn_reward_smooth = 1.0
        self.gfn_Z = 0.0
        self.l2_coef = 1e-4

    def encode_user_history(self, history_items, is_clicks):
        # history_items: Tensor of shape (batch_size, max_history_length)
        # is_clicks: Tensor of shape (batch_size, max_history_length)
        # Embed the items
        item_embeddings = self.item_embedding_layer(history_items)  # Shape: (batch_size, max_history_length, enc_dim)
        # Multiply embeddings by is_clicks
        is_clicks = is_clicks.float().unsqueeze(-1)  # Shape: (batch_size, max_history_length, 1)
        weighted_embeddings = item_embeddings * is_clicks  # Shape: (batch_size, max_history_length, enc_dim)
        # Sum over history length
        history_embedding = weighted_embeddings.sum(dim=1)  # Shape: (batch_size, enc_dim)
        return history_embedding  # Shape: (batch_size, enc_dim)

    def forward(self, user_state, candidates, parent_slate, user_history, is_train=True):
        history_items = user_history['history_items']  # Shape: (batch_size, max_history_length)
        is_clicks = user_history['is_clicks']          # Shape: (batch_size, max_history_length)
        history_embedding = self.encode_user_history(history_items, is_clicks)  # Shape: (batch_size, enc_dim)

        # Update user_state to include history_embedding
        user_state = torch.cat([user_state, history_embedding], dim=1)  # Shape: (batch_size, state_dim + enc_dim)

        B = user_state.shape[0]
        candidate_item_enc = candidates['item_embedding'].squeeze(0)  # Shape: (num_items, enc_dim)

        current_P = torch.zeros(B, self.slate_size).to(self.device)
        current_action = torch.zeros(B, self.slate_size, dtype=torch.long).to(self.device)
        current_list_emb = torch.zeros(B, self.slate_size, self.enc_dim).to(self.device)
        current_flow = torch.zeros(B, self.slate_size + 1).to(self.device)

        for i in range(self.slate_size):
            current_state = torch.cat((user_state, current_list_emb.view(B, -1)), dim=1)
            selection_weight = self.pForwardEncoder(current_state)
            score = torch.matmul(selection_weight, candidate_item_enc.t())  # Shape: (B, num_items)
            prob = torch.softmax(score, dim=1)

            if is_train:
                action_at_i = parent_slate[:, i]
                current_P[:, i] = prob[torch.arange(B), action_at_i - 1]  # Adjust for index starting from 1
                current_list_emb[:, i, :] = candidate_item_enc[action_at_i - 1]
                current_flow[:, i] = self.logFlow(current_state).view(-1)
                current_action[:, i] = action_at_i
            else:
                # For simplicity, select the item with highest probability
                action_at_i = torch.argmax(prob, dim=1) + 1  # Adjust index to start from 1
                current_P[:, i] = prob[torch.arange(B), action_at_i - 1]
                current_list_emb[:, i, :] = candidate_item_enc[action_at_i - 1]
                current_flow[:, i] = self.logFlow(current_state).view(-1)
                current_action[:, i] = action_at_i

        # Terminal flow
        current_state = torch.cat((user_state, current_list_emb.view(B, -1)), dim=1)
        current_flow[:, -1] = self.logFlow(current_state).view(-1)
        reg = self.l2_coef * (sum(p.pow(2.0).sum() for p in self.parameters()))

        out_dict = {
            'prob': current_P,
            'action': current_action,
            'logF': current_flow,
            'reg': reg,
        }
        return out_dict

    def get_loss(self, out_dict, reward):
        parent_flow = out_dict['logF'][:, :-1]
        current_flow = out_dict['logF'][:, 1:]
        log_P = torch.log(out_dict['prob'] + self.gfn_forward_offset)

        forward_part = parent_flow + log_P + self.gfn_Z
        backward_part = current_flow

        DB_loss = torch.mean((forward_part - backward_part).pow(2))
        terminal_loss = torch.mean((current_flow[:, -1] + self.gfn_Z - torch.log(reward + self.gfn_reward_smooth)).pow(2))
        loss = DB_loss + terminal_loss + out_dict['reg']
        return loss

In [21]:
# 6. Initialize the Environment and Policy
# Set device
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
state_dim = user_feature_dim
enc_dim = item_feature_dim
slate_size = 5
batch_size = 32

user_features_tensor = user_features_tensor.to(device)
item_features_tensor = item_features_tensor.to(device)

env = DataEnvironment(user_features_tensor, item_features_tensor, interaction_history_df, slate_size, device)
policy = SlateGFN_DB(state_dim, enc_dim, slate_size, num_items, device).to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-3)

# Training loop
num_epochs = 5
num_batches = 50

In [22]:
# Training loop
num_epochs = 5
num_batches = 50

for epoch in range(num_epochs):
    epoch_loss = 0
    for _ in range(num_batches):
        # Reset environment and get observation
        observation = env.reset(batch_size)
        user_state = observation['user_profile']['uf_embedding']  # Shape: (B, state_dim)
        user_ids = observation['user_profile']['user_id']
        user_history = observation['user_history']

        # Get candidate items
        candidates = env.get_candidate_info()

        # For training, generate random slates (parent_slate)
        parent_slate = torch.randint(1, env.num_items + 1, (batch_size, slate_size)).to(device)

        # Forward pass
        out_dict = policy(user_state, candidates, parent_slate, user_history, is_train=True)

        # Get reward from environment
        user_feedback = env.step(user_ids, parent_slate)
        reward = user_feedback['reward']

        # Compute loss
        loss = policy.get_loss(out_dict, reward + 1e-6)  # Add small value to prevent log(0)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / num_batches
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

  history_items_tensor = torch.tensor([h[0] for h in user_histories], dtype=torch.long).to(self.device)


Epoch 1/5, Loss: 0.0814
Epoch 2/5, Loss: 0.0554
Epoch 3/5, Loss: 0.0496
Epoch 4/5, Loss: 0.0472
Epoch 5/5, Loss: 0.0454


In [23]:
# Evaluation
policy.eval()
with torch.no_grad():
    observation = env.reset(batch_size)
    user_state = observation['user_profile']['uf_embedding']  # Shape: (B, state_dim)
    user_ids = observation['user_profile']['user_id']
    user_history = observation['user_history']
    candidates = env.get_candidate_info()
    out_dict = policy(user_state, candidates, parent_slate=None, user_history=user_history, is_train=False)
    recommended_slate = out_dict['action']
    print("Recommended Slate for first user:", recommended_slate[0].cpu().numpy())

    # Evaluate performance
    user_feedback = env.step(user_ids, recommended_slate)
    avg_reward = user_feedback['reward'].mean().item()
    print(f"Average Reward: {avg_reward:.4f}")

Recommended Slate for first user: [34 36 36 36 36]
Average Reward: 0.0938


# Realtime

In [12]:
# 1. Simulate Real-Time User Arrivals


import time
import numpy as np


# Define the DataEnvironment class with the added method
class DataEnvironment:
    def __init__(self, user_features, item_features, interaction_history, slate_size, device):
        self.user_features = user_features  # Tensor of shape (num_users, user_feature_dim)
        self.item_features = item_features  # Tensor of shape (num_items + 1, item_feature_dim), including padding
        self.interaction_history = interaction_history  # DataFrame with 'user_id', 'item_id', 'interaction', 'is_click'
        self.slate_size = slate_size
        self.device = device

        self.num_users = user_features.shape[0]
        self.num_items = item_features.shape[0] - 1  # Exclude padding index
        self.user_feature_dim = user_features.shape[1]
        self.item_feature_dim = item_features.shape[1]
        self.candidate_items = torch.arange(1, self.num_items + 1).to(device)  # From 1 to num_items

    def get_user_observation(self, user_index):
        user_profile = self.user_features[user_index].unsqueeze(0).to(self.device)
    
        # Prepare user history
        max_history_length = 10  # Set a maximum history length
        # Get user's interaction history
        user_history = self.interaction_history[self.interaction_history['user_id'] == user_index]
        # Get item_ids and is_click
        history_items = user_history['item_id'].values
        is_clicks = user_history['is_click'].values
        # Limit to max_history_length
        if len(history_items) > max_history_length:
            history_items = history_items[:max_history_length]
            is_clicks = is_clicks[:max_history_length]
        history_length = len(history_items)
        # Pad history to max_history_length with padding index 0
        padded_history_items = np.pad(history_items, (0, max_history_length - history_length), 'constant', constant_values=0)
        padded_is_clicks = np.pad(is_clicks, (0, max_history_length - history_length), 'constant', constant_values=0)
        # Convert to tensors
        history_items_tensor = torch.tensor([padded_history_items], dtype=torch.long).to(self.device)
        is_clicks_tensor = torch.tensor([padded_is_clicks], dtype=torch.long).to(self.device)
        history_lengths_tensor = torch.tensor([history_length], dtype=torch.long).to(self.device)
    
        observation = {
            'user_profile': {
                'user_id': torch.tensor([user_index], dtype=torch.long).to(self.device),
                'uf_embedding': user_profile,
            },
            'user_history': {
                'history_items': history_items_tensor,  # Shape: (1, max_history_length)
                'is_clicks': is_clicks_tensor,          # Shape: (1, max_history_length)
                'history_length': history_lengths_tensor,
            },
        }
        return observation

    def get_candidate_info(self):
        candidates = {
            'item_id': self.candidate_items.view(1, -1),
            'item_embedding': self.item_features[1:].view(1, self.num_items, -1).to(self.device),
        }
        return candidates

    def step(self, user_ids, action):
        batch_size = action.shape[0]
        reward = torch.zeros(batch_size).to(self.device)
        for i in range(batch_size):
            user_id = user_ids[i].item()
            recommended_items = action[i].cpu().numpy()
            # Get items the user has interacted with
            user_history = self.interaction_history[self.interaction_history['user_id'] == user_id]
            interacted_items = user_history['item_id'].values
            # Reward is the fraction of recommended items the user has interacted with
            if len(interacted_items) > 0:
                reward[i] = np.isin(recommended_items, interacted_items).mean()
            else:
                reward[i] = 0.0  # No interactions for this user
        user_feedback = {
            'reward': reward,
            'immediate_response': (reward.unsqueeze(1).repeat(1, self.slate_size) > 0).unsqueeze(-1).float(),
        }
        return user_feedback


# Define the process_user function
def process_user(env, user_index):
    # Get observation for the user
    observation = env.get_user_observation(user_index)
    user_id = observation['user_profile']['user_id']
    user_profile = observation['user_profile']['uf_embedding']
    user_history = observation['user_history']
    
    # Get candidate items
    candidates = env.get_candidate_info()
    
    # Generate recommendations (e.g., using a policy or random for this example)
    recommended_slate = np.random.choice(env.candidate_items.cpu().numpy(), env.slate_size, replace=False)
    recommended_slate = torch.tensor(recommended_slate, dtype=torch.long).unsqueeze(0).to(env.device)
    
    # Simulate user feedback
    user_feedback = env.step(user_id.unsqueeze(0), recommended_slate)
    reward = user_feedback['reward'].item()
    
    # Print user interaction details
    print(f"User ID: {user_id.item()}")
    print(f"Recommended Slate: {recommended_slate.cpu().numpy().flatten()}")
    print(f"Reward: {reward:.2f}")
    print(f"User History Items: {user_history['history_items'].cpu().numpy().flatten()}")
    print(f"User History Clicks: {user_history['is_clicks'].cpu().numpy().flatten()}")
    print("-" * 30)

# Define the simulate_user_arrivals function
def simulate_user_arrivals(env, total_duration, average_interarrival_time):
    start_time = time.time()
    next_arrival_time = start_time + np.random.exponential(average_interarrival_time)
    user_queue = []
    
    while time.time() - start_time < total_duration:
        current_time = time.time()
        if current_time >= next_arrival_time:
            # Simulate user arrival
            user_index = np.random.choice(env.num_users)
            user_queue.append(user_index)
            # Schedule next arrival
            next_arrival_time = current_time + np.random.exponential(average_interarrival_time)
        
        if user_queue:
            # Process the next user in the queue
            user_index = user_queue.pop(0)
            process_user(env, user_index)
        
        # Sleep briefly to prevent tight loop
        time.sleep(0.01)


In [13]:
# Simulation parameters
total_duration = 10  # Total simulation time in seconds
average_interarrival_time = 2  # Average time between user arrivals in seconds
slate_size = 5

# Initialize the environment
device = 'cpu'
env = DataEnvironment(user_features_tensor, item_features_tensor, interaction_history_df, slate_size, device)

# Run the real-time simulation
simulate_user_arrivals(env, total_duration, average_interarrival_time)

User ID: 29
Recommended Slate: [32 27 20 42 50]
Reward: 0.40
User History Items: [17 37 23 41 42 50 36  7  0  0]
User History Clicks: [1 0 1 0 0 1 1 1 0 0]
------------------------------
User ID: 59
Recommended Slate: [29  3 34  2 31]
Reward: 0.20
User History Items: [26  3 33  7 18 10  0  0  0  0]
User History Clicks: [0 0 0 0 1 0 0 0 0 0]
------------------------------
User ID: 55
Recommended Slate: [26 28 44  6 24]
Reward: 0.00
User History Items: [32  0  0  0  0  0  0  0  0  0]
User History Clicks: [0 0 0 0 0 0 0 0 0 0]
------------------------------
User ID: 44
Recommended Slate: [21 18 46 19 37]
Reward: 0.20
User History Items: [11 21  8 44 36 28 41  0  0  0]
User History Clicks: [0 0 1 0 0 1 0 0 0 0]
------------------------------
User ID: 48
Recommended Slate: [24 46  1 38 11]
Reward: 0.60
User History Items: [24  4 38 47 46 17 31 13 42  0]
User History Clicks: [0 0 1 0 1 1 0 1 1 0]
------------------------------
User ID: 77
Recommended Slate: [ 4  2 47 41 30]
Reward: 0.20
User