In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import torch
import torch.nn as nn


## LSTM model without any feature

In [2]:
class SimpleLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleLSTMModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(x)
        # Only take the output from the final timestep
        out = self.fc(hidden[-1])
        return out

## Mapping Items

In [3]:
# Assuming `data` is a DataFrame with your dataset
import pandas as pd

data = pd.read_csv('sessions_UK_finalized_len2.csv')


# Combine all items into a single list
all_items = pd.concat([data['next_item'], data['prev_items'].apply(pd.Series).stack()]).unique()

# Create item to index and index to item mappings
item_to_index = {item: idx for idx, item in enumerate(all_items)}
index_to_item = {idx: item for item, idx in item_to_index.items()}

vocab_size = len(item_to_index)  # The total number of unique items


## Encode Sequences

In [4]:
import pandas as pd
import ast  

# Convert string representation of lists to actual lists
data['prev_items'] = data['prev_items'].apply(ast.literal_eval)

# Function to encode a single list of item IDs
def encode_prev_items(item_list, mapping):
    return [mapping[item] for item in item_list if item in mapping]

# Apply encoding to each list in 'prev_items'
data['encoded_prev_items'] = data['prev_items'].apply(lambda x: encode_prev_items(x, item_to_index))

# Check the updated DataFrame
print(data[['next_item', 'prev_items', 'encoded_prev_items']])


        next_item                prev_items encoded_prev_items
0      B0046U4CGS  [B01NCPFGQ3, B01N0U5UDH]             [4024]
1      B07QS8M6W4  [B08HLVKVWR, B00I874D4Q]       [1779, 3370]
2      B094VMJK1G  [B094VLF9QV, B0B24Z4D3V]             [2374]
3      B07YQ1XM6J  [B0177HKHQK, B01HQHGIBY]       [2869, 5891]
4      B0B63T3HGD  [B09JXY17B6, B004PYD9QE]             [5401]
...           ...                       ...                ...
52194  B0BKPXCVHZ  [B08B4T5ZWJ, B08B4T5ZWJ]         [964, 964]
52195  B08KS94WMW  [B0BG912PV7, B09Z8C64DT]        [2629, 890]
52196  B004AVB4UK  [B0BCLGK9DD, B00VP2SSFG]         [5304, 31]
52197  B0BD96VGPW  [B08L6RD61H, B07XTNBQ8G]       [5400, 3734]
52198  B07S6C1DZ6  [B07DYBV29C, B07DYBV29C]         [212, 212]

[52199 rows x 3 columns]


## Pad Sequences

In [5]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Convert each sequence to a PyTorch tensor
sequences = [torch.tensor(seq) for seq in data['encoded_prev_items']]

# Pad all sequences to the same length
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)

print(padded_sequences.shape)


torch.Size([52199, 2])


## Data Prep

In [6]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Encode 'next_item'
data['encoded_next_item'] = data['next_item'].apply(lambda x: item_to_index[x] if x in item_to_index else None)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Now split the data again
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Convert sequences and targets to PyTorch tensors as before
train_sequences = pad_sequence([torch.tensor(seq) for seq in train_data['encoded_prev_items']], batch_first=True)
test_sequences = pad_sequence([torch.tensor(seq) for seq in test_data['encoded_prev_items']], batch_first=True)

train_targets = torch.tensor(train_data['encoded_next_item'].values)
test_targets = torch.tensor(test_data['encoded_next_item'].values)



## Baseline Model without feature

In [7]:
import torch.nn.functional as F
class SimpleLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleLSTMModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(x)
        # Only take the output from the final timestep
        out = self.fc(hidden[-1])

        # Apply softmax on the output layer to get probabilities
        out = F.softmax(out, dim=1)
        return out


In [8]:
import torch.optim as optim

embedding_dim = 100
hidden_dim = 128
output_dim = vocab_size  # Same as the number of unique items
model = SimpleLSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 5  # Starting with a small number for the baseline model

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    
    predictions = model(train_sequences)
    loss = loss_function(predictions, train_targets)
    
    loss.backward()
    optimizer.step()
    
    print(f'Epoch {epoch+1}: Loss = {loss.item()}')


  from .autonotebook import tqdm as notebook_tqdm


Epoch 1: Loss = 10.320464134216309
Epoch 2: Loss = 10.320464134216309
Epoch 3: Loss = 10.320464134216309
Epoch 4: Loss = 10.320464134216309
Epoch 5: Loss = 10.320464134216309


## Model Evaluation

In [27]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Assuming test_sequences and test_targets are lists of integers
# Convert lists of sequences and targets into PyTorch tensors
test_sequences_tensor = torch.tensor(test_sequences, dtype=torch.long)
test_targets_tensor = torch.tensor(test_targets, dtype=torch.long)

# Wrap tensors in a TensorDataset
test_dataset = TensorDataset(test_sequences_tensor, test_targets_tensor)

# Create a DataLoader
batch_size = 64  # Adjust based on your needs
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


  test_sequences_tensor = torch.tensor(test_sequences, dtype=torch.long)
  test_targets_tensor = torch.tensor(test_targets, dtype=torch.long)


In [10]:
def evaluate_model(model, data_loader, k=5):
    model.eval()  # Set the model to evaluation mode
    top_k_accs = []  # Store each batch's Top-K accuracy
    
    with torch.no_grad():  # No gradients needed during evaluation
        for sequences, targets in data_loader:
            predictions = model(sequences)
            # Call the top_k_accuracy function and store its result in a differently named variable
            batch_top_k_accuracy = top_k_accuracy(predictions, targets, k=k)
            top_k_accs.append(batch_top_k_accuracy)

    # Calculate the average Top-K accuracy across all batches
    avg_top_k_acc = sum(top_k_accs) / len(top_k_accs)
    return avg_top_k_acc

# Make sure you have a function defined for calculating Top-K accuracy
def top_k_accuracy(outputs, targets, k=5):
    _, predicted = outputs.topk(k, 1, True, True)
    correct = predicted.eq(targets.view(-1, 1).expand_as(predicted))
    correct_total = correct.view(-1).float().sum(0, keepdim=True)
    return correct_total.mul_(100.0 / outputs.size(0)).item()

# Assuming test_loader is your DataLoader for the test set
avg_top5_acc = evaluate_model(model, test_loader, k=5)
print(f"Average Top-5 Accuracy on Test Set: {avg_top5_acc}%")

Average Top-5 Accuracy on Test Set: 0.009527439024390244%


In [11]:
def evaluate_with_one_hot(model, data_loader):
    model.eval()
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for sequences, targets in data_loader:
            predictions = model(sequences)
            _, predicted_classes = torch.max(predictions, 1)
            correct_predictions += (predicted_classes == targets).sum().item()
            total_predictions += targets.size(0)
    
    accuracy = correct_predictions / total_predictions * 100
    return accuracy


In [12]:
# Assuming test_loader is your DataLoader for the test set
acc = evaluate_with_one_hot(model, test_loader)
print(f"Accuracy of the Test Set: {acc:.7f}%")

Accuracy of the Test Set: 0.0095785%


In [13]:
def predict_next_item(model, sequence, item_to_index, index_to_item, k=1):
    model.eval()  # Ensure the model is in evaluation mode
    
    # Process the input sequence
    encoded_sequence = [item_to_index[item] for item in sequence if item in item_to_index]
    padded_sequence = torch.tensor([encoded_sequence])  # Add a batch dimension and pad if necessary
    padded_sequence = pad_sequence(padded_sequence, batch_first=True)  # Assuming you're using pad_sequence for padding
    
    with torch.no_grad():
        predictions = model(padded_sequence)
        _, top_k_indices = predictions.topk(k, 1, True, True)
        top_k_items = [index_to_item[index.item()] for index in top_k_indices[0]]

        

    return top_k_items

# Example usage
sequence = ['B01NCPFGQ3', 'B01N0U5UDH']  # Example sequence of item IDs
predicted_items = predict_next_item(model, sequence, item_to_index, index_to_item, k=5)
print(f"Predicted next items: {predicted_items}")


Predicted next items: ["['B01BKMARL8', 'B07ZKDGFMF']", "['B09XJ1RZ4Q', 'B09YDDQ3G2']", "['B0935DN1BN', 'B0935JRJ59']", "['B08PPDTH9L', 'B08PPDTH9L']", "['B09DXW29JB', 'B0B2WCXJJV']"]


## LSTM with Price

In [14]:
products = pd.read_csv('products_train_UK_representations.csv')
# Ensure 'id' is the index for easy lookup
products = products.set_index('id')
products.head()

Unnamed: 0_level_0,price,representation
id,Unnamed: 1_level_1,Unnamed: 2_level_1
B07PNBPVN1,3.75,"[-0.21872417628765106, -0.2500726580619812, -0..."
B0B45XQ6Q3,8.59,"[-0.7130799889564514, -0.5445948839187622, -0...."
B01CR9IRWY,16.0,"[-0.8353576064109802, -0.6093693375587463, -0...."
B077RCBS9C,36.98,"[-0.8522624969482422, -0.5257986783981323, -0...."
B088BMDXPD,7.99,"[-0.8538389205932617, -0.47541171312332153, -0..."


## Associate item in sequences with its price

In [15]:
# Adding price to each item in the sequences
def add_price_to_sequence(sequence):
    # For each item in the sequence, create a tuple (item_id, price)
    # If the item ID is not in the products DataFrame, use a default price (e.g., 0 or the mean price)
    default_price = products['price'].mean()
    return [(item, products.at[item, 'price'] if item in products.index else default_price) for item in sequence]

data['items_with_price'] = data['prev_items'].apply(add_price_to_sequence)
data.head()


Unnamed: 0,next_item,prev_items,encoded_prev_items,encoded_next_item,items_with_price
0,B0046U4CGS,"[B01NCPFGQ3, B01N0U5UDH]",[4024],0,"[(B01NCPFGQ3, 8.99), (B01N0U5UDH, 9.48)]"
1,B07QS8M6W4,"[B08HLVKVWR, B00I874D4Q]","[1779, 3370]",1,"[(B08HLVKVWR, 9.17), (B00I874D4Q, 3.6)]"
2,B094VMJK1G,"[B094VLF9QV, B0B24Z4D3V]",[2374],2,"[(B094VLF9QV, 29.93), (B0B24Z4D3V, 58.91)]"
3,B07YQ1XM6J,"[B0177HKHQK, B01HQHGIBY]","[2869, 5891]",3,"[(B0177HKHQK, 4.29), (B01HQHGIBY, 3.89)]"
4,B0B63T3HGD,"[B09JXY17B6, B004PYD9QE]",[5401],4,"[(B09JXY17B6, 9.99), (B004PYD9QE, 7.99)]"


In [16]:
# Function to retrieve prices for a list of item IDs
def get_prices(items):
    # Retrieve prices for each item, defaulting to a specific price if not found (e.g., mean price)
    default_price = products['price'].mean()
    return [products.loc[item, 'price'] if item in products.index else default_price for item in items]

# Apply the function to each row in the 'prev_items' column
data['prices'] = data['prev_items'].apply(get_prices)

# View the updated DataFrame
data.head()

Unnamed: 0,next_item,prev_items,encoded_prev_items,encoded_next_item,items_with_price,prices
0,B0046U4CGS,"[B01NCPFGQ3, B01N0U5UDH]",[4024],0,"[(B01NCPFGQ3, 8.99), (B01N0U5UDH, 9.48)]","[8.99, 9.48]"
1,B07QS8M6W4,"[B08HLVKVWR, B00I874D4Q]","[1779, 3370]",1,"[(B08HLVKVWR, 9.17), (B00I874D4Q, 3.6)]","[9.17, 3.6]"
2,B094VMJK1G,"[B094VLF9QV, B0B24Z4D3V]",[2374],2,"[(B094VLF9QV, 29.93), (B0B24Z4D3V, 58.91)]","[29.93, 58.91]"
3,B07YQ1XM6J,"[B0177HKHQK, B01HQHGIBY]","[2869, 5891]",3,"[(B0177HKHQK, 4.29), (B01HQHGIBY, 3.89)]","[4.29, 3.89]"
4,B0B63T3HGD,"[B09JXY17B6, B004PYD9QE]",[5401],4,"[(B09JXY17B6, 9.99), (B004PYD9QE, 7.99)]","[9.99, 7.99]"


In [17]:
# Unique set of all item IDs
all_items = set([item for sublist in data['prev_items'] for item in sublist] + data['next_item'].tolist())

# Create mappings
item_to_index = {item: idx for idx, item in enumerate(all_items)}
index_to_item = {idx: item for item, idx in item_to_index.items()}

# Encode the sequences and next item
data['encoded_prev_items'] = data['prev_items'].apply(lambda x: [item_to_index[item] for item in x])
data['encoded_next_item'] = data['next_item'].apply(lambda x: item_to_index[x])


## LSTM with Price Model

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMWithPriceModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, price_feature_dim=1):
        super(LSTMWithPriceModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # Adjust the LSTM input size to include the price feature
        self.lstm = nn.LSTM(input_size=embedding_dim + price_feature_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
    
    def forward(self, x, prices):
        x = self.embedding(x)
        # Concatenate item embeddings with price
        x = torch.cat((x, prices.unsqueeze(-1)), dim=-1)
        lstm_out, (hidden, cell) = self.lstm(x)
        out = self.fc(hidden[-1])
        return out


In [20]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequencePriceDataset(Dataset):
    def __init__(self, sequences, prices, targets):
        self.sequences = sequences
        self.prices = prices
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        sequence = torch.tensor(self.sequences[idx], dtype=torch.long)
        prices = torch.tensor(self.prices[idx], dtype=torch.float)
        target = torch.tensor(self.targets[idx], dtype=torch.long)
        return sequence, prices, target

# Extract data from DataFrame for the Dataset
sequences = data['encoded_prev_items'].tolist()
prices = data['prices'].tolist()
targets = data['encoded_next_item'].tolist()

# Create dataset and data loader
dataset = SequencePriceDataset(sequences, prices, targets)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [22]:
import torch.nn as nn

class LSTMWithPriceModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMWithPriceModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim + 1, hidden_size=hidden_dim, batch_first=True)  # +1 for the price
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, sequences, prices):
        embedded = self.embedding(sequences)
        # Combine prices with embeddings
        combined = torch.cat((embedded, prices.unsqueeze(-1)), dim=2)  # Ensure prices are correctly shaped
        lstm_out, _ = self.lstm(combined)
        output = self.fc(lstm_out[:, -1])  # Get the output of the last time step
        return output


In [23]:
import torch.optim as optim

# Assuming vocab_size and other parameters are set
model = LSTMWithPriceModel(vocab_size=len(item_to_index), embedding_dim=100, hidden_dim=128, output_dim=len(item_to_index))
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(10):  # Number of epochs
    model.train()
    total_loss = 0
    for sequences, prices, targets in data_loader:
        optimizer.zero_grad()
        outputs = model(sequences, prices)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')


Epoch 1, Loss: 7.512721206043281
Epoch 2, Loss: 5.046496960900578
Epoch 3, Loss: 3.9015308678442358
Epoch 4, Loss: 3.227950044706756
Epoch 5, Loss: 2.7641912140682634
Epoch 6, Loss: 2.4248918405204427
Epoch 7, Loss: 2.1720372526084675
Epoch 8, Loss: 1.9835269509577285
Epoch 9, Loss: 1.8425508270807125
Epoch 10, Loss: 1.7341615022981869


In [24]:
# Calculate the average loss over the dataset
total_loss = 0
count = 0
for sequences, prices, targets in data_loader:
    outputs = model(sequences, prices)
    loss = criterion(outputs, targets)
    total_loss += loss.item()
    count += 1
average_loss = total_loss / count
print(f"Average Loss: {average_loss}")

Average Loss: 1.3981399623421478


## Top k accuracy

In [25]:
def top_k_accuracy(outputs, targets, k=5):
    with torch.no_grad():
        _, predicted = outputs.topk(k, dim=1)
        correct = predicted.eq(targets.view(-1, 1).expand_as(predicted))
        correct_total = correct.view(-1).float().sum(0, keepdim=True)
        return correct_total.mul_(100.0 / outputs.size(0)).item()

# Example usage within an evaluation loop:
model.eval()
top_k_accs = []
for sequences, prices, targets in data_loader:
    outputs = model(sequences, prices)
    top_k_acc = top_k_accuracy(outputs, targets, k=5)
    top_k_accs.append(top_k_acc)
average_top_k_accuracy = sum(top_k_accs) / len(top_k_accs)
print(f"Average Top-5 Accuracy: {average_top_k_accuracy}%")


Average Top-5 Accuracy: 90.15477284730649%


## Accuracy

In [26]:
def calculate_accuracy(outputs, targets):
    with torch.no_grad():
        _, predicted = torch.max(outputs, dim=1)
        correct = predicted.eq(targets).sum().item()
        accuracy = 100 * correct / targets.size(0)
    return accuracy

# Set the model to evaluation mode
model.eval()

# List to hold accuracies for each batch
accuracies = []

# Evaluate over the entire DataLoader
for sequences, prices, targets in data_loader:
    # Forward pass
    outputs = model(sequences, prices)

    # Calculate accuracy
    accuracy = calculate_accuracy(outputs, targets)

    # Store the accuracy
    accuracies.append(accuracy)

# Calculate the average accuracy across all batches
average_accuracy = sum(accuracies) / len(accuracies)
print(f"Average Accuracy: {average_accuracy:.2f}%")

Average Accuracy: 60.90%
