In [66]:
import numpy as np
import pandas as pd
import pickle
from sklearn.decomposition import IncrementalPCA
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import HeteroData
from torch_geometric.nn import HeteroConv, GCNConv, SAGEConv
from torch_geometric.transforms import ToUndirected

## Load and Pre-process Data (IPCA)

In [67]:
###############################
# 1. LOAD & PREPROCESS DATA
###############################

# Paths to your data files (adjust as needed)
flavor_embedded_path = "flavor_embedded.pkl"
menu_embedded_path = "menu_embedded.pkl"
user_dataset_path = "user_dataset_updated.csv"
new_nodes_path = "new_nodes.csv"  # Comprehensive list of ingredients

# Load pickle and CSV files
with open(flavor_embedded_path, "rb") as f:
    flavor_embedded = pickle.load(f)
with open(menu_embedded_path, "rb") as f:
    menu_embedded = pickle.load(f)

user_df = pd.read_csv(user_dataset_path)
new_nodes = pd.read_csv(new_nodes_path)

# ✅ Remove dishes with missing embeddings
menu_embedded = menu_embedded[menu_embedded["Embeddings"].apply(lambda x: len(x) > 0)]

# ✅ Verify the number of dishes after cleaning
print(f"Original Dish Count: {menu_embedded.shape[0]}")
print(f"Cleaned Dish Count: {menu_embedded_cleaned.shape[0]}")

# Define the column that holds the raw embedding lists
embedding_col = "Embeddings"

# Function to aggregate embeddings via IPCA ensuring a 300-dim output
def aggregate_embeddings_ipca(embedding_list):
    if isinstance(embedding_list, list) and len(embedding_list) > 0:
        embedding_array = np.array(embedding_list)
        if embedding_array.ndim == 2 and embedding_array.shape[0] > 1:
            ipca = IncrementalPCA(n_components=1, batch_size=min(embedding_array.shape[0], 50))
            ipca_embedding = ipca.fit_transform(embedding_array.T).flatten()
            if len(ipca_embedding) < 300:
                ipca_embedding = np.pad(ipca_embedding, (0, 300 - len(ipca_embedding)), mode='constant')
            return ipca_embedding
        elif embedding_array.ndim == 2 and embedding_array.shape[0] == 1:
            return embedding_array.flatten()
    return np.zeros(300)

# Apply the aggregation to menu and flavor datasets
menu_embedded["Aggregated_Embeddings"] = menu_embedded[embedding_col].apply(aggregate_embeddings_ipca)
flavor_embedded["Aggregated_Embeddings"] = flavor_embedded[embedding_col].apply(aggregate_embeddings_ipca)

Original Dish Count: 518
Cleaned Dish Count: 518


In [68]:
# Initialize HeteroData graph
data = HeteroData()

# --- Dish Nodes ---
menu_embedded = menu_embedded.reset_index(drop=True)
dish_features = np.stack(menu_embedded["Aggregated_Embeddings"].values)
dish_id_to_index = {str(dish_id): idx for idx, dish_id in enumerate(menu_embedded["dish_id"])}

# --- Ingredient Nodes ---
new_nodes = new_nodes.reset_index(drop=True)
ingredient_to_index = {ing: idx for idx, ing in enumerate(new_nodes["name"])}

# --- User Nodes ---
user_df = user_df.reset_index(drop=True)
user_id_to_index = {str(uid): idx for idx, uid in enumerate(sorted(user_df["user_id"].unique()))}


# --- User-Dish Edge Assignment ---
user_dish_edge_src = []
user_dish_edge_dst = []

for _, row in user_df.iterrows():
    uid = str(row["user_id"])  # Ensure user_id is a string
    dish_list = []

    # Process 'picked_dishes'
    picked = str(row.get("picked_dishes", ""))
    if picked and picked.lower() != "nan":
        dish_list.extend([str(d.strip()) for d in picked.split(",")])

    # Process 'recommended_pick'
    rec = str(row.get("recommended_pick", ""))
    if rec and rec.lower() != "nan":
        dish_list.extend([str(d.strip()) for d in rec.split(",")])

    # Assign edges
    for d in dish_list:
        if d in dish_id_to_index:
            user_dish_edge_src.append(user_id_to_index[uid])  # Map user_id to index
            user_dish_edge_dst.append(dish_id_to_index[d])    # Map dish_id to index

# --- Edges: Dish <-> Ingredient ---
dish_ing_edge_src = []
dish_ing_edge_dst = []
for idx, row in menu_embedded.iterrows():
    dish_idx = idx
    ing_str = row["ingredients_mapped"]
    if pd.notna(ing_str):
        ingredients = [ing.strip().lower() for ing in ing_str.split(",")]
        for ing in ingredients:
            if ing in ingredient_to_index:
                dish_ing_edge_src.append(dish_idx)
                dish_ing_edge_dst.append(ingredient_to_index[ing])

# Convert edge lists to tensors
data["user", "interacts", "dish"].edge_index = torch.tensor([user_dish_edge_src, user_dish_edge_dst], dtype=torch.long)
data["dish", "contains", "ingredient"].edge_index = torch.tensor([dish_ing_edge_src, dish_ing_edge_dst], dtype=torch.long)

# Ensure the graph is undirected
data = ToUndirected()(data)

In [69]:
# Ensure feature sizes match the number of nodes
num_users = len(user_id_to_index)
num_dishes = len(dish_id_to_index)
num_ingredients = len(ingredient_to_index)

# Assign node features
# Compute user features as the mean of their interacted dish embeddings
user_features = torch.zeros((len(user_id_to_index), 300), dtype=torch.float)  # Placeholder

for idx, row in user_df.iterrows():
    uid = str(row["user_id"])
    dish_list = []

    picked = str(row.get("picked_dishes", ""))
    if picked and picked.lower() != "nan":
        dish_list.extend([str(d.strip()) for d in picked.split(",")])

    rec = str(row.get("recommended_pick", ""))
    if rec and rec.lower() != "nan":
        dish_list.extend([str(d.strip()) for d in rec.split(",")])

    # Convert dish embeddings from numpy to tensor
    valid_dish_embeddings = [torch.tensor(dish_features[dish_id_to_index[d]], dtype=torch.float) 
                             for d in dish_list if d in dish_id_to_index]

    if valid_dish_embeddings:
        user_features[user_id_to_index[uid]] = torch.stack(valid_dish_embeddings).mean(dim=0)

# Assign user features to graph
data["user"].x = user_features

data["dish"].x = torch.tensor(np.stack(menu_embedded["Aggregated_Embeddings"].values), dtype=torch.float)  # Dishes: precomputed embeddings
data["ingredient"].x = torch.randn((num_ingredients, 300), dtype=torch.float)  # Ingredients: randomly initialized embeddings

# Validate feature shapes
print("User Node Features Shape:", data["user"].x.shape)
print("Dish Node Features Shape:", data["dish"].x.shape)
print("Ingredient Node Features Shape:", data["ingredient"].x.shape)


User Node Features Shape: torch.Size([10, 300])
Dish Node Features Shape: torch.Size([518, 300])
Ingredient Node Features Shape: torch.Size([6653, 300])


## Checking Nodes and Data

In [70]:
# Print Debug Information Locally
print("Node Types:", data.node_types)
print("Edge Types:", data.edge_types)
print("Number of Users:", len(user_id_to_index))
print("Number of Dishes:", len(dish_id_to_index))
print("Number of Ingredients:", len(ingredient_to_index))
print("User-Dish Edges:", data["user", "interacts", "dish"].edge_index.shape[1])
print("Dish-Ingredient Edges:", data["dish", "contains", "ingredient"].edge_index.shape[1])
print("User Node Features Shape:", data["user"].x.shape)
print("Dish Node Features Shape:", data["dish"].x.shape)
print("Ingredient Node Features Shape:", data["ingredient"].x.shape)

Node Types: ['user', 'dish', 'ingredient']
Edge Types: [('user', 'interacts', 'dish'), ('dish', 'contains', 'ingredient'), ('dish', 'rev_interacts', 'user'), ('ingredient', 'rev_contains', 'dish')]
Number of Users: 10
Number of Dishes: 518
Number of Ingredients: 6653
User-Dish Edges: 600
Dish-Ingredient Edges: 2656
User Node Features Shape: torch.Size([10, 300])
Dish Node Features Shape: torch.Size([518, 300])
Ingredient Node Features Shape: torch.Size([6653, 300])


In [71]:
print(data)

HeteroData(
  user={ x=[10, 300] },
  dish={ x=[518, 300] },
  ingredient={ x=[6653, 300] },
  (user, interacts, dish)={ edge_index=[2, 600] },
  (dish, contains, ingredient)={ edge_index=[2, 2656] },
  (dish, rev_interacts, user)={ edge_index=[2, 600] },
  (ingredient, rev_contains, dish)={ edge_index=[2, 2656] }
)


In [72]:
print("Nodes in data:", data.node_types)

Nodes in data: ['user', 'dish', 'ingredient']


In [73]:
print("Edge types:", data.edge_types)

Edge types: [('user', 'interacts', 'dish'), ('dish', 'contains', 'ingredient'), ('dish', 'rev_interacts', 'user'), ('ingredient', 'rev_contains', 'dish')]


## Splitting Data for Training and Testing

In [74]:
import random

# Convert edge list to tuples for easier manipulation
edges = list(zip(user_dish_edge_src, user_dish_edge_dst))

# Step 1: Shuffle the edges randomly
random.shuffle(edges)

# Step 2: Manually split into 80-10-10
num_edges = len(edges)
train_split = int(0.8 * num_edges)
val_split = int(0.9 * num_edges)  # 80% train, 10% val, 10% test

train_edges = edges[:train_split]
val_edges = edges[train_split:val_split]
test_edges = edges[val_split:]

# Step 3: Convert back to tensors
train_edge_index = torch.tensor(list(zip(*train_edges)), dtype=torch.long)
val_edge_index = torch.tensor(list(zip(*val_edges)), dtype=torch.long)
test_edge_index = torch.tensor(list(zip(*test_edges)), dtype=torch.long)

# Step 4: Assign edges back to separate datasets
train_data = data.clone()
train_data["user", "interacts", "dish"].edge_index = train_edge_index

val_data = data.clone()
val_data["user", "interacts", "dish"].edge_index = val_edge_index

test_data = data.clone()
test_data["user", "interacts", "dish"].edge_index = test_edge_index

# Step 5: Debug print to confirm correct split
print("✅ Train User-Dish Edges:", train_data["user", "interacts", "dish"].edge_index.shape[1])
print("✅ Validation User-Dish Edges:", val_data["user", "interacts", "dish"].edge_index.shape[1])
print("✅ Test User-Dish Edges:", test_data["user", "interacts", "dish"].edge_index.shape[1])


✅ Train User-Dish Edges: 480
✅ Validation User-Dish Edges: 60
✅ Test User-Dish Edges: 60


## Building Model

In [75]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# ✅ Define GraphSAGE Model with LSTM Aggregation
class GraphSAGE_LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=2):
        super(GraphSAGE_LSTM, self).__init__()
        self.num_layers = num_layers

        # Define GraphSAGE layers with LSTM aggregation
        self.convs = nn.ModuleList()
        for i in range(num_layers):
            in_dim = input_dim if i == 0 else hidden_dim
            self.convs.append(SAGEConv(in_dim, hidden_dim, aggr="lstm"))  # 🔥 LSTM Aggregation

        # Final MLP for link prediction
        self.fc = nn.Linear(hidden_dim * 2, 1)  # Two node embeddings → Single score

    def forward(self, x, edge_index):
        # Pass through GraphSAGE layers
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)  # Activation function

        return x

    def decode(self, z, edge_index):
        # Get embeddings for each node in the edge pairs
        z_src = z[edge_index[0]]
        z_dst = z[edge_index[1]]

        # Concatenate both embeddings and pass through the MLP
        pred = torch.sigmoid(self.fc(torch.cat([z_src, z_dst], dim=1)))
        return pred.view(-1)

In [88]:
# ✅ Sort edge_index by destination node (column 1)
def sort_edge_index(edge_index):
    sorted_indices = edge_index[1].argsort()  # Sort by destination node
    return edge_index[:, sorted_indices]

# Apply sorting
data["user", "interacts", "dish"].edge_index = sort_edge_index(data["user", "interacts", "dish"].edge_index)

print("✅ Edge Index Sorted Successfully!")


✅ Edge Index Sorted Successfully!


## Training and Evaluate

In [76]:
# ✅ Ensure edge_index is sorted by destination nodes (second column)
def sort_edge_index(edge_index):
    sorted_indices = edge_index[1].argsort()  # Sort by destination node
    return edge_index[:, sorted_indices]

# Apply sorting to all datasets
train_data["user", "interacts", "dish"].edge_index = sort_edge_index(train_data["user", "interacts", "dish"].edge_index)
val_data["user", "interacts", "dish"].edge_index = sort_edge_index(val_data["user", "interacts", "dish"].edge_index)
test_data["user", "interacts", "dish"].edge_index = sort_edge_index(test_data["user", "interacts", "dish"].edge_index)

# Debug: Check sorting correctness
print("✅ Sorted Train Edge Index:", train_data["user", "interacts", "dish"].edge_index[:, :10])  # Print first 10 sorted edges

✅ Sorted Train Edge Index: tensor([[ 7,  1,  1,  1,  1,  3,  9,  2,  3,  2],
        [ 4,  4,  4,  4,  4,  4, 25, 25, 28, 46]])


In [77]:
from torch_geometric.loader import NeighborLoader

# ✅ Training Function for GraphSAGE
def train(model, data, optimizer, criterion, num_epochs=20):
    model.train()
    
    for epoch in range(num_epochs):
        optimizer.zero_grad()

        # Extract node features for each type
        user_x = data["user"].x
        dish_x = data["dish"].x
        ingredient_x = data["ingredient"].x

        # Combine into a single tensor (order must match edge_index)
        x = torch.cat([user_x, dish_x, ingredient_x], dim=0)

        # Forward pass
        z = model(x, data["user", "interacts", "dish"].edge_index)  # Node embeddings
        pred = model.decode(z, data["user", "interacts", "dish"].edge_index)  # Edge predictions

        # Binary labels (all edges in train_data are positive)
        labels = torch.ones(pred.shape, dtype=torch.float)

        # Compute loss
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()

        if epoch % 5 == 0:
            print(f"Epoch {epoch} | Loss: {loss.item():.4f}")

    return model



In [78]:
# Define model, loss, and optimizer
input_dim = 300  # Same as node feature size
hidden_dim = 128  # Hidden layer size
num_layers = 2

model = GraphSAGE_LSTM(input_dim, hidden_dim, num_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss

# Train the model
model = train(model, train_data, optimizer, criterion, num_epochs=20)

Epoch 0 | Loss: 0.6793
Epoch 5 | Loss: 0.0000
Epoch 10 | Loss: 0.0000
Epoch 15 | Loss: 0.0000


In [79]:
import random
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score

# ✅ Generate negative samples
def sample_negative_edges(data, num_neg_samples=None):
    user_ids = list(range(data["user"].x.shape[0]))  # Get user indices
    dish_ids = list(range(data["dish"].x.shape[0]))  # Get dish indices
    
    pos_edges = set(zip(
        data["user", "interacts", "dish"].edge_index[0].tolist(),
        data["user", "interacts", "dish"].edge_index[1].tolist()
    ))

    neg_edges = set()
    num_neg_samples = num_neg_samples or len(pos_edges)  # Same number of negatives as positives

    while len(neg_edges) < num_neg_samples:
        u = random.choice(user_ids)
        d = random.choice(dish_ids)
        if (u, d) not in pos_edges:
            neg_edges.add((u, d))
    
    return list(neg_edges)

# ✅ Updated Evaluation Function
def evaluate(model, data):
    model.eval()

    with torch.no_grad():
        # Extract node features
        user_x = data["user"].x
        dish_x = data["dish"].x
        ingredient_x = data["ingredient"].x

        # Combine all node features into a single tensor
        x = torch.cat([user_x, dish_x, ingredient_x], dim=0)

        # Get positive edges
        pos_edges = data["user", "interacts", "dish"].edge_index

        # Get negative edges
        neg_edges = sample_negative_edges(data, num_neg_samples=pos_edges.shape[1])
        neg_edges = torch.tensor(list(zip(*neg_edges)), dtype=torch.long)

        # Combine positive & negative edges
        all_edges = torch.cat([pos_edges, neg_edges], dim=1)

        # Predict probabilities for all edges
        z = model(x, data["user", "interacts", "dish"].edge_index)
        pred = model.decode(z, all_edges)

        # Create labels (1 for positive edges, 0 for negative edges)
        labels = torch.cat([torch.ones(pos_edges.shape[1]), torch.zeros(neg_edges.shape[1])], dim=0)

        # Convert predictions to binary
        pred_binary = (pred > 0.5).float()

        # Compute metrics
        auc = roc_auc_score(labels.cpu().numpy(), pred.cpu().numpy())
        f1 = f1_score(labels.cpu().numpy(), pred_binary.cpu().numpy())

        print(f"AUC-ROC: {auc:.4f} | F1 Score: {f1:.4f}")

# ✅ Evaluate on validation set
evaluate(model, val_data)

# ✅ Evaluate on test set
evaluate(model, test_data)


AUC-ROC: 0.5000 | F1 Score: 0.6667
AUC-ROC: 0.5000 | F1 Score: 0.6667


In [80]:
print("User Feature Mean:", train_data["user"].x.mean().item())
print("Dish Feature Mean:", train_data["dish"].x.mean().item())
print("Ingredient Feature Mean:", train_data["ingredient"].x.mean().item())

User Feature Mean: 0.0005289397086016834
Dish Feature Mean: 0.00021455016394611448
Ingredient Feature Mean: -0.0004126960993744433


In [81]:
# Check variance of user, dish, and ingredient features
print("User Feature Variance:", train_data["user"].x.var().item())
print("Dish Feature Variance:", train_data["dish"].x.var().item())
print("Ingredient Feature Variance:", train_data["ingredient"].x.var().item())

# Check first 5 user embeddings
print("Sample User Embeddings:", train_data["user"].x[:5])

# Check first 5 dish embeddings
print("Sample Dish Embeddings:", train_data["dish"].x[:5])


User Feature Variance: 0.050580669194459915
Dish Feature Variance: 0.05679517984390259
Ingredient Feature Variance: 0.9975167512893677
Sample User Embeddings: tensor([[-0.3818, -0.1812, -0.0566,  ..., -0.1910, -0.1881, -0.1661],
        [-0.3129, -0.1601, -0.1108,  ..., -0.1060, -0.1099, -0.2797],
        [-0.3615, -0.2108, -0.2582,  ..., -0.1138, -0.3502, -0.0794],
        [-0.3597, -0.1699, -0.0604,  ..., -0.1998, -0.1425, -0.1612],
        [-0.4707, -0.1408, -0.1596,  ..., -0.1332, -0.1623, -0.0121]])
Sample Dish Embeddings: tensor([[-0.3295,  0.2405,  0.0800,  ..., -0.3857, -0.2247, -0.0543],
        [-0.1140, -0.0286, -0.4295,  ..., -0.0589, -0.0998, -0.2829],
        [-0.3558, -0.1527, -0.1033,  ..., -0.1148, -0.1160, -0.3739],
        [-0.5157,  0.1573,  0.2411,  ..., -0.2855, -0.0706, -0.1868],
        [-0.1648, -0.3328, -0.3008,  ..., -0.5351, -0.3513,  0.0387]])


In [90]:
# Count total dishes
total_dishes = len(dish_id_to_index)

# Count connected dishes
connected_dishes = len(set(data["user", "interacts", "dish"].edge_index[1].tolist()))

# Compute percentage
connected_percentage = (connected_dishes / total_dishes) * 100
unconnected_percentage = 100 - connected_percentage

print(f"✅ Connected Dishes: {connected_dishes}/{total_dishes} ({connected_percentage:.2f}%)")
print(f"⚠️ Unconnected Dishes: {total_dishes - connected_dishes}/{total_dishes} ({unconnected_percentage:.2f}%)")


✅ Connected Dishes: 68/518 (13.13%)
⚠️ Unconnected Dishes: 450/518 (86.87%)


In [86]:
# Check if edge_index is sorted
edge_index = data["user", "interacts", "dish"].edge_index
is_sorted = torch.all(edge_index[1][:-1] <= edge_index[1][1:])

print(f"Is Edge Index Sorted?: {is_sorted.item()}")


Is Edge Index Sorted?: False
