In [18]:
# === 0. Setup ===
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero_with_bases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, mean_absolute_error

In [19]:
import os
import pandas as pd

# === 1. Loading and Cleaning Product Info ===
products = pd.read_csv("sephora_data/product_info.csv")
products = products.rename(columns={'product_id': 'itemID'})


In [20]:
# === 2. Loading and Combining All Reviews ===
review_files = [
    "sephora_data/reviews_0-250.csv",
    "sephora_data/reviews_250-500.csv",
    "sephora_data/reviews_500-750.csv",
    "sephora_data/reviews_750-1250.csv",
    "sephora_data/reviews_1250-end.csv"
]

reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
reviews = reviews.rename(columns={'author_id': 'userID', 'product_id': 'itemID', 'rating': 'rating'})
reviews = reviews.dropna(subset=['userID', 'itemID', 'rating'])

# TEMP: Reduce data size to avoid memory crash
reviews = reviews.sample(n=10000, random_state=42)

  reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
  reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
  reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)


In [21]:
# Converting to strings/numeric
reviews['userID'] = reviews['userID'].astype(str)
reviews['itemID'] = reviews['itemID'].astype(str)
reviews['rating'] = reviews['rating'].astype(float)

In [22]:
# === 3. Saving Preprocessed Interactions for GNN Use ===
interactions = reviews[['userID', 'itemID', 'rating']]
interactions.to_csv("interactions.csv", index=False)

In [23]:
interactions = pd.read_csv("interactions.csv")

In [25]:
# === 4. Building Graph for GNN Model ===
import torch
from torch_geometric.data import HeteroData
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# === 4.1 Loading Preprocessed Interactions and Products ===
#interactions = pd.read_csv("interactions.csv")
products = pd.read_csv("sephora_data/product_info.csv")
products = products.rename(columns={'product_id': 'itemID'})


# Keep only interactions with products that still exist
valid_item_ids = set(products['itemID'])
interactions = interactions[interactions['itemID'].isin(valid_item_ids)]

products = products[products['itemID'].isin(interactions['itemID'].unique())]  # filtering products to only include items in the sampled interactions


In [26]:
# === 4.2 Mapping IDs to Indexes ===
user_id_map = {uid: i for i, uid in enumerate(interactions['userID'].unique())}
item_id_map = {iid: i for i, iid in enumerate(interactions['itemID'].unique())}

interactions['user_idx'] = interactions['userID'].map(user_id_map)
interactions['item_idx'] = interactions['itemID'].map(item_id_map)

In [27]:
# === 4.3 Building Edge Index ===
edge_index = torch.tensor([
    interactions['user_idx'].values,
    interactions['item_idx'].values
], dtype=torch.long)

In [28]:
# === 4.4 Initializing Graph ===
data = HeteroData()
data['user'].num_nodes = len(user_id_map)
data['item'].num_nodes = len(item_id_map)
data['user'].x = torch.zeros((len(user_id_map), 32)) #to give users identity matrix as features
data['user', 'rates', 'item'].edge_index = edge_index
data['item', 'rev_rates', 'user'].edge_index = edge_index[[1, 0]]

In [29]:
# === 4.5 Building TF-IDF Features for Products ===

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import torch

# === Build 'content' field for TF-IDF ===
products['content'] = (
    products['product_name'].fillna('') + ' ' +
    products['brand_name'].fillna('') + ' ' +
    products['primary_category'].fillna('') + ' ' +
    products['secondary_category'].fillna('') + ' ' +
    products['highlights'].fillna('')  # Optional: add 'ingredients' too
)

# === TF-IDF ===
tfidf = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf.fit_transform(products['content'].astype(str)).toarray()

# === Map itemID to index in tfidf_matrix ===
product_id_to_idx = {pid: i for i, pid in enumerate(products['itemID'])}

# === Align TF-IDF matrix to itemID order in graph ===
item_features = np.zeros((len(item_id_map), tfidf_matrix.shape[1]))
for item_id, graph_idx in item_id_map.items():
    if item_id in product_id_to_idx:
        item_features[graph_idx] = tfidf_matrix[product_id_to_idx[item_id]]

# === Add to PyG HeteroData graph ===
data['item'].x = torch.tensor(item_features, dtype=torch.float)

In [30]:
print("Graph is ready:")
print(data)

Graph is ready:
HeteroData(
  user={
    num_nodes=9734,
    x=[9734, 32],
  },
  item={
    num_nodes=1494,
    x=[1494, 100],
  },
  (user, rates, item)={ edge_index=[2, 10000] },
  (item, rev_rates, user)={ edge_index=[2, 10000] }
)


In [31]:
# === 5. Define GNN Model - Since to_hetero() is broken in my setup, I manually processed each edge type. This works in any version. ===
class ManualGNN(nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.user_conv1 = SAGEConv((-1, -1), hidden_channels)
        self.item_conv1 = SAGEConv((-1, -1), hidden_channels)
        self.user_conv2 = SAGEConv((-1, -1), hidden_channels)
        self.item_conv2 = SAGEConv((-1, -1), hidden_channels)

    def forward(self, x_dict, edge_index_dict):
        # First layer
        x_user = self.user_conv1((x_dict['user'], x_dict['item']), edge_index_dict[('user', 'rates', 'item')])
        x_item = self.item_conv1((x_dict['item'], x_dict['user']), edge_index_dict[('item', 'rev_rates', 'user')])
        x_user = F.relu(x_user)
        x_item = F.relu(x_item)

        # Second layer
        x_user = self.user_conv2((x_user, x_item), edge_index_dict[('user', 'rates', 'item')])
        x_item = self.item_conv2((x_item, x_user), edge_index_dict[('item', 'rev_rates', 'user')])

        return {'user': x_user, 'item': x_item}


In [32]:
# === 6. Train/Test Split ===
train_data, test_data = train_test_split(interactions, test_size=0.2, random_state=42)

In [33]:
# i need to sample the training loop size bc my laptop crashes everytime i try to run section 7

In [None]:
# === 7. Training Loop ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

encoder = ManualGNN(hidden_channels=64)
model = encoder.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model.train()
for epoch in range(1, 21):
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    user_emb = out['user']
    item_emb = out['item']

    user_vecs = user_emb[train_data['user_idx'].values]
    item_vecs = item_emb[train_data['item_idx'].values]
    preds = (user_vecs * item_vecs).sum(dim=1)

    loss = F.mse_loss(preds, torch.tensor(train_data['rating'].values, dtype=torch.float, device=device))
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch:02d} | Loss: {loss.item():.4f}")


IndexError: Found indices in 'edge_index' that are larger than 1493 (got 9733). Please ensure that all indices in 'edge_index' point to valid indices in the interval [0, 1494) in your node feature matrix and try again.