In [1]:
# === 0. Setup ===
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero_with_bases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# === 1. Load and Clean Data ===
products = pd.read_csv("sephora_data/product_info.csv")
products = products.rename(columns={'product_id': 'itemID'})

review_files = [
    "sephora_data/reviews_0-250.csv",
    "sephora_data/reviews_250-500.csv",
    "sephora_data/reviews_500-750.csv",
    "sephora_data/reviews_750-1250.csv",
    "sephora_data/reviews_1250-end.csv"
]
reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
reviews = reviews.rename(columns={'author_id': 'userID', 'product_id': 'itemID', 'rating': 'rating'})
reviews = reviews.dropna(subset=['userID', 'itemID', 'rating'])
reviews = reviews.sample(n=10000, random_state=42)
reviews['userID'] = reviews['userID'].astype(str)
reviews['itemID'] = reviews['itemID'].astype(str)
reviews['rating'] = reviews['rating'].astype(float)

valid_item_ids = set(products['itemID'])
interactions = reviews[reviews['itemID'].isin(valid_item_ids)]
products = products[products['itemID'].isin(interactions['itemID'].unique())]

user_id_map = {uid: i for i, uid in enumerate(interactions['userID'].unique())}
item_id_map = {iid: i for i, iid in enumerate(interactions['itemID'].unique())}
interactions['user_idx'] = interactions['userID'].map(user_id_map)
interactions['item_idx'] = interactions['itemID'].map(item_id_map)

train_data, test_data = train_test_split(interactions, test_size=0.2, random_state=42)


  reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
  reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
  reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)


In [3]:
# === 2. Build Graph ===
edge_index = torch.tensor([
    interactions['user_idx'].values,
    interactions['item_idx'].values
], dtype=torch.long)

data = HeteroData()
data['user'].num_nodes = len(user_id_map)
data['item'].num_nodes = len(item_id_map)
data['user'].x = torch.zeros((len(user_id_map), 32))
data['user', 'rates', 'item'].edge_index = edge_index
data['item', 'rev_rates', 'user'].edge_index = edge_index[[1, 0]]

  edge_index = torch.tensor([


In [4]:
# === 3. TF-IDF for item features ===
products['content'] = (
    products['product_name'].fillna('') + ' ' +
    products['brand_name'].fillna('') + ' ' +
    products['primary_category'].fillna('') + ' ' +
    products['secondary_category'].fillna('') + ' ' +
    products['highlights'].fillna('')
)
tfidf = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf.fit_transform(products['content'].astype(str)).toarray()
product_id_to_idx = {pid: i for i, pid in enumerate(products['itemID'])}
item_features = np.zeros((len(item_id_map), tfidf_matrix.shape[1]))
for item_id, graph_idx in item_id_map.items():
    if item_id in product_id_to_idx:
        item_features[graph_idx] = tfidf_matrix[product_id_to_idx[item_id]]
data['item'].x = torch.tensor(item_features, dtype=torch.float)

In [5]:
# === 4. GNN Model Definition ===
class ManualGNN(nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.user_conv1 = SAGEConv((-1, -1), hidden_channels)
        self.item_conv1 = SAGEConv((-1, -1), hidden_channels)
        self.user_conv2 = SAGEConv((-1, -1), hidden_channels)
        self.item_conv2 = SAGEConv((-1, -1), hidden_channels)

    def forward(self, x_dict, edge_index_dict):
        x_item = self.item_conv1((x_dict['user'], x_dict['item']), edge_index_dict[('user', 'rates', 'item')])
        x_user = self.user_conv1((x_dict['item'], x_dict['user']), edge_index_dict[('item', 'rev_rates', 'user')])
        x_item = F.relu(x_item)
        x_user = F.relu(x_user)
        x_item = self.item_conv2((x_user, x_item), edge_index_dict[('user', 'rates', 'item')])
        x_user = self.user_conv2((x_item, x_user), edge_index_dict[('item', 'rev_rates', 'user')])
        return {'user': x_user, 'item': x_item}

In [6]:
# === 5. Training Loop ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
model = ManualGNN(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

user_indices = torch.tensor(train_data['user_idx'].values, dtype=torch.long, device=device)
item_indices = torch.tensor(train_data['item_idx'].values, dtype=torch.long, device=device)
ratings = torch.tensor(train_data['rating'].values, dtype=torch.float, device=device)

model.train()
for epoch in range(1, 21):
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    user_vecs = out['user'][user_indices]
    item_vecs = out['item'][item_indices]
    preds = (user_vecs * item_vecs).sum(dim=1)
    loss = F.mse_loss(preds, ratings)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch:02d} | Loss: {loss.item():.4f}")


Epoch 01 | Loss: 20.0668
Epoch 02 | Loss: 11.7893
Epoch 03 | Loss: 1.6386
Epoch 04 | Loss: 41.1199
Epoch 05 | Loss: 1.7753
Epoch 06 | Loss: 5.9216
Epoch 07 | Loss: 10.9932
Epoch 08 | Loss: 13.3428
Epoch 09 | Loss: 14.1884
Epoch 10 | Loss: 14.3221
Epoch 11 | Loss: 14.0810
Epoch 12 | Loss: 13.5801
Epoch 13 | Loss: 12.8359
Epoch 14 | Loss: 11.8170
Epoch 15 | Loss: 10.4714
Epoch 16 | Loss: 8.7492
Epoch 17 | Loss: 6.6460
Epoch 18 | Loss: 4.3022
Epoch 19 | Loss: 2.2067
Epoch 20 | Loss: 1.4297


In [7]:
# === 6. Model Evaluation ===
model.eval()
with torch.no_grad():
    out = model(data.x_dict, data.edge_index_dict)
    user_test = torch.tensor(test_data['user_idx'].values, dtype=torch.long, device=device)
    item_test = torch.tensor(test_data['item_idx'].values, dtype=torch.long, device=device)
    ratings_test = torch.tensor(test_data['rating'].values, dtype=torch.float, device=device)

    user_vecs = out['user'][user_test]
    item_vecs = out['item'][item_test]
    preds = (user_vecs * item_vecs).sum(dim=1)

    test_loss = F.mse_loss(preds, ratings_test)
    print(f"Test MSE: {test_loss.item():.4f}")

Test MSE: 3.0241


In [8]:
# === 7. Testing for one user ===
user_id = list(user_id_map.keys())[0]  # pick first user
user_idx = user_id_map[user_id]
user_vec = out['user'][user_idx]

scores = torch.matmul(out['item'], user_vec)
topk = torch.topk(scores, k=5)

recommended_item_indices = topk.indices.cpu().numpy()
reverse_item_map = {v: k for k, v in item_id_map.items()}
recommended_item_ids = [reverse_item_map[i] for i in recommended_item_indices]

print("Top 5 recommended itemIDs for user", user_id, ":", recommended_item_ids)


Top 5 recommended itemIDs for user 7634976581 : ['P450614', 'P482529', 'P503191', 'P442539', 'P448937']
