In [1]:
# === 0. Setup ===
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero_with_bases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# === 1. Load and Clean Data ===
products = pd.read_csv("sephora_data/product_info.csv")
products = products.rename(columns={'product_id': 'itemID'})

review_files = [
    "sephora_data/reviews_0-250.csv",
    "sephora_data/reviews_250-500.csv",
    "sephora_data/reviews_500-750.csv",
    "sephora_data/reviews_750-1250.csv",
    "sephora_data/reviews_1250-end.csv"
]
reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
reviews = reviews.rename(columns={'author_id': 'userID', 'product_id': 'itemID', 'rating': 'rating'})
reviews = reviews.dropna(subset=['userID', 'itemID', 'rating'])
reviews = reviews.sample(n=10000, random_state=42)
reviews['userID'] = reviews['userID'].astype(str)
reviews['itemID'] = reviews['itemID'].astype(str)
reviews['rating'] = reviews['rating'].astype(float)

valid_item_ids = set(products['itemID'])
interactions = reviews[reviews['itemID'].isin(valid_item_ids)]
products = products[products['itemID'].isin(interactions['itemID'].unique())]

user_id_map = {uid: i for i, uid in enumerate(interactions['userID'].unique())}
item_id_map = {iid: i for i, iid in enumerate(interactions['itemID'].unique())}
interactions['user_idx'] = interactions['userID'].map(user_id_map)
interactions['item_idx'] = interactions['itemID'].map(item_id_map)

train_data, test_data = train_test_split(interactions, test_size=0.2, random_state=42)


  reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
  reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
  reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)


In [3]:
# === 2. Build Graph ===
edge_index = torch.tensor([
    interactions['user_idx'].values,
    interactions['item_idx'].values
], dtype=torch.long)

data = HeteroData()
data['user'].num_nodes = len(user_id_map)
data['item'].num_nodes = len(item_id_map)
data['user'].x = torch.zeros((len(user_id_map), 32))
data['user', 'rates', 'item'].edge_index = edge_index
data['item', 'rev_rates', 'user'].edge_index = edge_index[[1, 0]]

  edge_index = torch.tensor([


In [4]:
# === 3. TF-IDF for item features ===
products['content'] = (
    products['product_name'].fillna('') + ' ' +
    products['brand_name'].fillna('') + ' ' +
    products['primary_category'].fillna('') + ' ' +
    products['secondary_category'].fillna('') + ' ' +
    products['highlights'].fillna('')
)
tfidf = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf.fit_transform(products['content'].astype(str)).toarray()
product_id_to_idx = {pid: i for i, pid in enumerate(products['itemID'])}
item_features = np.zeros((len(item_id_map), tfidf_matrix.shape[1]))
for item_id, graph_idx in item_id_map.items():
    if item_id in product_id_to_idx:
        item_features[graph_idx] = tfidf_matrix[product_id_to_idx[item_id]]
data['item'].x = torch.tensor(item_features, dtype=torch.float)

In [5]:
# === 4. GNN Model Definition ===
class ManualGNN(nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.user_conv1 = SAGEConv((-1, -1), hidden_channels)
        self.item_conv1 = SAGEConv((-1, -1), hidden_channels)
        self.user_conv2 = SAGEConv((-1, -1), hidden_channels)
        self.item_conv2 = SAGEConv((-1, -1), hidden_channels)

    def forward(self, x_dict, edge_index_dict):
        x_item = self.item_conv1((x_dict['user'], x_dict['item']), edge_index_dict[('user', 'rates', 'item')])
        x_user = self.user_conv1((x_dict['item'], x_dict['user']), edge_index_dict[('item', 'rev_rates', 'user')])
        x_item = F.relu(x_item)
        x_user = F.relu(x_user)
        x_item = self.item_conv2((x_user, x_item), edge_index_dict[('user', 'rates', 'item')])
        x_user = self.user_conv2((x_item, x_user), edge_index_dict[('item', 'rev_rates', 'user')])
        return {'user': x_user, 'item': x_item}

In [6]:
# === 5. Training Loop ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
model = ManualGNN(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

user_indices = torch.tensor(train_data['user_idx'].values, dtype=torch.long, device=device)
item_indices = torch.tensor(train_data['item_idx'].values, dtype=torch.long, device=device)
ratings = torch.tensor(train_data['rating'].values, dtype=torch.float, device=device)

model.train()
for epoch in range(1, 21):
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    user_vecs = out['user'][user_indices]
    item_vecs = out['item'][item_indices]
    preds = (user_vecs * item_vecs).sum(dim=1)
    loss = F.mse_loss(preds, ratings)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch:02d} | Loss: {loss.item():.4f}")


Epoch 01 | Loss: 20.0668
Epoch 02 | Loss: 11.7893
Epoch 03 | Loss: 1.6386
Epoch 04 | Loss: 41.1199
Epoch 05 | Loss: 1.7753
Epoch 06 | Loss: 5.9216
Epoch 07 | Loss: 10.9932
Epoch 08 | Loss: 13.3428
Epoch 09 | Loss: 14.1884
Epoch 10 | Loss: 14.3221
Epoch 11 | Loss: 14.0810
Epoch 12 | Loss: 13.5801
Epoch 13 | Loss: 12.8359
Epoch 14 | Loss: 11.8170
Epoch 15 | Loss: 10.4714
Epoch 16 | Loss: 8.7492
Epoch 17 | Loss: 6.6460
Epoch 18 | Loss: 4.3022
Epoch 19 | Loss: 2.2067
Epoch 20 | Loss: 1.4297


In [9]:
# === 6. Model Evaluation with Metrics ===
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error,
    accuracy_score, precision_score, recall_score, f1_score
)

model.eval()
with torch.no_grad():
    out = model(data.x_dict, data.edge_index_dict)
    
    # Test set
    user_test = torch.tensor(test_data['user_idx'].values, dtype=torch.long, device=device)
    item_test = torch.tensor(test_data['item_idx'].values, dtype=torch.long, device=device)
    ratings_test = torch.tensor(test_data['rating'].values, dtype=torch.float, device=device)

    # Get latent vectors
    user_vecs = out['user'][user_test]
    item_vecs = out['item'][item_test]
    preds = (user_vecs * item_vecs).sum(dim=1)

    # === Regression Metrics ===
    preds_np = preds.cpu().numpy()
    ratings_np = ratings_test.cpu().numpy()

    mse = mean_squared_error(ratings_np, preds_np)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(ratings_np, preds_np)

    print(f"\nRegression Metrics:")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MSE: {mse:.4f}")

    # === Classification Metrics ===
    # Define binary threshold (e.g. rating >= 4 is positive)
    threshold = 4.0
    binary_true = (ratings_np >= threshold).astype(int)
    binary_pred = (preds_np >= threshold).astype(int)

    acc = accuracy_score(binary_true, binary_pred)
    precision = precision_score(binary_true, binary_pred, zero_division=0)
    recall = recall_score(binary_true, binary_pred, zero_division=0)
    f1 = f1_score(binary_true, binary_pred, zero_division=0)

    print(f"\nClassification Metrics (Threshold = {threshold}):")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")


Regression Metrics:
MAE: 1.2541
RMSE: 1.7390
MSE: 3.0241

Classification Metrics (Threshold = 4.0):
Accuracy: 0.8190
Precision: 0.8190
Recall: 1.0000
F1 Score: 0.9005


In [None]:
#Interpretation:

#MAE < 1.5 is acceptable in many recommender systems, but could be improved.
#RMSE being higher than MAE indicates some big errors exist (outliers or poorly predicted users/items).
#Aiming to lower both via better GNNs or hyperparameter tuning.


#Recall = 1.0: The model is very cautious — it predicts nearly everything positive, so it doesn't miss any true likes.
#Precision = 0.819: Still quite good — though ~18% of your positive recommendations may be irrelevant.
#F1 = 0.9005: Strong balance between identifying all good items (recall) and being right when you say something is good (precision).

In [None]:
# === 6.1 Just the MSE (Can be ignored for later) ===
model.eval()
with torch.no_grad():
    out = model(data.x_dict, data.edge_index_dict)
    user_test = torch.tensor(test_data['user_idx'].values, dtype=torch.long, device=device)
    item_test = torch.tensor(test_data['item_idx'].values, dtype=torch.long, device=device)
    ratings_test = torch.tensor(test_data['rating'].values, dtype=torch.float, device=device)

    user_vecs = out['user'][user_test]
    item_vecs = out['item'][item_test]
    preds = (user_vecs * item_vecs).sum(dim=1)

    test_loss = F.mse_loss(preds, ratings_test)
    print(f"Test MSE: {test_loss.item():.4f}")

Test MSE: 3.0241


In [8]:
# === 7. Testing for one user ===
user_id = list(user_id_map.keys())[0]  # pick first user
user_idx = user_id_map[user_id]
user_vec = out['user'][user_idx]

scores = torch.matmul(out['item'], user_vec)
topk = torch.topk(scores, k=5)

recommended_item_indices = topk.indices.cpu().numpy()
reverse_item_map = {v: k for k, v in item_id_map.items()}
recommended_item_ids = [reverse_item_map[i] for i in recommended_item_indices]

print("Top 5 recommended itemIDs for user", user_id, ":", recommended_item_ids)


Top 5 recommended itemIDs for user 7634976581 : ['P450614', 'P482529', 'P503191', 'P442539', 'P448937']


In [10]:
# === 8. GAT Model Definition ===
from torch_geometric.nn import GATConv

class GATRecommender(nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.user_conv1 = GATConv((-1, -1), hidden_channels, heads=2, concat=False)
        self.item_conv1 = GATConv((-1, -1), hidden_channels, heads=2, concat=False)
        self.user_conv2 = GATConv((-1, -1), hidden_channels, heads=2, concat=False)
        self.item_conv2 = GATConv((-1, -1), hidden_channels, heads=2, concat=False)

    def forward(self, x_dict, edge_index_dict):
        x_item = self.item_conv1((x_dict['user'], x_dict['item']), edge_index_dict[('user', 'rates', 'item')])
        x_item = F.relu(x_item)

        x_user = self.user_conv1((x_dict['item'], x_dict['user']), edge_index_dict[('item', 'rev_rates', 'user')])
        x_user = F.relu(x_user)

        x_item = self.item_conv2((x_user, x_item), edge_index_dict[('user', 'rates', 'item')])
        x_item = F.relu(x_item)
        x_user = self.user_conv2((x_item, x_user), edge_index_dict[('item', 'rev_rates', 'user')])
        x_user = F.relu(x_user)

        return {'user': x_user, 'item': x_item}

In [11]:
# === 9. Training GAT Model ===
gat_model = GATRecommender(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(gat_model.parameters(), lr=0.005)

gat_model.train()
for epoch in range(1, 21):
    optimizer.zero_grad()
    out = gat_model(data.x_dict, data.edge_index_dict)

    user_vecs = out['user'][user_indices]
    item_vecs = out['item'][item_indices]

    preds = (user_vecs * item_vecs).sum(dim=1)
    loss = F.mse_loss(preds, ratings)

    loss.backward()
    optimizer.step()
    print(f"[GAT] Epoch {epoch:02d} | Loss: {loss.item():.4f}")

[GAT] Epoch 01 | Loss: 19.6992
[GAT] Epoch 02 | Loss: 19.4423
[GAT] Epoch 03 | Loss: 18.7199
[GAT] Epoch 04 | Loss: 16.9971
[GAT] Epoch 05 | Loss: 13.5137
[GAT] Epoch 06 | Loss: 7.8144
[GAT] Epoch 07 | Loss: 2.2884
[GAT] Epoch 08 | Loss: 11.2754
[GAT] Epoch 09 | Loss: 5.4946
[GAT] Epoch 10 | Loss: 2.0797
[GAT] Epoch 11 | Loss: 2.8334
[GAT] Epoch 12 | Loss: 4.4268
[GAT] Epoch 13 | Loss: 5.4494
[GAT] Epoch 14 | Loss: 5.6756
[GAT] Epoch 15 | Loss: 5.1837
[GAT] Epoch 16 | Loss: 4.1358
[GAT] Epoch 17 | Loss: 2.8298
[GAT] Epoch 18 | Loss: 1.8240
[GAT] Epoch 19 | Loss: 1.8706
[GAT] Epoch 20 | Loss: 2.9872


In [12]:
# === 10. Evaluate GAT Model ===
gat_model.eval()
with torch.no_grad():
    out = gat_model(data.x_dict, data.edge_index_dict)

    user_test = torch.tensor(test_data['user_idx'].values, dtype=torch.long, device=device)
    item_test = torch.tensor(test_data['item_idx'].values, dtype=torch.long, device=device)
    ratings_test = torch.tensor(test_data['rating'].values, dtype=torch.float, device=device)

    user_vecs = out['user'][user_test]
    item_vecs = out['item'][item_test]
    preds = (user_vecs * item_vecs).sum(dim=1)

    preds_np = preds.cpu().numpy()
    ratings_np = ratings_test.cpu().numpy()

    from sklearn.metrics import (
        mean_squared_error, mean_absolute_error,
        accuracy_score, precision_score, recall_score, f1_score
    )

    mse = mean_squared_error(ratings_np, preds_np)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(ratings_np, preds_np)

    binary_true = (ratings_np >= 4).astype(int)
    binary_pred = (preds_np >= 4).astype(int)

    acc = accuracy_score(binary_true, binary_pred)
    precision = precision_score(binary_true, binary_pred, zero_division=0)
    recall = recall_score(binary_true, binary_pred, zero_division=0)
    f1 = f1_score(binary_true, binary_pred, zero_division=0)

    print(f"\n[GAT Evaluation]")
    print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}, MSE: {mse:.4f}")
    print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


[GAT Evaluation]
MAE: 1.4319, RMSE: 1.8891, MSE: 3.5688
Accuracy: 0.7960, Precision: 0.8187, Recall: 0.9646, F1 Score: 0.8857


In [None]:
# Interpretations

#MAE (1.4319)
#Higher than GraphSAGE’s 1.2541 → on average, GAT makes slightly larger absolute errors.
#Suggests GAT is less precise in rating prediction, possibly because it's more focused on ranking rather than exact scores.

#RMSE (1.8891)
#Also higher than GraphSAGE’s 1.7390 → GAT makes more severe individual errors (squared error is more sensitive to outliers).

#But now the interesting part — Classification Metrics (Threshold ≥ 4)
#Accuracy & F1 Score: Slightly lower than GraphSAGE but still strong.
#Precision is almost identical (82%).
#Recall dropped slightly from perfect 1.0 to 0.9646 — which is still excellent and suggests GAT still captures most "liked" items.

#Conclusion for report:
#1. GAT does not outperform GraphSAGE in numeric accuracy (MAE/RMSE):
#GraphSAGE might be more stable in capturing exact rating values.
#This suggests that attention didn’t help the model better estimate exact ratings, at least with the current setup.

#2. GAT achieves nearly the same classification performance:
#F1, precision, and recall are all strong, with slightly more conservative behavior than GraphSAGE (lower recall).
#Suggests GAT is better at ranking top-N items based on general preference, even if predicted rating is off.

#Final note: While GAT introduces attention and slightly changes the model behavior, it does not outperform GraphSAGE in numeric error metrics (MAE/RMSE). 
# However, its classification performance remains strong, indicating it's still a reliable model for distinguishing liked products. 
# These results suggest that GraphSAGE may be more suitable for rating prediction, while GAT can still be useful in ranking or top-k recommendation settings.



In [13]:
# === 11. GAT Top 5 Recommendations for a User ===
user_vec = out['user'][user_idx]
scores = torch.matmul(out['item'], user_vec)
topk = torch.topk(scores, k=5)
recommended_item_indices = topk.indices.cpu().numpy()
recommended_item_ids = [reverse_item_map[i] for i in recommended_item_indices]

print("Top 5 recommended itemIDs by GAT for user", user_id, ":", recommended_item_ids)

Top 5 recommended itemIDs by GAT for user 7634976581 : ['P481347', 'P482320', 'P503191', 'P504644', 'P466155']
