In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [None]:
from datasets import load_dataset
import pandas as pd
from google.colab import drive
import random as random
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset with streaming enabled
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Home_and_Kitchen",
    streaming=True,  # Enable streaming to handle large data
    trust_remote_code=True
)

subset = []

for count, row in enumerate(dataset["full"]):
    subset.append(row)
    if count + 1 == 100000:
        break

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(subset)

# Define the path to save the CSV in Google Drive
output_path = "/content/drive/My Drive/home_reviews_100000.csv"

# Save the DataFrame to CSV
df.to_csv(output_path, index=False)

print(f"CSV file saved at: {output_path}")

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

CSV file saved at: /content/drive/My Drive/home_reviews_100000.csv


In [None]:
df = pd.read_csv("/content/drive/My Drive/home_reviews_100000.csv")
print("columns:", df.columns.tolist())
# make sure you see ['user_id','asin','rating', …]


columns: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']


In [None]:
print(df.head())
print(df.columns)
print(df.info())

   rating                                            title  \
0     1.0   Received Used & scratched item! Purchased new!   
1     5.0         Excellent for moving & storage & floods!   
2     2.0  Lid very loose- needs a gasket imo. Small base.   
3     5.0                              Best purchase ever!   
4     5.0                              Excellent for yarn!   

                                                text  \
0  Livid.  Once again received an obviously used ...   
1  I purchased these for multiple reasons. The ma...   
2  [[VIDEOID:c87e962bc893a948856b0f1b285ce6cc]] I...   
3  If you live at a higher elevation like me (5k ...   
4  I use these to store yarn. They easily hold 12...   

                                              images        asin parent_asin  \
0                                                 []  B007WQ9YNO  B09XWYG6X1   
1                                                 []  B09H2VJW6K  B0BXDLF8TW   
2  [{'small_image_url': 'https://m.media-amazon.c.

In [None]:
# ─── 2) FILTER & SAMPLE ────────────────────────────────────────────────────────
# only users with >5 reviews
user_counts = df["user_id"].value_counts()
good_users  = user_counts[user_counts > 5].index
df = df[df["user_id"].isin(good_users)].copy()

# sample 1,000 users (optional)
import random
chosen = random.sample(good_users.tolist(), 1_000)
df = df[df["user_id"].isin(chosen)].reset_index(drop=True)

# ─── 3) LABEL & ENCODE ─────────────────────────────────────────────────────────
# binary label: rating >= 4 → positive
df["label"] = (df["rating"] >= 4).astype(int)

# build lookup maps
users = df["user_id"].unique().tolist()
items = df["asin"   ].unique().tolist()
user2idx = {u:i for i,u in enumerate(users, start=1)}
item2idx = {a:i for i,a in enumerate(items, start=1)}

df["user_idx"] = df["user_id"].map(user2idx).fillna(0).astype(int)
df["item_idx"] = df["asin"   ].map(item2idx).fillna(0).astype(int)

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
# ─── 4) SPLIT ─────────────────────────────────────────────────────────────────
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# ─── 5) DATASET ────────────────────────────────────────────────────────────────
class FMDataset(Dataset):
    def __init__(self, df):
        self.u = torch.LongTensor(df["user_idx"].values)
        self.i = torch.LongTensor(df["item_idx"].values)
        self.y = torch.FloatTensor(df["label"].values)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx):
        Xi = torch.stack([self.u[idx], self.i[idx]]).unsqueeze(1)  # [fields=2,1]
        Xv = torch.ones_like(Xi, dtype=torch.float)               # all weights=1
        return Xi, Xv, self.y[idx]

batch_size    = 512
train_loader  = DataLoader(FMDataset(train_df), batch_size, shuffle=True)
val_loader    = DataLoader(FMDataset(val_df),   batch_size)

n_users = len(user2idx) + 1
n_items = len(item2idx) + 1

In [None]:

# ─── 6) DeepFM ────────────────────────────────────────────────────────────────
class DeepFM(nn.Module):
    def __init__(self, feature_sizes, emb_dim=8, hidden_dims=[32,32]):
        super().__init__()
        self.fm1 = nn.ModuleList([nn.Embedding(fs, 1)      for fs in feature_sizes])
        self.fm2 = nn.ModuleList([nn.Embedding(fs, emb_dim) for fs in feature_sizes])
        all_dims = [len(feature_sizes)*emb_dim] + hidden_dims
        self.linears = nn.ModuleList(
            nn.Linear(all_dims[i], all_dims[i+1])
            for i in range(len(hidden_dims))
        )
        self.out  = nn.Linear(all_dims[-1], 1)
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, Xi, Xv):
        B = Xi.size(0)
        # 1st‐order
        fm1_terms = []
        for i, emb in enumerate(self.fm1):
            idx = Xi[:,i,0]                   # [B]
            w   = Xv[:,i,0].unsqueeze(1)      # [B,1]
            fm1_terms.append(emb(idx)*w)      # [B,1]
        fm1 = torch.cat(fm1_terms, dim=1)     # [B,2]

        # 2nd‐order
        v_terms = []
        for i, emb in enumerate(self.fm2):
            idx = Xi[:,i,0]
            w   = Xv[:,i,0].unsqueeze(1)
            v_terms.append(emb(idx)*w)        # [B,emb_dim]
        summed    = sum(v_terms)
        summed_sq = summed*summed
        sq_sum    = sum(v*v for v in v_terms)
        fm2       = 0.5*(summed_sq - sq_sum) # [B,emb_dim]

        # deep part
        x = torch.cat(v_terms, dim=1)        # [B,2*emb_dim]
        for lin in self.linears:
            x = F.relu(lin(x))
        deep_out = self.out(x).squeeze(1)

        # combine
        return fm1.sum(1) + fm2.sum(1) + deep_out + self.bias  # [B]


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeepFM([n_users, n_items], emb_dim=8, hidden_dims=[32,32]).to(device)
opt   = torch.optim.Adam(model.parameters(), lr=1e-3)
crit  = nn.BCEWithLogitsLoss()


In [None]:
# ─── 7) TRAIN & VALIDATE ──────────────────────────────────────────────────────
def evaluate(loader):
    model.eval()
    tot, n = 0.0, 0
    with torch.no_grad():
        for Xi, Xv, y in loader:
            Xi, Xv, y = Xi.to(device), Xv.to(device), y.to(device)
            pred = model(Xi, Xv)
            tot += crit(pred, y).item()*y.size(0)
            n   += y.size(0)
    return tot/n

for ep in range(1,11):
    model.train()
    run, n = 0.0, 0
    for Xi, Xv, y in train_loader:
        Xi, Xv, y = Xi.to(device), Xv.to(device), y.to(device)
        opt.zero_grad()
        pred = model(Xi, Xv)
        loss = crit(pred, y)
        loss.backward()
        opt.step()
        run += loss.item()*y.size(0)
        n   += y.size(0)
    print(f"Epoch {ep} — train_loss: {run/n:.4f}, val_loss: {evaluate(val_loader):.4f}")

Epoch 1 — train_loss: 1.2375, val_loss: 1.0490
Epoch 2 — train_loss: 0.8949, val_loss: 0.8118
Epoch 3 — train_loss: 0.7816, val_loss: 0.7894
Epoch 4 — train_loss: 0.7520, val_loss: 0.7746
Epoch 5 — train_loss: 0.7280, val_loss: 0.7628
Epoch 6 — train_loss: 0.7062, val_loss: 0.7531
Epoch 7 — train_loss: 0.6866, val_loss: 0.7435
Epoch 8 — train_loss: 0.6674, val_loss: 0.7343
Epoch 9 — train_loss: 0.6497, val_loss: 0.7271
Epoch 10 — train_loss: 0.6327, val_loss: 0.7200


In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score

def compute_metrics(loader):
    model.eval()
    ys, ps = [], []
    with torch.no_grad():
        for Xi, Xv, y in loader:
            Xi, Xv = Xi.to(device), Xv.to(device)
            logits = model(Xi, Xv)           # [B]
            probas = torch.sigmoid(logits)   # [B]
            ys.append(y.cpu().numpy())
            ps.append(probas.cpu().numpy())
    y_true = np.concatenate(ys)
    y_pred = np.concatenate(ps)
    auc  = roc_auc_score(y_true, y_pred)
    pred_labels = (y_pred >= 0.5).astype(int)
    acc  = accuracy_score(y_true, pred_labels)
    print(f" AUC={auc:.4f},  Acc={acc:.4f}")

# after training:
compute_metrics(train_loader)
compute_metrics(val_loader)

 AUC=0.5892,  Acc=0.8016
 AUC=0.5321,  Acc=0.7740


In [None]:
import numpy as np
import torch

# reverse‐lookup from idx back to ASIN
idx2item   = {v:k for k,v in item2idx.items()}
# lookup from ASIN → title (from your merged df)
item2title = df.drop_duplicates("asin").set_index("asin")["title"].to_dict()

def recommend_top_k(model, user_raw, k=5):
    model.eval()
    # 1) get user index
    u_idx = user2idx.get(user_raw, 0)
    # 2) find items they already rated in train
    seen_asins = set(train_df[train_df["user_id"]==user_raw]["asin"])
    # 3) build candidate ASIN list
    candidates = [a for a in items if a not in seen_asins]
    N = len(candidates)
    if N==0: return []
    # 4) build tensors
    u_tensor   = torch.LongTensor([u_idx]*N).to(device)
    i_tensor   = torch.LongTensor([item2idx[a] for a in candidates]).to(device)
    Xi         = torch.stack([u_tensor, i_tensor], dim=1).unsqueeze(2)  # [N,2,1]
    Xv         = torch.ones_like(Xi, dtype=torch.float)
    # 5) score
    with torch.no_grad():
        logits  = model(Xi, Xv)               # [N]
        probs   = torch.sigmoid(logits).cpu().numpy()
    # 6) pick top-k
    idxs    = np.argsort(probs)[::-1][:k]
    results = []
    for i in idxs:
        asin  = candidates[i]
        title = item2title.get(asin, "")
        p     = float(probs[i])
        results.append((asin, title, p))
    return results

In [None]:
user = train_df.iloc[0]["user_id"]
for asin, title, score in recommend_top_k(model, user, k=5):
    print(f"{asin} → {title}  (p={score:.4f})")

B00MIN72JO → Four Stars  (p=1.0000)
B00IOTNDDA → BETTER THAN REAL....  (p=1.0000)
B00EDN43OE → This chair is cute but not overly comfortable. It is really packed full of fill  (p=1.0000)
B0755RJ9C8 → Comes in Handy  (p=1.0000)
B0765BTD13 → Does the job!  (p=1.0000)


In [17]:
def recommend_and_compare(model, user_raw, k=5):
    # 1) Grab their past purchases (in training set)
    seen_mask = train_df["user_id"] == user_raw
    past_asins = train_df.loc[seen_mask, "asin"].unique().tolist()
    past_titles = [item2title[a] for a in past_asins]

    # 2) Get top-k recs
    recs = recommend_top_k(model, user_raw, k)

    return past_asins, past_titles, recs

# Usage for the first user in train_df:
user = train_df.iloc[0]["user_id"]
past_asins, past_titles, recs = recommend_and_compare(model, user, k=5)

print(f"User {user} — past purchases:")
for a, t in zip(past_asins, past_titles):
    print(f"  {a} → {t}")

print("\nTop-5 DeepFM recommendations:")
for asin, title, score in recs:
    print(f"  {asin} → {title}  (p={score:.4f})")


User AF5UYBKAI373BZMFTLUQIYNXX4PA — past purchases:
  B075N9Q4KW → When grinding meat it would be easier with a second pair of hands.
  B079VW8D2R → Great set, great price!
  B00DN6T6LM → Great blanket
  B00TFBQBTO → Looks great
  B0BCQSP57G → Did not meet my expectation
  B00JZX3U8C → Will rust
  B084FXW67J → Great
  B00E9UNNB0 → Easy to use
  B07YFJ1QF1 → Great product
  B08TRM4V9J → Easy to use
  B000BPILY6 → They melt
  B099915KWZ → nice
  B08G1STVQK → Great product for the price paid.
  B000U9WXEC → Cheap product!!!!
  B010TCP3SC → Great product

Top-5 DeepFM recommendations:
  B00MIN72JO → Four Stars  (p=1.0000)
  B00IOTNDDA → BETTER THAN REAL....  (p=1.0000)
  B00EDN43OE → This chair is cute but not overly comfortable. It is really packed full of fill  (p=1.0000)
  B0755RJ9C8 → Comes in Handy  (p=1.0000)
  B0765BTD13 → Does the job!  (p=1.0000)


In [26]:
df = pd.read_csv("/content/drive/My Drive/home_reviews_joined_364000.csv")


In [41]:
print(df.columns.tolist())

['rating', 'review_title', 'text', 'asin', 'parent_asin', 'user_id', 'timestamp', 'verified_purchase', 'main_category', 'product_title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'store', 'categories', 'details', 'label', 'user_idx', 'item_idx']


In [42]:
print(df.head())
print(df.columns)
print(df.info())

   rating              review_title  \
0     1.0  So matcha disappointment   
1     5.0          Perfect mattress   
2     5.0           Great bed frame   
3     5.0         Great bar stools!   
4     4.0                 It works.   

                                                text        asin parent_asin  \
0  I bought this after reading a TON of reviews a...  B08GP9PFWG  B08GP9PFWG   
1  OK, we bought this mattress for our guest room...  B0777K9RGX  B0BPBLYF85   
2  Husband reports that the frame was easy to ass...  B07GX9RBN7  B0BJZ4Y8K1   
3  These stools (we purchased the cherry color) a...  B005EUJ5O8  B079C5FPW8   
4  I wanted a small, inexpensive entryway shoe ra...  B002IPG46Y  B0778KV29D   

                        user_id      timestamp  verified_purchase  \
0  AGGZ357AO26RQZVRLGU4D4N52DZQ  1644358432328               True   
1  AGGZ357AO26RQZVRLGU4D4N52DZQ  1578276993373               True   
2  AGGZ357AO26RQZVRLGU4D4N52DZQ  1578276798605               True   
3  AGGZ3

In [43]:
# ─── 2) FILTER & SAMPLE ────────────────────────────────────────────────────────
# only users with >5 reviews
user_counts = df["user_id"].value_counts()
good_users  = user_counts[user_counts > 5].index
df = df[df["user_id"].isin(good_users)].copy()

# sample 1,000 users (optional)
import random
chosen = random.sample(good_users.tolist(), 1_000)
df = df[df["user_id"].isin(chosen)].reset_index(drop=True)

# ─── 3) LABEL & ENCODE ─────────────────────────────────────────────────────────
# binary label: rating >= 4 → positive
df["label"] = (df["rating"] >= 4).astype(int)

# build lookup maps
users = df["user_id"].unique().tolist()
items = df["asin"   ].unique().tolist()
user2idx = {u:i for i,u in enumerate(users, start=1)}
item2idx = {a:i for i,a in enumerate(items, start=1)}

df["user_idx"] = df["user_id"].map(user2idx).fillna(0).astype(int)
df["item_idx"] = df["asin"   ].map(item2idx).fillna(0).astype(int)

# ─── 4) SPLIT ─────────────────────────────────────────────────────────────────
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# ─── 5) DATASET ────────────────────────────────────────────────────────────────
class FMDataset(Dataset):
    def __init__(self, df):
        self.u = torch.LongTensor(df["user_idx"].values)
        self.i = torch.LongTensor(df["item_idx"].values)
        self.y = torch.FloatTensor(df["label"].values)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx):
        Xi = torch.stack([self.u[idx], self.i[idx]]).unsqueeze(1)  # [fields=2,1]
        Xv = torch.ones_like(Xi, dtype=torch.float)               # all weights=1
        return Xi, Xv, self.y[idx]

batch_size    = 512
train_loader  = DataLoader(FMDataset(train_df), batch_size, shuffle=True)
val_loader    = DataLoader(FMDataset(val_df),   batch_size)

n_users = len(user2idx) + 1
n_items = len(item2idx) + 1

# ─── 6) DeepFM ────────────────────────────────────────────────────────────────
class DeepFM(nn.Module):
    def __init__(self, feature_sizes, emb_dim=8, hidden_dims=[32,32]):
        super().__init__()
        self.fm1 = nn.ModuleList([nn.Embedding(fs, 1)      for fs in feature_sizes])
        self.fm2 = nn.ModuleList([nn.Embedding(fs, emb_dim) for fs in feature_sizes])
        all_dims = [len(feature_sizes)*emb_dim] + hidden_dims
        self.linears = nn.ModuleList(
            nn.Linear(all_dims[i], all_dims[i+1])
            for i in range(len(hidden_dims))
        )
        self.out  = nn.Linear(all_dims[-1], 1)
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, Xi, Xv):
        B = Xi.size(0)
        # 1st‐order
        fm1_terms = []
        for i, emb in enumerate(self.fm1):
            idx = Xi[:,i,0]                   # [B]
            w   = Xv[:,i,0].unsqueeze(1)      # [B,1]
            fm1_terms.append(emb(idx)*w)      # [B,1]
        fm1 = torch.cat(fm1_terms, dim=1)     # [B,2]

        # 2nd‐order
        v_terms = []
        for i, emb in enumerate(self.fm2):
            idx = Xi[:,i,0]
            w   = Xv[:,i,0].unsqueeze(1)
            v_terms.append(emb(idx)*w)        # [B,emb_dim]
        summed    = sum(v_terms)
        summed_sq = summed*summed
        sq_sum    = sum(v*v for v in v_terms)
        fm2       = 0.5*(summed_sq - sq_sum) # [B,emb_dim]

        # deep part
        x = torch.cat(v_terms, dim=1)        # [B,2*emb_dim]
        for lin in self.linears:
            x = F.relu(lin(x))
        deep_out = self.out(x).squeeze(1)

        # combine
        return fm1.sum(1) + fm2.sum(1) + deep_out + self.bias  # [B]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeepFM([n_users, n_items], emb_dim=8, hidden_dims=[32,32]).to(device)
opt   = torch.optim.Adam(model.parameters(), lr=1e-3)
crit  = nn.BCEWithLogitsLoss()




In [29]:
# ─── 7) TRAIN & VALIDATE ──────────────────────────────────────────────────────
def evaluate(loader):
    model.eval()
    tot, n = 0.0, 0
    with torch.no_grad():
        for Xi, Xv, y in loader:
            Xi, Xv, y = Xi.to(device), Xv.to(device), y.to(device)
            pred = model(Xi, Xv)
            tot += crit(pred, y).item()*y.size(0)
            n   += y.size(0)
    return tot/n

for ep in range(1,11):
    model.train()
    run, n = 0.0, 0
    for Xi, Xv, y in train_loader:
        Xi, Xv, y = Xi.to(device), Xv.to(device), y.to(device)
        opt.zero_grad()
        pred = model(Xi, Xv)
        loss = crit(pred, y)
        loss.backward()
        opt.step()
        run += loss.item()*y.size(0)
        n   += y.size(0)
    print(f"Epoch {ep} — train_loss: {run/n:.4f}, val_loss: {evaluate(val_loader):.4f}")

Epoch 1 — train_loss: 1.3902, val_loss: 1.3072
Epoch 2 — train_loss: 1.1958, val_loss: 1.1147
Epoch 3 — train_loss: 0.9918, val_loss: 0.9232
Epoch 4 — train_loss: 0.8429, val_loss: 0.8248
Epoch 5 — train_loss: 0.7856, val_loss: 0.7963
Epoch 6 — train_loss: 0.7622, val_loss: 0.7865
Epoch 7 — train_loss: 0.7430, val_loss: 0.7791
Epoch 8 — train_loss: 0.7263, val_loss: 0.7727
Epoch 9 — train_loss: 0.7110, val_loss: 0.7662
Epoch 10 — train_loss: 0.6960, val_loss: 0.7620


In [30]:
from sklearn.metrics import roc_auc_score, accuracy_score

def compute_metrics(loader):
    model.eval()
    ys, ps = [], []
    with torch.no_grad():
        for Xi, Xv, y in loader:
            Xi, Xv = Xi.to(device), Xv.to(device)
            logits = model(Xi, Xv)           # [B]
            probas = torch.sigmoid(logits)   # [B]
            ys.append(y.cpu().numpy())
            ps.append(probas.cpu().numpy())
    y_true = np.concatenate(ys)
    y_pred = np.concatenate(ps)
    auc  = roc_auc_score(y_true, y_pred)
    pred_labels = (y_pred >= 0.5).astype(int)
    acc  = accuracy_score(y_true, pred_labels)
    print(f" AUC={auc:.4f},  Acc={acc:.4f}")

# after training:
compute_metrics(train_loader)
compute_metrics(val_loader)

 AUC=0.5767,  Acc=0.7945
 AUC=0.5090,  Acc=0.7813


In [44]:
# 4) Build a lookup from ASIN → product_title
item2title = (
    train_df
      .drop_duplicates("asin")        # one title per ASIN
      .set_index("asin")["product_title"]
      .to_dict()
)

# 5) (re)define your recommend_and_compare
def recommend_and_compare(model, user_raw, k=5):
    # grab all ASINs this user has in *training* data
    seen_mask   = train_df["user_id"] == user_raw
    past_asins  = train_df.loc[seen_mask, "asin"].unique().tolist()
    # map to titles (use .get so we don’t KeyError if something’s missing)
    past_titles = [ item2title.get(a, "") for a in past_asins ]

    # get your top-k DeepFM recs
    recs = recommend_top_k(model, user_raw, k)

    return past_asins, past_titles, recs

# 6) Usage example
user = train_df.iloc[0]["user_id"]
past_asins, past_titles, recs = recommend_and_compare(model, user, k=5)

print(f"User {user} — past purchases:")
for a, t in zip(past_asins, past_titles):
    print(f"  {a}  →  {t}")

print("\nTop-5 DeepFM recommendations:")
for asin, title, score in recs:
    print(f"  {asin}  →  {title}  (p={score:.4f})")

KeyError: 'B001KW0CCI'