In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [2]:
train = pd.read_csv("../data/processed/train.csv")
test = pd.read_csv("../data/processed/test.csv")

In [3]:
user_ids = train.user_id.unique()
item_ids = train.item_id.unique()

In [4]:
user2idx = {u: i for i, u in enumerate(user_ids)}
item2idx = {i: j for j, i in enumerate(item_ids)}
idx2item = {j: i for i, j in item2idx.items()}

In [5]:
class InteractionDataset(Dataset):
    def __init__(self, df, num_items, neg_ratio=1):
        self.user_item = [(user2idx[u], item2idx[i]) 
                          for u, i in zip(df.user_id, df.item_id)]
        self.num_items = num_items
        self.neg_ratio = neg_ratio

    def __len__(self):
        return len(self.user_item) * (1 + self.neg_ratio)

    def __getitem__(self, idx):
        pos_idx = idx // (self.neg_ratio + 1)
        u, i = self.user_item[pos_idx]

        if idx % (self.neg_ratio + 1) == 0:
            return u, i, 1.0  # positive
        else:
            neg_item = np.random.randint(self.num_items)
            return u, neg_item, 0.0

In [6]:
class NeuralCF(nn.Module):
    def __init__(self, num_users, num_items, emb_dim=64):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)

    def forward(self, users, items):
        u = self.user_emb(users)
        i = self.item_emb(items)
        return (u * i).sum(dim=1)

In [7]:
dataset = InteractionDataset(train, len(item2idx))
loader = DataLoader(dataset, batch_size=1024, shuffle=True)

In [8]:
model = NeuralCF(len(user2idx), len(item2idx), emb_dim=64)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [9]:
# criterion = nn.BCELoss()

for epoch in range(10):
    total_loss = 0
    for users, items, labels in loader:
        scores = model(users, items)
        preds = torch.sigmoid(scores)

        loss = criterion(preds, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")


Epoch 1 Loss: 0.7633
Epoch 2 Loss: 0.7542
Epoch 3 Loss: 0.7257
Epoch 4 Loss: 0.7544
Epoch 5 Loss: 0.7456
Epoch 6 Loss: 0.7384
Epoch 7 Loss: 0.7353
Epoch 8 Loss: 0.7444
Epoch 9 Loss: 0.7439
Epoch 10 Loss: 0.7385


In [10]:
def recommend_embedding(user_id, k=10):
    if user_id not in user2idx:
        return []

    u_idx = user2idx[user_id]
    user_vec = model.user_emb.weight[u_idx]

    scores = torch.matmul(model.item_emb.weight, user_vec)
    ranked_items = torch.argsort(scores, descending=True)

    seen_items = set(train[train.user_id == user_id].item_id)

    recs = [
        idx2item[i.item()]
        for i in ranked_items
        if idx2item[i.item()] not in seen_items
    ]

    return recs[:k]


In [11]:
def recall_at_k(test_df, recommend_fn, k=10):
    """
    test_df: pandas DataFrame with columns [user_id, item_id]
    recommend_fn: function(user_id) -> list of recommended item_ids
    k: cutoff
    """
    recalls = []

    for user_id in test_df.user_id.unique():
        true_items = set(
            test_df[test_df.user_id == user_id].item_id
        )

        if len(true_items) == 0:
            continue

        recs = recommend_fn(user_id)

        if recs is None or len(recs) == 0:
            continue

        recs_k = set(recs[:k])
        recall = len(true_items & recs_k) / len(true_items)
        recalls.append(recall)

    return sum(recalls) / len(recalls) if len(recalls) > 0 else 0.0


In [17]:
common_users = set(train.user_id) & set(test.user_id)
test_emb = test[test.user_id.isin(common_users)]

recall_emb = recall_at_k(
    test_emb,
    lambda u: recommend_embedding(u, k=10),
    k=10
)

In [18]:
recall_emb

0.0

In [19]:
train_items = set(train.item_id.unique())
test_items = set(test_emb.item_id.unique())

len(test_items - train_items)

18

In [20]:
test_emb = test_emb[test_emb.item_id.isin(train_items)]

In [21]:
recall_emb = recall_at_k(
    test_emb,
    lambda u: recommend_embedding(u, k=10),
    k=10
)

recall_emb


0.0

In [22]:
u = test_emb.user_id.iloc[0]

print("True:", test_emb[test_emb.user_id == u].item_id.tolist())
print("Recs:", recommend_embedding(u, k=10))

True: ['B00000J1EP', 'B00004Z5M1']
Recs: ['B00000JBPB', 'B00000J061', 'B00000JYVT', 'B00004THCY', 'B00004NKLM', 'B00004WZOG', 'B00004TBBA', 'B0000222MY', 'B00003ETSJ', 'B00000K3RI']


In [23]:
u = test_emb.user_id.iloc[0]

recs = recommend_embedding(u, k=50)
true_items = set(test_emb[test_emb.user_id == u].item_id)

print("Num recs:", len(recs))
print("True items:", list(true_items)[:5])
print("First 10 recs:", recs[:10])
print("Overlap:", set(recs) & true_items)


Num recs: 50
True items: ['B00000J1EP', 'B00004Z5M1']
First 10 recs: ['B00000JBPB', 'B00000J061', 'B00000JYVT', 'B00004THCY', 'B00004NKLM', 'B00004WZOG', 'B00004TBBA', 'B0000222MY', 'B00003ETSJ', 'B00000K3RI']
Overlap: set()
