<a href="https://colab.research.google.com/github/mmvv11/recommender-colab/blob/main/2_MF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import time
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn

하이퍼파라미터

In [None]:
# Hyperparameters shared by the cells below.
# Fall back to the CPU when CUDA is not available so the notebook still runs
# on runtimes without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"  # compute device
n_neg = 4  # number of negative samples per positive interaction
data_path = "./ml-100k_splited.pkl"  # dataset path
batch_size = 1024  # training batch size
emb_size = 8  # MF embedding size
lr = 1e-3  # learning rate handed to the optimizer
top_k = 20  # length of the recommendation list scored during evaluation
n_epoch = 10  # number of training epochs

## 데이터 로딩

In [None]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point data_path at a file from a trusted source.
with open(data_path, mode="rb") as pkl_file:
    data = pickle.load(pkl_file)

# Positional unpacking: assumes `data` is a mapping whose values are stored in
# exactly this order — TODO confirm against the script that wrote the pickle.
(train, val, test, all_items,
 user2id, id2user, item2id, id2item) = data.values()

In [None]:
class MLDataset(Dataset):
    """Implicit-feedback training set of (user, item, label) triples.

    Every distinct (user, item) pair in ``df`` becomes a positive example
    (label 1) followed by ``n_neg`` randomly drawn negatives (label 0), i.e.
    items from ``all_items`` for which the user has no pair in ``df``.
    Negatives are sampled once, when the dataset is constructed.

    Args:
        df: DataFrame with ``user`` and ``item`` columns.
        all_items: 1-D sequence of item ids negatives are drawn from.
        n_neg: Number of negatives sampled per positive pair.
        device: Device the tensors are stored on. When ``None``, the
            module-level ``device`` variable is used if it exists,
            otherwise ``"cpu"``.
    """

    def __init__(self, df, all_items, n_neg=4, device=None):
        super().__init__()
        self.n_neg = n_neg
        # Keep the original behaviour (use the notebook-level `device`) when
        # the caller does not pick a device explicitly.
        self.device = device if device is not None else globals().get("device", "cpu")
        self.users, self.items, self.labels = self.get_data(df, all_items)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_data(self, df, all_items):
        """Build the user, item and label tensors.

        Users that already have a pair with every candidate item receive no
        negatives (there is nothing left to sample); their positives are
        still emitted.

        Returns:
            Tuple ``(users, items, labels)`` of equally long 1-D tensors on
            ``self.device``; ``labels`` is float32.
        """
        users, items, labels = [], [], []
        user_item_set = set(zip(df['user'], df['item']))

        # Convert once: np.random.choice would otherwise rebuild an array from
        # `all_items` on every single draw.
        candidates = np.asarray(all_items)
        candidate_set = set(candidates.tolist())

        # Per user, count the candidates that are already positives, so that a
        # user with no free candidate is detected up front instead of spinning
        # forever in the rejection loop below.
        n_taken = {}
        for u, i in user_item_set:
            if i in candidate_set:
                n_taken[u] = n_taken.get(u, 0) + 1

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            if n_taken.get(u, 0) >= len(candidate_set):
                continue  # no unseen item exists for this user
            for _ in range(self.n_neg):
                # Rejection sampling: redraw until the item is unseen by `u`.
                neg_item = np.random.choice(candidates)
                while (u, neg_item) in user_item_set:
                    neg_item = np.random.choice(candidates)
                users.append(u)
                items.append(neg_item)
                labels.append(0)

        return (
            torch.tensor(users).to(self.device),
            torch.tensor(items).to(self.device),
            torch.tensor(labels, dtype=torch.float32).to(self.device),
        )

In [None]:
# Pass the n_neg hyperparameter through explicitly; relying on the class
# default would silently ignore any change made in the hyperparameter cell.
train_dataset = MLDataset(train, all_items, n_neg=n_neg)
# Reshuffle the (positive + negative) examples at every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# user id -> list of item ids, one lookup table per split.
# user_consumed is used to exclude already-seen items at evaluation time;
# val_true / test_true hold the ground-truth items for each user.
user_consumed = train.groupby("user")["item"].agg(list).to_dict()
val_true = val.groupby("user")["item"].agg(list).to_dict()
test_true = test.groupby("user")["item"].agg(list).to_dict()

# MF 모델링

In [None]:
class MF(nn.Module):
    """Matrix factorisation: the score of a (user, item) pair is the dot
    product of their embedding vectors.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        emb_size: Dimension of both embedding tables.
    """

    def __init__(self, n_users, n_items, emb_size):
        super().__init__()
        self.emb_user = nn.Embedding(num_embeddings=n_users, embedding_dim=emb_size)
        self.emb_item = nn.Embedding(num_embeddings=n_items, embedding_dim=emb_size)
        self._init_weight()

    def _init_weight(self):
        """Re-initialise both embedding tables with Xavier-uniform values."""
        for table in (self.emb_user, self.emb_item):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, user, item):
        """Return one unnormalised score per (user, item) index pair.

        The element-wise product of the looked-up embeddings is summed over
        dim 1, so ``user`` and ``item`` are expected to be 1-D index tensors
        of equal length.
        """
        user_vec = self.emb_user(user)
        item_vec = self.emb_item(item)
        return torch.sum(user_vec * item_vec, dim=1)

모델, 손실 함수, 옵티마이저 정의

In [None]:
# Embedding table sizes: one row per entry of the id mappings.
n_users = len(user2id)
n_items = len(item2id)

In [None]:
# Build the model and place its parameters on the configured device.
model = MF(n_users, n_items, emb_size).to(device)

# The model outputs raw scores, so the sigmoid is folded into the loss.
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr)

메트릭
* precision
* recall
* nDCG

In [None]:
def get_precision(pred, true, k=20):
    """Return precision@k: the share of the top-``k`` predictions that are relevant.

    Args:
        pred: Recommended item ids, best first.
        true: Relevant (ground-truth) item ids.
        k: Cut-off; must be positive. Only the first ``k`` entries of
            ``pred`` are scored, so the result never exceeds 1.

    Returns:
        Number of distinct relevant items among the top ``k``, divided by ``k``.
    """
    # Truncate to k so that a longer prediction list cannot inflate the score.
    hits = set(list(pred)[:k]).intersection(true)
    return len(hits) / k

def get_recall(pred, true, k=20):
    """Return recall@k: the share of relevant items found in the top ``k``.

    Args:
        pred: Recommended item ids, best first.
        true: Relevant (ground-truth) item ids.
        k: Cut-off; only the first ``k`` entries of ``pred`` are scored.

    Returns:
        Number of distinct relevant items among the top ``k``, divided by
        the number of distinct relevant items; 0.0 when ``true`` is empty.
    """
    relevant = set(true)
    if not relevant:
        # Nothing to retrieve; avoid dividing by zero.
        return 0.0
    hits = relevant.intersection(list(pred)[:k])
    return len(hits) / len(relevant)

def get_nDCG(pred, true, k=20):
    """Return nDCG@k with binary relevance.

    DCG sums ``1 / log2(rank + 1)`` over the (1-based) ranks of the relevant
    items within the first ``k`` predictions. It is normalised by the ideal
    DCG, i.e. the DCG of a list that places ``min(len(true), k)`` relevant
    items at the very top. Normalising by the number of hits instead would
    report 1.0 for any list whose hits happen to be ranked first, however
    many relevant items were missed.

    Args:
        pred: Recommended item ids, best first.
        true: Relevant (ground-truth) item ids.
        k: Cut-off; only the first ``k`` entries of ``pred`` are scored.

    Returns:
        A float in [0, 1]; 0.0 when there is no hit, ``true`` is empty or
        ``k`` is not positive.
    """
    relevant = set(true)
    if k <= 0 or not relevant:
        return 0.0

    # discounts[r] is the weight of the item at 0-based rank r.
    discounts = 1.0 / np.log2(np.arange(2, k + 2))

    dcg = 0.0
    credited = set()  # a relevant item is rewarded once, even if repeated
    for rank, item in enumerate(list(pred)[:k]):
        if item in relevant and item not in credited:
            credited.add(item)
            dcg += discounts[rank]
    if not credited:
        return 0.0

    idcg = discounts[:min(len(relevant), k)].sum()
    return float(dcg / idcg)

학습 및 검증/테스트 평가 실행

In [None]:
def evaluate(model, true_by_user, desc):
    """Score the model's top-k recommendations against held-out items.

    For every user in ``true_by_user`` all items the user has not consumed in
    the training split are scored, the ``top_k`` best are kept, and
    recall / precision / nDCG are computed against the user's held-out items.

    Args:
        model: Callable taking (user index tensor, item index tensor) and
            returning one score per pair.
        true_by_user: Mapping of user id to that user's held-out item ids.
        desc: Label shown on the progress bar.

    Returns:
        Tuple ``(recall, precision, ndcg)``, each averaged over the users
        that were scored.
    """
    model.eval()
    item_pool = set(all_items)  # built once instead of once per user
    recalls, precisions, ndcgs = [], [], []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for u, true in tqdm(true_by_user.items(), desc=desc):
            # Items the user has not consumed; users missing from the
            # training split are treated as having consumed nothing.
            unconsumed_items = list(item_pool - set(user_consumed.get(u, ())))
            if not unconsumed_items:
                continue  # nothing left to recommend to this user
            unconsumed_items = torch.tensor(unconsumed_items).to(device)
            uu = torch.full_like(unconsumed_items, int(u))

            # Inference; topk cannot return more items than there are candidates.
            pred = model(uu, unconsumed_items)
            _, pred_idx = torch.topk(pred, k=min(top_k, len(unconsumed_items)))
            top_k_items = unconsumed_items[pred_idx].tolist()

            # Metrics
            recalls.append(get_recall(top_k_items, true, k=top_k))
            precisions.append(get_precision(top_k_items, true, k=top_k))
            ndcgs.append(get_nDCG(top_k_items, true, k=top_k))

    return np.mean(recalls), np.mean(precisions), np.mean(ndcgs)


for epoch in range(1, n_epoch+1):
    model.train()
    total_loss = 0.0
    for batch_data in tqdm(train_loader, desc="train"):
        users, items, labels = (t.to(device) for t in batch_data)
        pred = model(users, items)
        loss = loss_function(pred, labels)
        # .item() detaches the value; summing the tensors themselves would
        # keep every batch's autograd graph referenced until the epoch ends.
        total_loss += loss.item()

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"epoch: {epoch}, total_loss: {total_loss:.4f}")

    # validation
    recall, precision, ndcg = evaluate(model, val_true, desc="eval")
    print(f"recall:{recall:.4f}\nprecision:{precision:.4f}\nndcg:{ndcg:.4f}\n\n")

# test
recall, precision, ndcg = evaluate(model, test_true, desc="test")
print(f"\nrecall:{recall:.4f}\nprecision:{precision:.4f}\nndcg:{ndcg:.4f}\n\n")

train: 100%|██████████| 391/391 [00:09<00:00, 42.80it/s]


epoch: 1, total_loss: 266.2305


eval: 100%|██████████| 943/943 [00:01<00:00, 535.62it/s]


recall:0.1545
precision:0.0800
ndcg:0.3325




train: 100%|██████████| 391/391 [00:06<00:00, 64.46it/s]


epoch: 2, total_loss: 190.3934


eval: 100%|██████████| 943/943 [00:01<00:00, 584.18it/s]


recall:0.1597
precision:0.0822
ndcg:0.3431




train: 100%|██████████| 391/391 [00:07<00:00, 55.54it/s]


epoch: 3, total_loss: 151.4196


eval: 100%|██████████| 943/943 [00:01<00:00, 590.60it/s]


recall:0.1625
precision:0.0824
ndcg:0.3454




train: 100%|██████████| 391/391 [00:06<00:00, 57.16it/s]


epoch: 4, total_loss: 144.8143


eval: 100%|██████████| 943/943 [00:01<00:00, 596.01it/s]


recall:0.1695
precision:0.0836
ndcg:0.3482




train: 100%|██████████| 391/391 [00:06<00:00, 58.48it/s]


epoch: 5, total_loss: 142.6620


eval: 100%|██████████| 943/943 [00:01<00:00, 550.13it/s]


recall:0.1696
precision:0.0843
ndcg:0.3486




train: 100%|██████████| 391/391 [00:07<00:00, 55.00it/s]


epoch: 6, total_loss: 141.5514


eval: 100%|██████████| 943/943 [00:01<00:00, 568.53it/s]


recall:0.1730
precision:0.0840
ndcg:0.3491




train: 100%|██████████| 391/391 [00:07<00:00, 54.70it/s]


epoch: 7, total_loss: 140.8562


eval: 100%|██████████| 943/943 [00:01<00:00, 596.48it/s]


recall:0.1727
precision:0.0846
ndcg:0.3487




train: 100%|██████████| 391/391 [00:05<00:00, 65.31it/s]


epoch: 8, total_loss: 140.3388


eval: 100%|██████████| 943/943 [00:01<00:00, 488.54it/s]


recall:0.1743
precision:0.0843
ndcg:0.3517




train: 100%|██████████| 391/391 [00:07<00:00, 55.59it/s]


epoch: 9, total_loss: 139.8463


eval: 100%|██████████| 943/943 [00:01<00:00, 592.64it/s]


recall:0.1745
precision:0.0850
ndcg:0.3516




train: 100%|██████████| 391/391 [00:06<00:00, 55.89it/s]


epoch: 10, total_loss: 139.2839


eval: 100%|██████████| 943/943 [00:01<00:00, 587.87it/s]


recall:0.1783
precision:0.0865
ndcg:0.3569




test: 100%|██████████| 943/943 [00:01<00:00, 585.80it/s]


recall:0.1686
precision:0.0817
ndcg:0.3438





