<a href="https://colab.research.google.com/github/mmvv11/recommender-colab/blob/main/4_LightGCN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import time
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from scipy import sparse as ssp

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn

하이퍼파라미터

In [None]:
# Hyperparameters shared by the cells below.
# Fall back to the CPU when no CUDA device is available so the notebook still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"  # device for tensors and the model
n_neg = 4  # number of negative samples per positive interaction
n_layers = 2  # number of GNN layers
dropout = 0.0  # dropout rate
data_path = "./ml-100k_splited.pkl"  # dataset path
batch_size = 1024  # training batch size
emb_size = 8  # embedding size
lr = 1e-3  # learning rate handed to the optimizer
top_k = 20  # number of items ranked per user when computing metrics
n_epoch = 10  # number of training epochs

## 데이터 로딩

In [None]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only open
# dataset files that come from a trusted source.
with open(data_path, mode="rb") as f:
    data = pickle.load(f)

# The unpacking below depends on the order in which `data` yields its values
# (assumed to match these eight names -- confirm against the script that wrote the file).
(
    train, val, test,
    all_items,
    user2id, id2user,
    item2id, id2item,
) = data.values()

In [None]:
class MLDataset(Dataset):
    """(user, positive item, negative item) triples for pairwise (BPR) training.

    Every distinct (user, item) pair in ``df`` is expanded into ``n_neg``
    triples, each with its own negative item sampled for *that* user, so the
    three tensors are aligned and ``len(dataset) == n_pairs * n_neg``.

    Args:
        df: DataFrame with ``user`` and ``item`` columns of observed interactions.
        all_items: Sequence of candidate item ids to draw negatives from.
        n_neg: Number of negatives sampled per positive pair.
        tensor_device: Device on which the tensors are created. When ``None``
            the module-level ``device`` setting is used.

    Attributes:
        users: User id of each triple.
        items: Positive item id of each triple.
        labels: Negative item id of each triple (name kept for compatibility).
    """

    def __init__(self, df, all_items, n_neg=4, tensor_device=None):
        super().__init__()
        self.n_neg = n_neg
        # Only look up the global ``device`` when the caller did not choose one.
        self.tensor_device = device if tensor_device is None else tensor_device
        self.users, self.items, self.labels = self.get_data(df, all_items)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, positive item, negative item)`` triple at ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_data(self, df, all_items):
        """Build aligned user / positive / negative tensors.

        Returns:
            Three 1-D ``torch.long`` tensors of equal length.

        Raises:
            ValueError: If negatives are requested for a user whose positives
                cover every candidate item (sampling could never terminate).
        """
        # Convert once: np.random.choice on a Python list re-converts it per call.
        candidates = np.asarray(all_items)
        candidate_set = set(candidates.tolist())
        user_item_set = set(zip(df['user'], df['item']))

        positives_by_user = {}
        for u, i in user_item_set:
            positives_by_user.setdefault(u, set()).add(i)

        if self.n_neg > 0:
            exhausted = [u for u, pos in positives_by_user.items() if candidate_set <= pos]
            if exhausted:
                raise ValueError(
                    f"cannot sample negatives: user(s) {exhausted[:5]} interacted with every candidate item"
                )

        users, pos_items, neg_items = [], [], []
        for u, i in user_item_set:
            for _ in range(self.n_neg):
                # Rejection sampling: redraw until the item is not a positive of u.
                neg_item = candidates[np.random.randint(len(candidates))]
                while (u, neg_item) in user_item_set:
                    neg_item = candidates[np.random.randint(len(candidates))]
                # Repeat the (user, positive) pair for every negative so that
                # index idx in all three lists describes the same triple.
                users.append(u)
                pos_items.append(i)
                neg_items.append(neg_item)

        def to_tensor(values):
            return torch.tensor(values, dtype=torch.long, device=self.tensor_device)

        return to_tensor(users), to_tensor(pos_items), to_tensor(neg_items)

In [None]:
# Pass the n_neg hyperparameter explicitly; relying on the constructor default
# would silently ignore any change made to the setting.
train_dataset = MLDataset(train, all_items, n_neg=n_neg)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# For each split, map every user id to the list of item ids it has in that split.
user_consumed, val_true, test_true = (
    split.groupby("user")['item'].apply(list).to_dict()
    for split in (train, val, test)
)

# 모델링

In [None]:
class LightGCN(nn.Module):
    """LightGCN: embeddings propagated over the normalized user-item graph.

    The graph has ``n_users + n_items`` nodes (users first, then items). Its
    symmetrically normalized adjacency ``D^(-1/2) A D^(-1/2)`` is built once
    and registered as a non-persistent buffer, so it follows ``model.to(...)``
    without adding a key to ``state_dict()``.

    Args:
        n_users: Number of users (rows of the user embedding table).
        n_items: Number of items (rows of the item embedding table).
        emb_size: Embedding dimension.
        n_layers: Number of propagation layers.
        user_consumed: Mapping (or sequence) from user index to the item
            indices the user interacted with. Users without an entry are
            treated as isolated nodes.
    """

    def __init__(self, n_users, n_items, emb_size, n_layers, user_consumed):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.emb_user = nn.Embedding(n_users, emb_size)
        self.emb_item = nn.Embedding(n_items, emb_size)
        self.n_layers = n_layers
        self.user_consumed = user_consumed
        self.register_buffer(
            "laplacian_matrix", self._build_laplacian_matrix(), persistent=False
        )
        self._init_weight()

    def _init_weight(self):
        """Initialize both embedding tables from N(0, 0.01^2)."""
        nn.init.normal_(self.emb_user.weight, std=1e-2)
        nn.init.normal_(self.emb_item.weight, std=1e-2)

    def _consumed_items(self, u):
        """Return the distinct items of user ``u`` (empty when ``u`` has none)."""
        try:
            return set(self.user_consumed[u])
        except (KeyError, IndexError):
            return set()

    def _build_laplacian_matrix(self):
        """Build ``D^(-1/2) A D^(-1/2)`` as a float32 sparse COO tensor on the CPU.

        ``A`` is the symmetric bipartite adjacency ``[[0, R], [R^T, 0]]`` where
        ``R[u, i] = 1`` if user ``u`` consumed item ``i``.
        """
        user_idx, item_idx = [], []
        for u in range(self.n_users):
            items = self._consumed_items(u)
            user_idx.extend([u] * len(items))
            item_idx.extend(items)
        user_idx = np.asarray(user_idx, dtype=np.int64)
        # Item nodes are placed after the user nodes in the joint graph.
        item_idx = np.asarray(item_idx, dtype=np.int64) + self.n_users

        n_nodes = self.n_users + self.n_items
        rows = np.concatenate([user_idx, item_idx])
        cols = np.concatenate([item_idx, user_idx])
        adj_matrix = ssp.coo_matrix(
            (np.ones(len(rows), dtype=np.float32), (rows, cols)),
            shape=(n_nodes, n_nodes),
        ).tocsr()

        # The row sum of the adjacency is the degree of each user/item node.
        degree = np.asarray(adj_matrix.sum(axis=1)).ravel()
        # Isolated nodes keep a zero scale; raising 0 to -0.5 would yield inf.
        diag_inv = np.zeros_like(degree, dtype=np.float64)
        connected = degree > 0
        diag_inv[connected] = np.power(degree[connected], -0.5)
        diag_matrix_inv = ssp.diags(diag_inv)  # D^(-1/2)

        coo = diag_matrix_inv.dot(adj_matrix).dot(diag_matrix_inv).tocoo()  # D^(-1/2) * A * D^(-1/2)
        indices = torch.from_numpy(np.vstack([coo.row, coo.col]).astype(np.int64))
        values = torch.from_numpy(coo.data.astype(np.float32))
        return torch.sparse_coo_tensor(
            indices, values, coo.shape, dtype=torch.float32
        ).coalesce()

    def emb_propagation(self):
        """Propagate embeddings and average the outputs of all layers.

        Returns:
            ``(user_embeddings, item_embeddings)`` with ``n_users`` and
            ``n_items`` rows: the mean of the initial embeddings and the
            output of each of the ``self.n_layers`` propagation steps.
        """
        all_emb = [
            torch.cat(
                [self.emb_user.weight, self.emb_item.weight], dim=0
            )
        ]

        # Use the instance's layer count, not a module-level variable.
        for _ in range(self.n_layers):
            layered_emb = torch.sparse.mm(self.laplacian_matrix, all_emb[-1])
            all_emb.append(layered_emb)

        all_emb = torch.stack(all_emb, dim=1)
        mean_emb = torch.mean(all_emb, dim=1)

        layered_emb_user, layered_emb_item = torch.split(mean_emb, [self.n_users, self.n_items])
        return layered_emb_user, layered_emb_item

    def forward(self, users, pos_items, neg_items=None):
        """Look up propagated embeddings for the given indices.

        Args:
            users: Index tensor of users.
            pos_items: Index tensor of (positive) items.
            neg_items: Optional index tensor of negative items.

        Returns:
            ``(user_emb, pos_item_emb, neg_item_emb)``; the last element is
            ``None`` when ``neg_items`` is ``None``.
        """
        # Plural names hold the full tables, singular names the selected rows.
        layered_emb_users, layered_emb_items = self.emb_propagation()
        layered_emb_user = layered_emb_users[users]
        layered_pos_emb = layered_emb_items[pos_items]
        layered_neg_emb = layered_emb_items[neg_items] if neg_items is not None else None

        return layered_emb_user, layered_pos_emb, layered_neg_emb

모델, 손실 함수, 옵티마이저 정의

In [None]:
import torch.nn.functional as F

def bpr_loss(layered_emb_user, layered_pos_emb, layered_neg_emb):
    """Bayesian Personalized Ranking loss: mean of -log(sigmoid(pos - neg)).

    Scores are row-wise dot products between the user embeddings and the
    positive / negative item embeddings. No regularization term is included.
    """
    score_pos = torch.sum(layered_emb_user * layered_pos_emb, dim=1)
    score_neg = torch.sum(layered_emb_user * layered_neg_emb, dim=1)
    margin = score_pos - score_neg
    return -F.logsigmoid(margin).mean()

In [None]:
# Number of distinct users / items, taken from the sizes of the id mappings.
n_users = len(user2id)
n_items = len(item2id)

In [None]:
# Build the model, move it to the target device, and optimize all of its parameters with Adam.
model = LightGCN(
    n_users=n_users,
    n_items=n_items,
    emb_size=emb_size,
    n_layers=n_layers,
    user_consumed=user_consumed,
).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=lr)

  diag_inv = np.power(row_sum, -0.5).flatten()


메트릭
* precision
* recall
* nDCG

In [None]:
def get_precision(pred, true, k=20):
    """Precision@k: number of distinct recommended items that are relevant, divided by k."""
    hits = set(true) & set(pred)
    return len(hits) / k

def get_recall(pred, true, k=20):
    """Recall: fraction of the relevant items that appear in the recommendations.

    Args:
        pred: Recommended item ids.
        true: Relevant (ground-truth) item ids.
        k: Unused; kept so the metric functions share one signature.

    Returns:
        ``len(set(pred) & set(true)) / len(true)``, or ``0.0`` when ``true``
        is empty (instead of raising ``ZeroDivisionError``).
    """
    if len(true) == 0:
        return 0.0
    intersection = set(pred).intersection(set(true))
    return len(intersection) / len(true)

def get_nDCG(pred, true, k=20):
    """nDCG@k with binary relevance.

    DCG sums ``1 / log2(rank + 1)`` over the ranks (1-based) of the first
    ``k`` recommendations that are relevant. It is normalized by the DCG of
    an ideal ranking, i.e. one that places ``min(len(set(true)), k)``
    relevant items at the top. Normalizing by the hits alone would ignore
    relevant items that were missed and overstate the score.

    Args:
        pred: Recommended item ids, best first; only the first ``k`` are used.
        true: Relevant (ground-truth) item ids.
        k: Cut-off rank.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when there is no hit, no relevant item,
        or ``k <= 0``.
    """
    if k <= 0:
        return 0.0
    ranked = list(pred)[:k]
    relevant = set(true)
    if not ranked or not relevant:
        return 0.0

    discounts = 1.0 / np.log2(np.arange(2, k + 2))
    gains = np.array([1.0 if item in relevant else 0.0 for item in ranked])
    dcg = float(np.sum(gains * discounts[:len(ranked)]))
    if dcg == 0.0:
        return 0.0
    idcg = float(np.sum(discounts[:min(len(relevant), k)]))
    return dcg / idcg

train process 정의

In [None]:
def evaluate_top_k(true_by_user, desc):
    """Rank each user's unconsumed items and report recall / precision / nDCG at ``top_k``.

    Args:
        true_by_user: Mapping from user id to the list of held-out item ids.
        desc: Label shown on the progress bar.

    Returns:
        ``(recall, precision, ndcg)`` as NumPy arrays with one entry per user.
    """
    model.eval()
    recall, precision, ndcg = [], [], []
    item_pool = set(all_items)
    # No gradients are needed for scoring, and the propagated embeddings are
    # identical for every user, so compute them once instead of once per user.
    with torch.no_grad():
        emb_users, emb_items = model.emb_propagation()
        for u, true in tqdm(true_by_user.items(), desc=desc):
            # Items the user has not consumed in the training split.
            unconsumed_items = torch.tensor(
                list(item_pool - set(user_consumed.get(u, ()))),
                dtype=torch.long,
                device=device,
            )

            # Inference: dot-product score of the user against each candidate.
            pred = (emb_users[u] * emb_items[unconsumed_items]).sum(dim=-1)
            # Guard against users with fewer than top_k candidates left.
            _, pred_idx = torch.topk(pred, k=min(top_k, pred.numel()))
            top_k_items = unconsumed_items[pred_idx].tolist()

            # Metrics
            recall.append(get_recall(top_k_items, true, k=top_k))
            precision.append(get_precision(top_k_items, true, k=top_k))
            ndcg.append(get_nDCG(top_k_items, true, k=top_k))
    print(f"recall:{np.mean(recall):.4f}\nprecision:{np.mean(precision):.4f}\nndcg:{np.mean(ndcg):.4f}")
    return np.array(recall), np.array(precision), np.array(ndcg)


for epoch in range(1, n_epoch+1):
    model.train()
    total_loss = []  # per-batch losses of this epoch
    for i, batch_data in enumerate(tqdm(train_loader, desc="train")):
        users, pos_items, neg_items = batch_data
        layered_emb_user, layered_pos_emb, layered_neg_emb = model(users, pos_items, neg_items)
        loss = bpr_loss(layered_emb_user, layered_pos_emb, layered_neg_emb)
        total_loss.append(loss.item())

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"epoch: {epoch}, total_loss: {np.mean(total_loss):.4f}")

    # validation
    recall, precision, ndcg = evaluate_top_k(val_true, desc="eval")

# test
recall, precision, ndcg = evaluate_top_k(test_true, desc="test")

train: 100%|██████████| 79/79 [00:02<00:00, 27.27it/s]


epoch: 1, total_loss: 0.6882


eval: 100%|██████████| 943/943 [00:02<00:00, 333.32it/s]


recall:0.1896
precision:0.0747
ndcg:0.3455


train: 100%|██████████| 79/79 [00:01<00:00, 58.39it/s]


epoch: 2, total_loss: 0.6439


eval: 100%|██████████| 943/943 [00:03<00:00, 288.34it/s]


recall:0.1894
precision:0.0742
ndcg:0.3440


train: 100%|██████████| 79/79 [00:01<00:00, 59.36it/s]


epoch: 3, total_loss: 0.5627


eval: 100%|██████████| 943/943 [00:02<00:00, 358.79it/s]


recall:0.1896
precision:0.0744
ndcg:0.3439


train: 100%|██████████| 79/79 [00:01<00:00, 59.98it/s]


epoch: 4, total_loss: 0.4838


eval: 100%|██████████| 943/943 [00:02<00:00, 368.43it/s]


recall:0.1884
precision:0.0745
ndcg:0.3451


train: 100%|██████████| 79/79 [00:01<00:00, 59.18it/s]


epoch: 5, total_loss: 0.4301


eval: 100%|██████████| 943/943 [00:03<00:00, 282.35it/s]


recall:0.1886
precision:0.0751
ndcg:0.3468


train: 100%|██████████| 79/79 [00:01<00:00, 60.15it/s]


epoch: 6, total_loss: 0.3978


eval: 100%|██████████| 943/943 [00:02<00:00, 361.42it/s]


recall:0.1893
precision:0.0758
ndcg:0.3468


train: 100%|██████████| 79/79 [00:01<00:00, 60.51it/s]


epoch: 7, total_loss: 0.3797


eval: 100%|██████████| 943/943 [00:02<00:00, 365.93it/s]


recall:0.1884
precision:0.0764
ndcg:0.3465


train: 100%|██████████| 79/79 [00:01<00:00, 56.55it/s]


epoch: 8, total_loss: 0.3688


eval: 100%|██████████| 943/943 [00:03<00:00, 290.05it/s]


recall:0.1894
precision:0.0770
ndcg:0.3475


train: 100%|██████████| 79/79 [00:01<00:00, 58.27it/s]


epoch: 9, total_loss: 0.3624


eval: 100%|██████████| 943/943 [00:02<00:00, 366.42it/s]


recall:0.1892
precision:0.0778
ndcg:0.3491


train: 100%|██████████| 79/79 [00:01<00:00, 58.63it/s]


epoch: 10, total_loss: 0.3581


eval: 100%|██████████| 943/943 [00:02<00:00, 365.33it/s]


recall:0.1894
precision:0.0784
ndcg:0.3496


test: 100%|██████████| 943/943 [00:03<00:00, 247.18it/s]

recall:0.1795
precision:0.0753
ndcg:0.3454



