In [None]:
# import required modules
import random
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, Tensor

from torch_sparse import SparseTensor, matmul

from torch_geometric.utils import structured_negative_sampling
from torch_geometric.data import download_url, extract_zip
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj

In [None]:
# Relative locations of the two CSV files from the "ml-latest-small" dataset
# folder that the loaders below read.
movie_path, rating_path = (
    "../data/raw/ml-latest-small/movies.csv",
    "../data/raw/ml-latest-small/ratings.csv",
)

In [None]:
# load user and movie nodes
def load_node_csv(path, index_col):
    """Map each distinct value of `index_col` in a CSV file to a dense integer id.

    Args:
        path: location of the CSV file to read.
        index_col: column whose values identify the nodes; it is used as the
            DataFrame index when the file is parsed.

    Returns:
        dict mapping every unique raw id to a consecutive integer starting at
        0, numbered in order of first appearance in the file.
    """
    unique_ids = pd.read_csv(path, index_col=index_col).index.unique()
    return {raw_id: position for position, raw_id in enumerate(unique_ids)}


# Dense ids for users come from the ratings file; dense ids for movies come
# from the movies file.
user_mapping = load_node_csv(path=rating_path, index_col="userId")
movie_mapping = load_node_csv(path=movie_path, index_col="movieId")

In [None]:
# load edges between users and movies
def load_edge_csv(
    path,
    src_index_col,
    src_mapping,
    dst_index_col,
    dst_mapping,
    link_index_col,
    rating_threshold=4,
):
    """Build an edge index from the rows of a CSV whose rating meets a threshold.

    Args:
        path: location of the CSV file holding one interaction per row.
        src_index_col: column with the raw source (e.g. user) ids.
        src_mapping: dict translating raw source ids to dense integer ids.
        dst_index_col: column with the raw destination (e.g. movie) ids.
        dst_mapping: dict translating raw destination ids to dense integer ids.
        link_index_col: column with the rating used to decide whether a row
            becomes an edge.
        rating_threshold: a row is kept when its rating is >= this value.
            Fractional thresholds (e.g. 3.5) are honoured.

    Returns:
        torch.long tensor of shape (2, num_kept_rows); row 0 holds mapped
        source ids and row 1 mapped destination ids, in file order. The shape
        is (2, 0) when no row passes the threshold.

    Raises:
        KeyError: if any id in the file (kept or not) is missing from its
            mapping.
    """
    df = pd.read_csv(path)

    # Translate every row up front so an id missing from either mapping raises
    # KeyError even when that row would later be filtered out.
    src = torch.tensor(
        [src_mapping[index] for index in df[src_index_col]], dtype=torch.long
    )
    dst = torch.tensor(
        [dst_mapping[index] for index in df[dst_index_col]], dtype=torch.long
    )

    # Compare the raw rating values: casting to an integer type first would
    # truncate half-star ratings (3.5 -> 3) and give wrong results for
    # non-integer thresholds.
    keep = torch.from_numpy(df[link_index_col].to_numpy() >= rating_threshold)

    # Boolean-mask both rows at once; the explicit long dtype above keeps the
    # result a valid index tensor even when nothing is kept.
    return torch.stack([src[keep], dst[keep]], dim=0)


# Every rating of at least 4 becomes one user -> movie edge, expressed with the
# dense ids from user_mapping / movie_mapping.
edge_index = load_edge_csv(
    rating_path,
    "userId",
    user_mapping,
    "movieId",
    movie_mapping,
    "rating",
    rating_threshold=4,
)

In [None]:
# Node and edge counts used when sizing the splits and sparse matrices.
num_users = len(user_mapping)
num_movies = len(movie_mapping)
num_interactions = edge_index.shape[1]
all_indices = list(range(num_interactions))

# 80% of the edge positions go to training; the remaining 20% is then halved
# into validation and test. The fixed random_state keeps both splits
# repeatable.
train_indices, test_indices = train_test_split(
    all_indices, test_size=0.2, random_state=1
)
val_indices, test_indices = train_test_split(
    test_indices, test_size=0.5, random_state=1
)

# Select the edge columns that belong to each split.
train_edge_index, val_edge_index, test_edge_index = (
    edge_index[:, split_indices]
    for split_indices in (train_indices, val_indices, test_indices)
)

In [None]:
# Sparse adjacency for each split, all of size
# (num_users + num_movies) x (num_users + num_movies).
# NOTE(review): the column indices are the movie ids exactly as produced by
# load_edge_csv (they are not shifted by num_users) - confirm that the code
# consuming these tensors expects that layout.
train_sparse_edge_index, val_sparse_edge_index, test_sparse_edge_index = (
    SparseTensor(
        row=split_edge_index[0],
        col=split_edge_index[1],
        sparse_sizes=(num_users + num_movies,) * 2,
    )
    for split_edge_index in (train_edge_index, val_edge_index, test_edge_index)
)

# Random mini-batch sampling with negative samples


In [None]:
# function which random samples a mini-batch of positive and negative samples
def sample_mini_batch(batch_size, edge_index):
    """Draw a random mini-batch of (user, positive item, negative item) triples.

    Negative sampling is run over the whole `edge_index` on every call, and
    `batch_size` triples are then picked uniformly at random with replacement
    (so the same triple can appear more than once in a batch).

    Args:
        batch_size: number of triples to return.
        edge_index: edge index tensor whose columns are the positive
            interactions handed to `structured_negative_sampling`.

    Returns:
        Tuple `(user_indices, pos_item_indices, neg_item_indices)` of three 1-D
        tensors, each of length `batch_size`.
    """
    # Rows of the stacked tensor: 0 = users, 1 = positive items, 2 = negatives.
    triples = torch.stack(structured_negative_sampling(edge_index), dim=0)

    # random.choices samples with replacement from the column positions.
    picked = random.choices(range(triples[0].shape[0]), k=batch_size)

    user_indices, pos_item_indices, neg_item_indices = triples[:, picked]
    return user_indices, pos_item_indices, neg_item_indices