In [1]:
import ast

import numpy as np
import pandas as pd
import scipy.sparse as sparse
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# Load the precomputed sparse matrices saved as .npz by the preprocessing step.
# Both are later row-indexed with recipe ids (see to_sparse_batch), so their rows
# are presumably aligned with the recipe index -- TODO confirm against preprocessing.
preparations_tfidf = sparse.load_npz("../Preprocessing/objects/preparations_tfidf.npz")
tags_matrix = sparse.load_npz("../Preprocessing/objects/tags_matrix.npz")

In [3]:
# Inspect the (row indices, column indices) of the non-zero entries of the tags matrix.
tags_matrix.nonzero()

(array([     0,      0,      0, ..., 231636, 231636, 231636], dtype=int32),
 array([  4,   6,   7, ..., 378, 463, 528], dtype=int32))

In [None]:
import scipy

# Batch configuration shared by the DataLoader and the sparse helpers below.
device = "cpu"
batch_size = 4

# Buffers built once and reused for every batch by the one-hot helper:
# `ones` supplies the non-zero values, `arange` the row index of each sample.
ones = torch.ones(batch_size, device=device, dtype=torch.float32)
arange = torch.arange(batch_size, device=device)


class FoodRatingsDataset(Dataset):
    """Map-style dataset yielding one user/recipe interaction per index.

    Each item combines a row of the interactions table with the matching row
    of the recipes table, looked up *positionally* by ``recipe_id``.

    Args:
        interactions_file: CSV path with at least ``user_id`` and ``recipe_id``
            columns (plus ``rating`` when ``has_rating_column`` is True).
        recipes_file: CSV path with ``minutes``, ``nutrition``, ``n_steps`` and
            ``n_ingredients`` columns; row position ``i`` must describe the
            recipe whose id is ``i``.
        n_users: Number of distinct users (stored, not used by the dataset).
        n_items: Number of distinct recipes (stored, not used by the dataset).
        has_rating_column: When True, each item also carries the rating.
    """

    def __init__(
        self,
        interactions_file,
        recipes_file,
        n_users,
        n_items,
        has_rating_column=False,
    ):
        self.interactions = pd.read_csv(interactions_file)
        self.recipes = pd.read_csv(recipes_file)

        self.n_users = n_users
        self.n_items = n_items

        self.has_rating_column = has_rating_column

    def __len__(self):
        """Return the number of interactions."""
        return len(self.interactions)

    def __getitem__(self, idx):
        """Return the features of interaction ``idx``.

        Returns:
            ``(user_id, recipe_id, minutes, nutrition, n_steps, n_ingredients)``
            where the ids are Python ints and the rest are 1-D numpy arrays;
            a trailing 1-element ``rating`` array is appended when
            ``has_rating_column`` is True.

        Raises:
            ValueError / SyntaxError: if the ``nutrition`` cell is not a plain
                Python literal.
        """
        row = self.interactions.iloc[idx]

        # A row Series is upcast to float when the frame mixes integer and
        # float columns (e.g. a float rating); .iloc rejects float positions,
        # so force the ids back to real integers.
        user_id = int(row["user_id"])
        recipe_id = int(row["recipe_id"])

        recipe_data = self.recipes.iloc[recipe_id]

        # The nutrition column holds the textual form of a list; literal_eval
        # parses it without executing arbitrary code from the CSV.
        nutrition = ast.literal_eval(recipe_data["nutrition"])

        response = (
            user_id,
            recipe_id,
            np.array([recipe_data["minutes"]]),
            np.array(nutrition),
            np.array([recipe_data["n_steps"]]),
            np.array([recipe_data["n_ingredients"]]),
        )

        if self.has_rating_column:
            return (*response, np.array([row["rating"]]))
        return response

In [None]:
# Sizes of the user and recipe id spaces; n_users is the width of the one-hot
# user encoding built in to_sparse_batch.
n_users = 226570
n_items = 231637

# NOTE(review): the loader is named `train_dataloader` but reads test.csv --
# confirm whether this is a deliberate smoke-test choice or the wrong split.
train_dataloader = DataLoader(
    FoodRatingsDataset(
        "../Preprocessing/processed_dataframes/test.csv",
        "../Preprocessing/processed_dataframes/sorted_recipes.csv",
        n_users,
        n_items,
        has_rating_column=True,
    ), 
    batch_size=batch_size,
    num_workers=1,
    shuffle=False,
)

In [None]:
def one_hot_encode_sparse(idxs, arange, ones, size, device):
    """Build a sparse one-hot matrix with a single 1.0 per row.

    Row ``i`` of the result has its only non-zero entry at column ``idxs[i]``.

    Args:
        idxs: 1-D integer tensor of column indices, one per row.
        arange: Pre-built ``torch.arange`` buffer with at least ``len(idxs)``
            entries, used as the row indices.
        ones: Pre-built buffer of ones with at least ``len(idxs)`` entries,
            used as the non-zero values.
        size: ``(n_rows, n_cols)`` shape of the resulting tensor.
        device: Device on which the sparse tensor is created.

    Returns:
        A float32 ``torch.sparse_coo_tensor`` of shape ``size``.
    """
    n_rows = idxs.shape[0]

    # The shared buffers are sized for a full batch; trim them so a shorter
    # (e.g. final) batch still stacks into a valid 2 x n_rows index matrix.
    row_idxs = arange[:n_rows]
    # vstack needs both index rows on the same device.
    col_idxs = idxs.to(row_idxs.device)

    return torch.sparse_coo_tensor(
        torch.vstack([row_idxs, col_idxs]),
        ones[:n_rows],
        size,
        dtype=torch.float32,
        device=device,
    )

def csr_to_torch_sparse(scipy_mat):
    """Convert a SciPy sparse matrix into a float32 torch sparse COO tensor.

    The stored values are preserved (previously every non-zero entry was
    replaced by 1.0, which discarded weights such as TF-IDF scores). Entries
    that are stored but equal to zero are skipped.

    Args:
        scipy_mat: Any SciPy sparse matrix exposing ``tocoo()``.

    Returns:
        A ``torch.sparse_coo_tensor`` with the same shape as ``scipy_mat`` and
        float32 values.
    """
    coo = scipy_mat.tocoo()

    # Drop explicitly stored zeros so the sparsity pattern holds only true
    # non-zero entries.
    keep = coo.data != 0
    indices = np.vstack([coo.row[keep], coo.col[keep]])
    values = coo.data[keep].astype(np.float32)

    return torch.sparse_coo_tensor(
        torch.as_tensor(indices, dtype=torch.long),
        torch.as_tensor(values),
        coo.shape,
    )


def to_sparse_batch(
    batch,
    tfidf_mat,
    tags_mat,
    n_users,
    n_items,
    arange,
    ones,
    n_tags=552,
    device="cpu",
    ratings=True,
):
    """Turn a collated dataset batch into sparse model inputs.

    Args:
        batch: Tuple of tensors ``(user_ids, recipe_ids, minutes, nutritions,
            steps, ingredients[, ratings])``; the trailing ratings tensor is
            expected only when ``ratings`` is True.
        tfidf_mat: SciPy sparse matrix, row-indexed by recipe id.
        tags_mat: SciPy sparse matrix, row-indexed by recipe id.
        n_users: Width of the one-hot user encoding.
        n_items: Currently unused (the recipe one-hot encoding is not built).
        arange: Pre-built ``torch.arange`` buffer with at least one entry per
            sample in the batch.
        ones: Pre-built buffer of ones with at least one entry per sample.
        n_tags: Currently unused.
        device: Device for the returned tensors.
        ratings: Whether ``batch`` carries a trailing ratings tensor.

    Returns:
        ``(sparse_user_ids, recipe_feature_vector, ratings)`` where the first
        two are sparse tensors with one row per sample and the last is the
        ratings tensor moved to ``device``, or ``None`` when ``ratings`` is
        False.
    """
    # Honor the `ratings` flag rather than always expecting seven elements.
    if ratings:
        (
            user_ids,
            recipe_ids,
            minutes,
            nutritions,
            steps,
            ingredients,
            rating_values,
        ) = batch
    else:
        (
            user_ids,
            recipe_ids,
            minutes,
            nutritions,
            steps,
            ingredients,
        ) = batch
        rating_values = None

    batch_size = user_ids.shape[0]

    # The shared buffers are sized for a full batch; trim them so a shorter
    # final batch still works, and keep the indices on one device.
    row_idxs = arange[:batch_size]
    user_idxs = user_ids.long().to(row_idxs.device)
    sparse_user_ids = one_hot_encode_sparse(
        user_idxs, row_idxs, ones[:batch_size], (batch_size, n_users), device
    )

    # Index the SciPy matrices with a plain integer numpy array.
    recipe_rows = recipe_ids.long().cpu().numpy()
    sparse_tags = csr_to_torch_sparse(tags_mat[recipe_rows]).to(device)
    sparse_tfidf = csr_to_torch_sparse(tfidf_mat[recipe_rows]).to(device)

    # Cast the dense features to float32 so every concatenated block shares
    # the dtype and device of the tag / TF-IDF blocks.
    dense_blocks = [
        feature.to(device=device, dtype=torch.float32).to_sparse()
        for feature in (minutes, nutritions, steps, ingredients)
    ]

    recipe_feature_vector = torch.cat(
        dense_blocks + [sparse_tags, sparse_tfidf],
        dim=1,
    )

    if rating_values is not None:
        rating_values = rating_values.to(device)

    return sparse_user_ids, recipe_feature_vector, rating_values

In [None]:
# Sanity check: dtype of the shared `ones` buffer (created as torch.float32 above).
ones.dtype

In [None]:
# Smoke test: convert only the first batch and print the shapes of the
# resulting user encoding, recipe feature matrix and ratings tensor.
for i_batch, batch in enumerate(train_dataloader):
    vus, vis, ratings = to_sparse_batch(batch, preparations_tfidf, tags_matrix, n_users, n_items, arange, ones)

    print(vus.shape)
    print(vis.shape)
    print(ratings.shape)
#     print(vis)
    # Stop after the first batch; this cell only checks that the pipeline runs.
    break