In [1]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [2]:
class RatingsDataset(Dataset):
    """Map-style dataset serving one-hot user/item vectors together with a rating.

    The CSV is read once at construction time. User and item ids are shifted
    down by one so that they can be used directly as 0-based positions inside
    the one-hot vectors (the code therefore treats ids as starting at 1).

    Args:
        csv_file: Location of the ratings table; forwarded to ``pandas.read_csv``.
        user_column: Name of the column holding the user ids.
        item_column: Name of the column holding the item ids.
        rating_column: Name of the column holding the ratings.

    Attributes:
        n_users: Largest user id in the file; length of the user one-hot vector.
        n_items: Largest item id in the file; length of the item one-hot vector.
        df: DataFrame with the three selected columns, ids already shifted to
            0-based, and a fresh ``RangeIndex``.

    Raises:
        ValueError: If the CSV contains no rows.
    """

    def __init__(self, csv_file, user_column, item_column, rating_column):
        raw = pd.read_csv(csv_file)
        if raw.empty:
            # Fail early with a readable message instead of an opaque error
            # from taking the maximum of an empty column.
            raise ValueError(f"{csv_file!r} contains no ratings")

        self.user_column = user_column
        self.item_column = item_column
        self.rating_column = rating_column

        # Vector sizes are taken from the largest (still 1-based) id.
        self.n_users = int(raw[user_column].max())
        self.n_items = int(raw[item_column].max())

        # Vectorised shift to 0-based ids; only the three needed columns are kept.
        self.df = pd.DataFrame({
            user_column: raw[user_column] - 1,
            item_column: raw[item_column] - 1,
            rating_column: raw[rating_column],
        }).reset_index(drop=True)

        # Per-column arrays keep each column's own dtype. Fetching a whole row
        # with ``df.iloc[idx]`` would upcast the ids to float whenever the
        # rating column is float, and a float cannot index a tensor. Array
        # lookups are also far cheaper than building a Series per sample.
        # NOTE: these are snapshots; later edits to ``self.df`` are not reflected.
        self._user_idx = self.df[user_column].to_numpy(dtype="int64")
        self._item_idx = self.df[item_column].to_numpy(dtype="int64")
        self._ratings = self.df[rating_column].to_numpy()

    def __len__(self):
        """Return the number of rating rows."""
        return len(self._ratings)

    def __getitem__(self, idx):
        """Return ``(user_one_hot, item_one_hot, rating)`` for row ``idx``.

        The one-hot vectors are float tensors of length ``n_users`` and
        ``n_items``; the rating is returned as a NumPy scalar in the rating
        column's dtype.
        """
        user_one_hot = torch.zeros(self.n_users)
        user_one_hot[int(self._user_idx[idx])] = 1

        item_one_hot = torch.zeros(self.n_items)
        item_one_hot[int(self._item_idx[idx])] = 1

        return user_one_hot, item_one_hot, self._ratings[idx]

In [3]:
# NOTE: absolute path inside the author's home directory; adjust it when
# running this notebook on another machine.
TRAIN_CSV = "/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data/train_data.csv"

# Build the dataset from the training ratings, naming each column explicitly.
ds = RatingsDataset(
    csv_file=TRAIN_CSV,
    user_column="user_id",
    item_column="movie_id",
    rating_column="rating",
)

In [4]:
# Serve the dataset in mini-batches of four samples.
dataloader = DataLoader(dataset=ds, batch_size=4)

In [5]:
# Peek at the first batch only: user one-hots, item one-hots, then ratings.
# The printed tensors elide the middle of each row, so a hot entry may not be
# visible; `vis.argmax(dim=1)` shows the hot position of each sample instead.
for i_batch, sample_batch in enumerate(dataloader):
    vus, vis, rs = sample_batch
    print(vus, vis, rs, sep="\n")
    break

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([5, 3, 4, 2])
