In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [3]:
cd ..

C:\Projects\python\recommender


In [4]:
user_min = 5
item_min = 5

df = pd.read_csv("inputs/ml-100k/u.data",
                 header=None,
                 sep="\t",
                 names=["user_id", "item_id", "rating", "time"],
                 dtype={
                     'user_id': 'int32',
                     'item_id': 'int32',
                     'rating': 'int32',
                     'time': 'int32'
                 })
df.head()

Unnamed: 0,user_id,item_id,rating,time
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
df.shape

(100000, 4)

In [5]:
import torch as T
from torch.utils.data import Dataset, DataLoader

In [7]:
class MovielenDs(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df
        
    def __getitem__(self, idx):
        return self.df.iloc[idx]
        
    def __len__(self):
        return self.df.shape[0]

In [9]:
ds = MovielenDs(df)
ds

<__main__.MovielenDs at 0x2ae6f03e7f0>

In [20]:
from typing import List

def my_collate(batch: List[pd.DataFrame]):
    df_batch = pd.DataFrame(batch)
    return df_batch

In [21]:
dl = DataLoader(ds, batch_size=2, collate_fn=my_collate)
dl

<torch.utils.data.dataloader.DataLoader at 0x2ae7076c898>

In [22]:
ds_it = iter(dl)
ds_it

<torch.utils.data.dataloader._DataLoaderIter at 0x2ae7076c7b8>

In [23]:
print(ds_it.next())

   user_id  item_id  rating       time
0      196      242       3  881250949
1      186      302       3  891717742


In [24]:
user_counts = df['user_id'].value_counts()
item_counts = df['item_id'].value_counts()

In [26]:
user_one_hot = sp.identity(user_counts.size).tocsr()
item_one_hot = sp.identity(item_counts.size).tocsr()

In [27]:
df_batch = ds_it.next()
df_batch

Unnamed: 0,user_id,item_id,rating,time
2,22,377,1,878887116
3,244,51,2,880606923


In [28]:
user_vector = user_one_hot[df_batch['user_id']]
item_vector = user_one_hot[df_batch['item_id']]

In [30]:
feat_matrix = sp.hstack([user_vector, item_vector])
feat_matrix

<2x1886 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in COOrdinate format>

In [31]:
feat_matrix.data

array([1., 1., 1., 1.])

In [34]:
feat_matrix.row

array([0, 1, 0, 1], dtype=int32)

In [35]:
feat_matrix.col

array([  22,  244, 1320,  994], dtype=int32)

In [37]:
feat_index = np.vstack(feat_matrix.nonzero())
feat_index

array([[   0,    1,    0,    1],
       [  22,  244, 1320,  994]], dtype=int32)

In [39]:
index_tensor = T.LongTensor(feat_index.tolist())

In [40]:
value_tensor = T.FloatTensor(feat_matrix.data)
value_tensor

tensor([1., 1., 1., 1.])

In [41]:
feat_tensor = T.sparse.FloatTensor(index_tensor, value_tensor)
feat_tensor

tensor(indices=tensor([[   0,    1,    0,    1],
                       [  22,  244, 1320,  994]]),
       values=tensor([1., 1., 1., 1.]),
       size=(2, 1321), nnz=4, layout=torch.sparse_coo)