In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import sys

In [3]:
# Add the parent directory to the import path so `utils` can be imported below.
sys.path.append("../")

from utils import build_logger

# Logger instance used later in the notebook to report the split shapes.
logger = build_logger()

In [4]:
# Thresholds applied further down: users / items with fewer ratings than
# these are dropped.
user_min = 5
item_min = 5

# The ratings file is tab-separated without a header row; all four columns
# are parsed as int32.
df = pd.read_csv(
    "../inputs/ml-100k/u.data",
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "time"],
    dtype={col: "int32" for col in ("user_id", "item_id", "rating", "time")},
)
df.head()

Unnamed: 0,user_id,item_id,rating,time
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Size of the data before any filtering: distinct users, distinct items, shape.
# user_id / item_id are integer columns, so nunique() equals unique().size.
print('First pass')
print('num_users = {}'.format(df.user_id.nunique()))
print('num_items = {}'.format(df.item_id.nunique()))
print('df_shape  = {}'.format(df.shape))

First pass
num_users = 943
num_items = 1682
df_shape  = (100000, 4)


In [6]:
# Number of ratings per user (value_counts orders them from most to fewest).
user_counts = df.user_id.value_counts()
user_counts.head()

405    737
655    685
13     636
450    540
276    518
Name: user_id, dtype: int64

In [7]:
# Number of ratings per item; report how many distinct items exist before
# the minimum-count filter is applied.
item_counts = df.item_id.value_counts()
print('previous item shape: {}'.format(len(item_counts)))

previous item shape: 1682


In [8]:
# get user and tiem category info
user_counts = user_counts[user_counts >= user_min]
item_counts = item_counts[item_counts >= item_min]

In [9]:
# How many users / items survive the minimum-count thresholds.
print("next user size: {}".format(len(user_counts)))
print('next item size: {}'.format(len(item_counts)))

next user size: 943
next item size: 1349


In [10]:
# Keep only ratings whose user and item both passed the count thresholds.
# NOTE(review): this is a single pass; dropping items could in principle push
# a user back under user_min. The printed stats after this cell show whether
# that matters for this dataset.
df = df[df.user_id.isin(user_counts.index) & df.item_id.isin(item_counts.index)]

In [11]:
# Size of the data after filtering: distinct users, distinct items, shape.
# user_id / item_id are integer columns, so nunique() equals unique().size.
print('Second pass')
print('num_users = {}'.format(df.user_id.nunique()))
print('num_items = {}'.format(df.item_id.nunique()))
print('df_shape  = {}'.format(df.shape))

Second pass
num_users = 943
num_items = 1349
df_shape  = (99287, 4)


In [13]:
# Order the ratings by user, then by timestamp, and renumber the rows from 0.
df = df.sort_values(by=['user_id', 'time']).reset_index(drop=True)
df.head()

Unnamed: 0,user_id,item_id,rating,time
0,1,168,5,874965478
1,1,172,5,874965478
2,1,165,5,874965518
3,1,156,4,874965556
4,1,196,5,874965677


In [15]:
# Add previous item
df['prev_item_id'] = df.item_id
df.prev_item_id = df.prev_item_id.shift(periods=1)
df.head()

Unnamed: 0,user_id,item_id,rating,time,prev_item_id
0,1,168,5,874965478,
1,1,172,5,874965478,168.0
2,1,165,5,874965518,172.0
3,1,156,4,874965556,165.0
4,1,196,5,874965677,156.0


In [18]:
# Negative sampling
df['neg_item_id'] = df.item_id.sample(df.shape[0]).values
df.head()

Unnamed: 0,user_id,item_id,rating,time,prev_item_id,neg_item_id
0,1,168,5,874965478,,174
1,1,172,5,874965478,168.0,322
2,1,165,5,874965518,172.0,946
3,1,156,4,874965556,165.0,251
4,1,196,5,874965677,156.0,404


In [14]:
# split train and test ddataframe
df = df.sort_values(by=['time'])
duplicate_mask = df.duplicated(subset=['user_id'], keep='last')
remain_df = df[duplicate_mask]
test_df = df[~duplicate_mask]
duplicate_mask = remain_df.duplicated(subset=['user_id'], keep='last')
train_df = remain_df[duplicate_mask]
valid_df = remain_df[~duplicate_mask]

In [15]:
# Report the (rows, columns) shape of each split through the notebook logger.
for _template, _frame in (
    ("train shape {}", train_df),
    ("valid shape {}", valid_df),
    ("test shape {}", test_df),
):
    logger.info(_template.format(_frame.shape))

2019-07-12 22:42:39,032 - ..\utils.py - INFO - train shape (97401, 5)
2019-07-12 22:42:39,033 - ..\utils.py - INFO - valid shape (943, 5)
2019-07-12 22:42:39,033 - ..\utils.py - INFO - test shape (943, 5)


In [16]:
# encode feature
cat_names = ['user_id', 'item_id', 'prev_item_id']
ordinal_encoder = OrdinalEncoder(categories='auto', dtype='int32')

data = ordinal_encoder.fit_transform(train_df[cat_names])
train_df[cat_names] = data

data = ordinal_encoder.transform(valid_df[cat_names])
valid_df[cat_names] = data

data = ordinal_encoder.transform(test_df[cat_names])
test_df[cat_names] = data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_array(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [17]:
# Set first item non for each user
train_df.sort_values(by=['user_id'])
first_mask = ~train_df.duplicated(subset=['user_id'], keep='first')
train_df['prev_item_id'][first_mask] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [18]:
# Preview the first training rows after encoding and the -1 sentinel assignment.
train_df.head()

Unnamed: 0,user_id,item_id,rating,time,prev_item_id
214,258,253,4,874724710,-1
83965,258,284,4,874724727,253
43027,258,296,4,874724754,284
21396,258,183,4,874724781,296
82655,258,171,4,874724843,183


In [19]:
# Inspect the column dtypes of the training frame.
train_df.dtypes

user_id         int32
item_id         int32
rating          int32
time            int32
prev_item_id    int32
dtype: object

In [20]:
# Reduce every split to the categorical columns listed in cat_names; all other
# columns are dropped from here on.
train_df, valid_df, test_df = (
    frame[cat_names] for frame in (train_df, valid_df, test_df)
)

In [21]:
# Take the row at position 1 of the training frame to see what one sample looks like.
one_data = train_df.iloc[1]
one_data

user_id         258
item_id         284
prev_item_id    253
Name: 83965, dtype: int32

In [22]:
# The same row as a plain NumPy array (what the Dataset below returns per item).
one_data.values

array([258, 284, 253])

## Build PyTorch Dataset

In [24]:
from torch.utils.data import Dataset, DataLoader

class MovieLenDataset(Dataset):
    """Map-style dataset that serves each row of a DataFrame as a NumPy array.

    Args:
        df: Frame whose rows are the samples; one row is returned per index.
        dataset_type: Label for the split (the notebook passes 'train',
            'valid' or 'test'). It is stored on the instance but not used by
            the class itself.

    Note:
        The rows are converted to a NumPy array once at construction time.
        Changes made to ``df`` afterwards are not reflected in the samples.
    """

    def __init__(self, df: pd.DataFrame, dataset_type: str):
        self.dataset_type = dataset_type
        self.df = df
        # DataFrame.iloc[idx] builds a pandas Series on every call, which is
        # costly when a DataLoader fetches samples one at a time. Convert the
        # frame once and index the resulting array instead.
        self._rows = np.ascontiguousarray(df.values)

    def __len__(self):
        """Return the number of samples (rows)."""
        return self._rows.shape[0]

    def __getitem__(self, idx):
        """Return the row at position ``idx`` as a NumPy array."""
        return self._rows[idx]

In [26]:
# Wrap each split in a MovieLenDataset, tagging it with its split name.
train_ds, valid_ds, test_ds = (
    MovieLenDataset(frame, split)
    for frame, split in ((train_df, 'train'), (valid_df, 'valid'), (test_df, 'test'))
)

In [28]:
# Show one sample from the training dataset followed by its Python type.
print(train_ds[1], type(train_ds[1]), sep="\n")

[258 284 253]
<class 'numpy.ndarray'>


In [31]:
# Shuffled loader over the training set, four samples per batch, using the
# DataLoader's default collation.
data_bunch = DataLoader(dataset=train_ds, shuffle=True, batch_size=4)

In [32]:
# Pull a single batch from the loader to see what it yields, then stop.
for i_batch, sample_batch in enumerate(data_bunch):
    print("num of batch", i_batch)
    print("sample batch", sample_batch)
    
    break

num of batch 0
sample batch tensor([[  37,   66, 1000],
        [ 120,   11,   49],
        [ 590,  170,  463],
        [ 234,   78,  235]], dtype=torch.int32)


In [40]:
import torch as T
from torch import Tensor

def cat_collate(batch) -> Tensor:
    """Collate a list of per-sample integer arrays into one int64 tensor.

    Args:
        batch: Sequence of samples as passed by ``DataLoader``; the samples
            are expected to be equally sized array-likes of integers.

    Returns:
        Tensor of dtype ``torch.long`` with the samples stacked along dim 0.
    """
    # Merge the samples into a single ndarray first. Building a tensor directly
    # from a list of ndarrays converts element by element and is far slower.
    return T.as_tensor(np.asarray(batch), dtype=T.long)

In [41]:
# Rebuild the training loader so that batches are assembled by cat_collate
# instead of the default collation.
data_bunch = DataLoader(
    dataset=train_ds,
    collate_fn=cat_collate,
    shuffle=True,
    batch_size=4,
)

In [42]:
# Pull a single batch from the rebuilt loader to inspect it, then stop;
# sample_batch stays bound for the dtype/shape checks that follow.
for i_batch, sample_batch in enumerate(data_bunch):
    print("num of batch", i_batch)
    print("sample batch", sample_batch)
    
    break

num of batch 0
sample batch tensor([[245, 467, 670],
        [442, 243, 341],
        [832, 394, 379],
        [647, 838, 668]])


In [43]:
# Check the element dtype of the batch fetched above.
sample_batch.dtype

torch.int64

In [44]:
# Check the dimensions of the batch fetched above.
sample_batch.shape

torch.Size([4, 3])

In [47]:
# Map each encoded column name to the array of original values the encoder
# learned for it (position in the array == encoded integer).
dict(zip(cat_names, ordinal_encoder.categories_))

{'user_id': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
        170, 171, 172, 173, 174, 175, 17