In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import scipy.sparse as sp
import numpy as np
from collections import defaultdict
from typing import Dict, List
import copy
import os

In [3]:
# Minimum number of ratings a user / an item needs in order to be kept
user_min = 5
item_min = 5

# u.data is tab-separated with no header row, so the column names are supplied here
df = pd.read_csv(
    "../inputs/ml-100k/u.data",
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "time"],
)
df.head()

Unnamed: 0,user_id,item_id,rating,time
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Dataset size before any filtering: distinct users, distinct items, frame shape
print('First pass')
print('num_users = {}'.format(len(df["user_id"].unique())))
print('num_items = {}'.format(len(df["item_id"].unique())))
print('df_shape  = {}'.format(df.shape))

First pass
num_users = 943
num_items = 1682
df_shape  = (100000, 4)


In [5]:
# Number of ratings per user id (value_counts lists the most active users first)
user_counts = df["user_id"].value_counts()
user_counts.head()

405    737
655    685
13     636
450    540
276    518
Name: user_id, dtype: int64

In [6]:
# Number of ratings per item id (value_counts lists the most rated items first)
item_counts = df["item_id"].value_counts()
item_counts.head()

50     583
258    509
100    508
181    507
294    485
Name: item_id, dtype: int64

In [7]:
# Same counts in ascending order: shows the least-rated items
item_counts.sort_values().head()

1663    1
1507    1
1201    1
1603    1
1577    1
Name: item_id, dtype: int64

In [8]:
# Keep only ratings whose user has at least `user_min` ratings and whose item
# has at least `item_min` ratings.
#
# Both thresholds use the counts computed on the unfiltered frame
# (user_counts / item_counts), so the two conditions are independent and can
# be combined into a single boolean mask. Series.map looks the counts up in a
# vectorized way instead of calling a Python lambda once per row.
active_user_mask = df["user_id"].map(user_counts) >= user_min
popular_item_mask = df["item_id"].map(item_counts) >= item_min

# .copy() makes the result an independent frame, so later column assignments
# do not operate on a filtered view of the original data.
df = df[active_user_mask & popular_item_mask].copy()

In [9]:
# Dataset size after the minimum-activity filter
print('Second pass')
print('num_users = {}'.format(len(df["user_id"].unique())))
print('num_items = {}'.format(len(df["item_id"].unique())))
print('df_shape  = {}'.format(df.shape))

Second pass
num_users = 943
num_items = 1349
df_shape  = (99287, 4)


In [10]:
print("Normalizing temporal values...")
# Standardize timestamps to zero mean / unit (sample) standard deviation.
mean_time = df["time"].mean()
std_time = df["time"].std()
# df was produced by boolean filtering, so writing a column in place can
# trigger pandas' SettingWithCopyWarning; assign() returns a new frame with
# the replaced column instead of mutating the existing one.
df = df.assign(time=(df["time"] - mean_time) / std_time)

Normalizing temporal values...


In [11]:
# user index -> list of that user's events; missing users start with an empty list
training_set = defaultdict(list)
print("Constructing datasets ...")

Constructing datasets ...


In [12]:
# Next free index for users / items; indices start at 1
num_users, num_items = 1, 1

# Raw id -> index lookups
item_to_idx, user_to_idx = {}, {}

# Index -> raw id lookups
idx_to_item, idx_to_user = {}, {}

In [13]:
# Walk the ratings once, handing out indices to raw ids in order of first
# appearance and recording every rating as an (item index, time) event.
for raw_user, raw_item, event_time in zip(df["user_id"], df["item_id"], df["time"]):
    # Item seen for the first time: give it the next free item index
    item_idx = item_to_idx.get(raw_item)
    if item_idx is None:
        item_idx = num_items
        item_to_idx[raw_item] = item_idx
        idx_to_item[item_idx] = raw_item
        num_items += 1

    # User seen for the first time: give it the next free user index
    user_idx = user_to_idx.get(raw_user)
    if user_idx is None:
        user_idx = num_users
        user_to_idx[raw_user] = user_idx
        idx_to_user[user_idx] = raw_user
        num_users += 1

    # The rating value is ignored: every interaction counts as positive feedback
    training_set[user_idx].append((item_idx, event_time))

# Order each user's events by time, earliest first
for history in training_set.values():
    history.sort(key=lambda event: event[1])

In [14]:
# First four (item index, normalized time) events recorded for user index 1
training_set[1][:4]

[(1, -0.42407753863047365),
 (290, -0.42407753863047365),
 (490, -0.42407753863047365),
 (381, -0.42406406001501273)]

In [15]:
# Check the container type used for a single user's history
type(training_set[1])

list

In [16]:
# Per-user containers filled by the train / validation / test split
training_times = dict()
val_set, val_times = dict(), dict()
test_set, test_times = dict(), dict()

In [17]:
# Per-user set of training item indices, for fast membership checks
item_set_per_user = {}
for user, history in training_set.items():
    if len(history) >= 3:
        # The latest event is held out for test, the one before it for
        # validation; each is stored together with the event preceding it.
        test_item, test_time = history.pop()
        val_item, val_time = history.pop()
        last_item, last_time = history[-1]
        test_set[user] = (test_item, val_item)
        test_times[user] = (test_time, val_time)
        val_set[user] = (val_item, last_item)
        val_times[user] = (val_time, last_time)
    else:
        # Fewer than 3 events: nothing to hold out, store placeholder pairs
        test_set[user] = (-1, -1)
        test_times[user] = (-1, -1)
        val_set[user] = (-1, -1)
        val_times[user] = (-1, -1)

    # Keep the remaining (item, time) events, then reduce the training
    # history to bare item indices and index them in a set.
    training_times[user] = list(history)
    training_set[user] = [event[0] for event in history]
    item_set_per_user[user] = set(training_set[user])

# Total number of events left in the training split
num_train_events = sum(len(items) for items in training_set.values())

In [19]:
# Show the first few entries of each per-user structure for user index 1.
# training_times keeps (item, time) pairs, training_set only the item indices,
# and item_set_per_user is an unordered set of those indices.
print("training_times sample: {}".format(training_times[1][:4]))
print("training_set sample: {}".format(training_set[1][:4]))
print("item_set_per_user sample: {}".format(list(item_set_per_user[1])[:4]))

training_times sample: [(1, -0.42407753863047365), (290, -0.42407753863047365), (490, -0.42407753863047365), (381, -0.42406406001501273)]
training_set sample: [1, 290, 490, 381]
item_set_per_user sample: [1, 644, 390, 521]


In [20]:
# u.user is pipe-separated with no header row, so the column names are supplied here
user_df = pd.read_csv(
    '../inputs/ml-100k/u.user',
    sep='|',
    header=None,
    names=['id', 'age', 'gender', 'occupation', 'zipCode'],
)
user_df.head()

Unnamed: 0,id,age,gender,occupation,zipCode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [21]:
# Index the user table by its 'id' column so rows can be selected by raw user id.
# Once 'id' has become the index it is no longer a column, so calling
# set_index('id') a second time raises KeyError; the guard makes re-running
# this cell a no-op instead.
if user_df.index.name != 'id':
    user_df = user_df.set_index('id')
user_df.head()

Unnamed: 0_level_0,age,gender,occupation,zipCode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [22]:
# Raw user ids listed in internal-index order (index 1 first)
orig_user_indices = [idx_to_user[idx] for idx in range(1, num_users)]
orig_user_indices[:4]

[196, 186, 22, 244]

In [23]:
# User table rows selected by raw user id, in internal-index order, as a plain array
user_feats = user_df.loc[orig_user_indices].values

In [25]:
# Row k is the one-hot encoding of 0-based index k. num_users / num_items are
# one past the last assigned 1-based index, hence the "- 1".
user_one_hot = sp.identity(num_users - 1, format="csr")
item_one_hot = sp.identity(num_items - 1, format="csr")

In [26]:
# Coordinate lists for the sparse training matrices
train_rows, train_cols = [], []
# Values stored at those coordinates: indicator, previous item, and timestamps
train_vals, train_prev_vals = [], []
train_times, train_prev_times = [], []

In [27]:
# Emit one training row per consecutive (previous item -> item) pair in each
# user's history; the first item of a history has no predecessor and is skipped.
for user, items in training_set.items():
    timestamps = [event[1] for event in training_times[user]]
    n_transitions = max(len(items) - 1, 0)

    train_rows.extend([user] * n_transitions)
    train_cols.extend(items[1:])
    train_vals.extend([1] * n_transitions)
    train_prev_vals.extend(items[:-1])
    train_times.extend(timestamps[1:])
    train_prev_times.extend(timestamps[:-1])

In [28]:
def _user_item_coo(values):
    """Place `values` at the training (user, item) coordinates of a num_users x num_items COO matrix."""
    return sp.coo_matrix((values, (train_rows, train_cols)),
                         shape=(num_users, num_items))


# All four matrices share the same coordinates and differ only in the stored value
sp_train = _user_item_coo(train_vals)
sp_train_prev = _user_item_coo(train_prev_vals)
sp_train_times = _user_item_coo(train_times)
sp_train_prev_times = _user_item_coo(train_prev_times)

In [29]:
# Shift the 1-based user / item indices held in the COO matrices down to
# 0-based positions, matching the rows of the one-hot identity matrices.
user_indices = sp_train.row - 1
# sp_train_prev stores the previous item's 1-based index as the matrix value
prev_indices = sp_train_prev.data - 1
pos_indices = sp_train.col - 1
# One uniformly random 0-based item index per training row.
# NOTE: neg_indices is reassigned by the later "generate neg data" cell.
neg_indices = np.random.randint(1, sp_train.shape[1], size=len(sp_train.row), dtype=np.int32) - 1

In [34]:
# generate neg data
# One negative item per training row: a 0-based item index (the same index
# space as pos_indices and the rows of item_one_hot) that the row's user has
# not interacted with in the training split.
#
# Problems fixed relative to the previous version:
#   * candidates came from np.arange(num_items), which contains the
#     non-existent 0-based index num_items - 1, and were then shifted by an
#     extra "- 1" although pos_indices is already 0-based, so a sampled
#     "negative" could be -1 or an item that is actually a positive;
#   * the candidate pool was "items that are never a positive for anyone"
#     rather than "items this user has not seen" (item_set_per_user);
#   * the draw was unseeded, so results changed on every run.
data_range_list = np.arange(num_items - 1)
if any(len(seen) >= data_range_list.size for seen in item_set_per_user.values()):
    raise ValueError("a user has interacted with every item; no negative item can be sampled")

neg_rng = np.random.RandomState(0)  # fixed seed keeps Restart & Run All reproducible
neg_indices = neg_rng.choice(data_range_list, size=len(sp_train.row))

# Redraw every row whose sample collides with its user's training items until
# no collisions remain. item_set_per_user is keyed by the 1-based user index
# and holds 1-based item indices, hence the "+ 1" in the membership test.
pending = np.arange(neg_indices.size)
while pending.size:
    collides = np.fromiter(
        ((int(neg_indices[k]) + 1) in item_set_per_user[int(sp_train.row[k])]
         for k in pending),
        dtype=bool,
        count=pending.size,
    )
    pending = pending[collides]
    neg_indices[pending] = neg_rng.choice(data_range_list, size=pending.size)

In [35]:
# One negative index per training row is expected
neg_indices.shape

(96458,)

In [36]:
# Pick one one-hot row per training example by indexing the identity
# matrices with the 0-based index arrays.
users = user_one_hot[user_indices]
prev_items = item_one_hot[prev_indices]
pos_items = item_one_hot[pos_indices]
neg_items = item_one_hot[neg_indices]

In [37]:
# Each feature row is [user one-hot | previous-item one-hot | candidate-item one-hot]
pos_feats = sp.hstack([users, prev_items, pos_items])
# Negative rows must carry the sampled negative item. This previously stacked
# pos_items, which made neg_feats an exact copy of pos_feats.
neg_feats = sp.hstack([users, prev_items, neg_items])

In [39]:
# Inspect shape / format of the per-row user one-hot matrix
users

<96458x943 sparse matrix of type '<class 'numpy.float64'>'
	with 96458 stored elements in Compressed Sparse Row format>

In [40]:
# Inspect shape / format of the stacked positive feature matrix
pos_feats

<96458x3641 sparse matrix of type '<class 'numpy.float64'>'
	with 289374 stored elements in COOrdinate format>