In [1]:
from datetime import datetime, timedelta
from timeit import default_timer as timer

import numpy as np
import pandas as pd

In [2]:
def replace_zero_to_nan(m):
    m = m.astype(float)
    m[m==0] = np.nan
    return m

def calc_rmse_rating_matrix(P, Q, R):
    R_pred = np.dot(P, Q.T)
    mse = np.nanmean((R - R_pred)**2)
    rmse = np.sqrt(mse)
    return rmse

In [3]:
df = pd.DataFrame({"user":np.array([[u]*5 for u in [0,1,2,3]]).reshape(-1), 
                   'item':[0,1,2,3,4]*4,
                   'ratings':[2,1,5,4,5, 
                              5, 4, 1, np.nan, 2,
                              1,1,5,2,2,
                              1, np.nan, np.nan, 4, 3]
                  })
R = df.pivot(index='item', columns='user', values='ratings').values

## Rating df version

In [None]:
"""
1. hm.... how to vectorizer above np.dot(P, Q.T)?
2. how to deal with no rating rows?

https://towardsdatascience.com/heres-the-most-efficient-way-to-iterate-through-your-pandas-dataframe-4dad88ac92ee
"""


In [51]:
df = pd.read_parquet("../datasets/ml-25m/ratings_sampled.parquet").sample(10_000)
user_col, item_col, rating_col = "userId", "movieId", "rating"

In [52]:
"""
hm... would there be a way to automatically figure out which columns are user,item,rating? 
"""
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 15287779 to 447995
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     10000 non-null  int64  
 1   movieId    10000 non-null  int64  
 2   rating     10000 non-null  float64
 3   timestamp  10000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 390.6 KB


In [53]:
df.head()

Unnamed: 0_level_0,userId,movieId,rating,timestamp
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15287779,99053,56174,3.0,1571105611
7110528,46101,141771,1.0,1441356785
4094967,26943,1092,3.0,1175334092
20903076,135919,99007,2.5,1529077414
2506788,16709,33493,4.0,1175630515


In [46]:
"""
re-index user/item id
- so that it ranges from [0, n_user-1] -> for look-up of embedding vector.
- introductes extra bottleneck.

hm... Maybe there is vectorized way of doing this?
"""

start = timer()

userid2idx = {u_id:i for i, u_id in enumerate(df[user_col].unique())}
itemid2idx = {u_id:i for i, u_id in enumerate(df[item_col].unique())}

df[user_col] = df[user_col].apply(lambda x: userid2idx[x])
df[item_col] = df[item_col].apply(lambda x: itemid2idx[x])
end = timer()
print(f"took = {round(end - start, 3)}s")

took = 0.013s


In [47]:
def calc_mse(y: pd.Series, pred_y: pd.Series) -> float:
    return np.mean((y - pred_y)**2)
def calc_rmse(y: pd.Series, pred_y: pd.Series) -> float:
    return np.sqrt(calc_mse(y, pred_y))

In [48]:
# %%timeit
verbose = False
K = 3
lr = 0.01
_lambda = 0.01
epoch = 10

start = timer()
n_users = df[user_col].nunique()
n_items = df[item_col].nunique()
if verbose:
    print(f"n_users = {n_users}")
    print(f"n_items = {n_items}")

# intializing embedding vector from normal distribution with mean=0, std_dev=1/k
P = np.random.normal(loc=0, scale=1.0/K, size=(n_users, K))
Q = np.random.normal(loc=0, scale=1.0/K, size=(n_items, K))

for i_epoch in range(epoch):
    for row in df.itertuples():
        user_id = getattr(row, user_col)
        item_id = getattr(row, item_col)
        r_ui = getattr(row, rating_col)
        pred_r_ui = np.dot(P[user_id], 
                           Q[item_id])
        e_ui = r_ui - pred_r_ui

        P[user_id] = P[user_id] + lr * (e_ui*Q[item_id] - _lambda*P[user_id])
        Q[item_id] = Q[item_id] + lr * (e_ui*P[user_id] - _lambda*Q[item_id])
    
    if verbose:
        if i_epoch % 10 == 0:
            user_emb = P[df[user_col]]
            item_emb = Q[df[item_col]]
            df['pred_rating'] = np.sum(user_emb * item_emb, axis=1)
            print(f"{i_epoch} : RMSE =  ", np.round(calc_rmse(df["rating"], df["pred_rating"]), 4))

            
user_emb = P[df[user_col]]
item_emb = Q[df[item_col]]
df['pred_rating'] = np.sum(user_emb * item_emb, axis=1)

end = timer()
print(f"took = {round(end - start, 3)}s")

took = 1.332s


In [49]:
print(f"RMSE = {calc_rmse(df['rating'], df['pred_rating'])}")

RMSE = 3.297478631015532


In [None]:
"""
hm... sgd method updates user/item embedding vector each iteration. so prediction on one user
will change each iteration that consists of that user. -> making prediction before hand on all
interaction by vectorization is useless... unless all pairs have unique user and unique item.
"""
user_emb = P[df[user_col]]
item_emb = Q[df[item_col]]

pred_rating = np.sum(user_emb * item_emb, axis=1)

## Rating matrix version

In [40]:
df = pd.read_parquet("../datasets/ml-25m/ratings_sampled.parquet").sample(10_000)
user_col, item_col, rating_col = "userId", "movieId", "rating"
R = df.pivot(index=item_col, columns=user_col, values=rating_col).values

In [41]:
R.shape

(3673, 9022)

In [42]:
start = timer()
verbose = False
n_epoch = 10
n_users, n_items = R.shape
K=3
P = np.random.normal(loc=0, scale=1.0/K, size=(n_users, K))
Q = np.random.normal(loc=0, scale=1.0/K, size=(n_items, K))

for n_step in range(1, n_epoch):
    for i in range(n_users):
        for j in range(n_items):
            r_ui = R[i, j]
            if np.isnan(r_ui): # skip empty elements
                continue
            pred_r_ui = np.dot(P[i, :], Q[j, :])
            e_ui = r_ui - pred_r_ui
            P[i, :] = P[i, :] + lr * (e_ui * Q[j, :] - _lambda * P[i,:]) 
            Q[j, :] = Q[j, :] + lr * (e_ui * P[i, :] - _lambda * Q[j, :])
    if verbose:
        if n_step % 10 == 0:
            print(f"{n_step} : RMSE =  ", np.round(calc_rmse(P, Q, R), 4))
            
print(f"RMSE =  ", np.round(calc_rmse_rating_matrix(P, Q, R), 4))    
end = timer()
print(f"took = {round(end - start, 3)}s")

RMSE =   3.3867
took = 478.787s


In [None]:
# embedding dimension
K = 3

# train test split, train takes 70% elements at random, other 6 become np.nan
train_ratio = 0.7

# Note since this is not deterministic train set may not consist exactly 70% of the data.
bool_matrix = (np.random.rand(n_user, n_item) < train_ratio)
train_R = replace_zero_to_nan(R * bool_matrix)
test_R = replace_zero_to_nan(R * ~bool_matrix)

In [None]:
verbose = False
n_steps = 100
lr = 0.01
_lambda = 0.01
regularization = True
# N = np.sum(bool_matrix)

# intializing embedding vector from normal distribution with mean=0, std_dev=1/k
P = np.random.normal(loc=0, scale=1.0/K, size=(n_user, K))
Q = np.random.normal(loc=0, scale=1.0/K, size=(n_item, K))

for n_step in range(1, n_steps):
    for i in range(n_user):
        for j in range(n_item):
            r_ui = R[i, j]
            if np.isnan(r_ui): # skip non-zero element
                continue   
            e_ui = r_ui - np.dot(P[i, :], Q[j, :])
            if not regularization:
                P[i, :] = P[i, :] + lr*e_ui*Q[j, :]
                Q[j, :] = Q[j, :] + lr*e_ui*P[i, :]
                continue

            # Done for user_i with all item embedding vectors that had interactions
            # then move on(all j is done, next i) to next user, repeat the process.
            P[i, :] = P[i, :] + lr * (e_ui*Q[j, :] - _lambda*P[i, :]) # updates user emb vector(P_i) using item emb vector Q_j
            Q[j, :] = Q[j, :] + lr * (e_ui*P[i, :] - _lambda*Q[j, :]) # use updated user emb vector to update Q_j
    
    if verbose:
        if n_step % 10 == 0:
            print(f"{n_step} : RMSE =  ", np.round(calc_rmse(P, Q, train_R), 4))
        
print(f"RMSE =  ", np.round(calc_rmse(P, Q, train_R), 4))