In [1]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

In [2]:
def replace_zero_to_nan(m):
    """Return a float copy of ``m`` in which every entry equal to 0 is NaN.

    ``astype(float)`` produces a new array, so the argument is not modified.
    """
    as_float = m.astype(float)
    zero_mask = as_float == 0
    as_float[zero_mask] = np.nan
    return as_float

def calc_rmse(P, Q, R):
    """Root-mean-square error between ``R`` and the reconstruction ``np.dot(P, Q.T)``.

    NaN entries of the squared-error matrix (e.g. where ``R`` is NaN) are left
    out of the mean, because the mean is taken with ``np.nanmean``.
    """
    squared_err = (R - np.dot(P, Q.T)) ** 2
    return np.sqrt(np.nanmean(squared_err))

In [3]:
# Toy data set: 4 users x 5 items, one row per (user, item) pair.
# NaN marks a pair that has no rating.
user_ids = np.repeat(np.arange(4), 5)   # 0,0,0,0,0,1,1,1,1,1,...
item_ids = list(range(5)) * 4           # 0,1,2,3,4,0,1,2,3,4,...
ratings = [
    2, 1, 5, 4, 5,             # user 0
    5, 4, 1, np.nan, 2,        # user 1
    1, 1, 5, 2, 2,             # user 2
    1, np.nan, np.nan, 4, 3,   # user 3
]
df = pd.DataFrame({"user": user_ids, "item": item_ids, "ratings": ratings})

# Rating matrix with one row per item and one column per user.
R = df.pivot(index="item", columns="user", values="ratings").values

## Without converting df to a rating matrix

In [None]:
"""
1. hm.... how to vectorize the np.dot(P, Q.T) above?
2. how to deal with no rating rows?
"""


In [4]:
df.head()

Unnamed: 0,user,item,ratings
0,0,0,2.0
1,0,1,1.0
2,0,2,5.0
3,0,3,4.0
4,0,4,5.0


In [5]:
# Drop every row that contains a NaN (here: the user/item pairs without a rating).
# df is modified in place, so from this point on it only holds observed ratings.
df.dropna(inplace=True)

In [6]:
# Seed the global NumPy RNG so the random initialisation below (and every later
# np.random call) is reproducible under "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)

# P and Q are indexed directly by the raw user / item id further down
# (P[user_id], Q[item_id]), so each table needs max_id + 1 rows. nunique() only
# equals that when the ids are exactly 0..n-1; it would under-size the tables
# as soon as dropna() removes every rating of some user or item.
n_users = int(df['user'].max()) + 1
n_items = int(df['item'].max()) + 1

K = 3  # embedding dimension
# initializing embedding vectors from a normal distribution with mean=0, std_dev=1/K
P = np.random.normal(loc=0, scale=1.0/K, size=(n_users, K))
Q = np.random.normal(loc=0, scale=1.0/K, size=(n_items, K))

In [None]:
# Look up the embedding of the user / item on every row of df.
user_emb = P[df['user']]
item_emb = Q[df['item']]
print(user_emb.shape)
print(item_emb.shape)

# Row-wise dot product of the two embeddings = predicted rating of each row.
# This must be the element-wise PRODUCT summed over the K dimensions; summing
# user_emb + item_emb is not a dot product.
np.sum(user_emb * item_emb, axis=1)

# Note: SGD updates the user/item embedding vectors after every single rating, so the
# prediction for a user changes on each step that involves that user. Precomputing the
# predictions for all interactions in one vectorized call is therefore of no use
# inside the training loop.

In [7]:
P, Q

(array([[-1.76929671e-01,  5.62878463e-01,  2.27582011e-04],
        [ 2.17595843e-01,  1.88689665e-01,  3.05486921e-01],
        [-4.38733853e-01,  2.24525947e-01,  1.46512954e-01],
        [ 1.92724393e-01,  2.72089027e-02, -1.82197032e-01]]),
 array([[-0.40296201,  0.08090255,  0.2103449 ],
        [-0.03172667, -0.34742306,  0.46983617],
        [ 0.45306162, -0.36357359, -0.51085631],
        [ 0.54384406, -0.22884538,  0.26698266],
        [-0.15368954,  0.32766316,  0.52380375]]))

In [8]:
lr = 0.01        # step size of each SGD update
_lambda = 0.01   # coefficient of the regularization (shrinkage) term
epoch = 100      # number of full passes over the rows of df

for epoch_idx in range(epoch):
    # One SGD step per row of df, in row order.
    for user_id, item_id, r_ui in zip(df["user"], df["item"], df["ratings"]):
        p_u = P[user_id]   # row views: the in-place updates below write through to P / Q
        q_i = Q[item_id]
        e_ui = r_ui - np.dot(p_u, q_i)   # error of the current prediction for this rating

        p_u += lr * (e_ui * q_i - _lambda * p_u)
        # q_i is updated with the ALREADY updated p_u (sequential, not simultaneous, update).
        q_i += lr * (e_ui * p_u - _lambda * q_i)

In [30]:
# Gather the learned embedding of the user / item on every row of df.
user_emb = P[df['user'].values]
item_emb = Q[df['item'].values]

# Row-wise dot product = predicted rating for each (user, item) row.
df['pred_rating'] = (user_emb * item_emb).sum(axis=1)

In [32]:
df

Unnamed: 0,user,item,ratings,pred_rating
0,0,0,2.0,2.121935
1,0,1,1.0,1.023307
2,0,2,5.0,5.025296
3,0,3,4.0,4.259571
4,0,4,5.0,4.634675
5,1,0,5.0,4.953542
6,1,1,4.0,3.981638
7,1,2,1.0,0.994183
9,1,4,2.0,2.024196
10,2,0,1.0,0.893856


In [None]:
# Rebuild the rating matrix from df: one ROW per item and one COLUMN per user
# (index="item", columns="user"); pairs that have no row in df come out as NaN.
R = df.pivot(index="item", columns="user", values="ratings").values

In [None]:
df

In [None]:
R

In [None]:
# R is pivoted with index="item", columns="user", so its rows are items and its
# columns are users. Unpacking R.shape into (n_users, n_items) would bind the item
# count to n_users, hence the neutral names: P holds one embedding per ROW of R,
# Q one embedding per COLUMN of R.
n_rows, n_cols = R.shape
K=3
P = np.random.normal(loc=0, scale=1.0/K, size=(n_rows, K))
Q = np.random.normal(loc=0, scale=1.0/K, size=(n_cols, K))

# range(1, epoch + 1): run exactly `epoch` passes (range(1, epoch) ran one fewer
# and never reported the final step), while keeping the step counter 1-based
# for the progress print.
for n_step in range(1, epoch + 1):
    for i in range(n_rows):
        for j in range(n_cols):
            r_ui = R[i, j]
            if np.isnan(r_ui): # skip empty elements
                continue
            pred_r_ui = np.dot(P[i, :], Q[j, :])
            e_ui = r_ui - pred_r_ui
            # P[i] is updated first; the Q[j] update then uses the updated P[i].
            P[i, :] = P[i, :] + lr * (e_ui * Q[j, :] - _lambda * P[i,:])
            Q[j, :] = Q[j, :] + lr * (e_ui * P[i, :] - _lambda * Q[j, :])
    if n_step % 10 == 0:
        print(f"{n_step} : RMSE =  ", np.round(calc_rmse(P, Q, R), 4))

In [None]:
R

In [None]:
np.dot(P, Q.T)

In [None]:
# embedding dimension
K = 3

# train/test split: each entry of R goes to the train matrix with probability
# train_ratio and to the test matrix otherwise; in each matrix the entries that
# belong to the other set are NaN.
train_ratio = 0.7

# Shape of the rating matrix. n_user / n_item are not assigned in any earlier
# cell of this notebook, so using them without this line raises NameError on a
# fresh kernel.
# NOTE: R is pivoted with index="item", columns="user", so n_user is the number
# of ROWS of R and n_item the number of COLUMNS of R.
n_user, n_item = R.shape

# Note: the mask is drawn at random, so the train set will generally not contain
# exactly 70% of the data.
bool_matrix = (np.random.rand(n_user, n_item) < train_ratio)
# Masked-out entries become 0 after the multiplication and are then turned into NaN.
train_R = replace_zero_to_nan(R * bool_matrix)
test_R = replace_zero_to_nan(R * (~bool_matrix))

In [None]:
n_steps = 100
lr = 0.01
_lambda = 0.01
regularization = True
# N = np.sum(bool_matrix)

# Rows / columns of the training matrix, under the names the loops below use.
n_user, n_item = train_R.shape

# initializing embedding vectors from a normal distribution with mean=0, std_dev=1/K
P = np.random.normal(loc=0, scale=1.0/K, size=(n_user, K))
Q = np.random.normal(loc=0, scale=1.0/K, size=(n_item, K))

# range(1, n_steps + 1): run exactly n_steps passes with a 1-based step counter.
for n_step in range(1, n_steps + 1):
    for i in range(n_user):
        for j in range(n_item):
            # Train on train_R only. Reading R here would also fit the held-out
            # entries and make the test RMSE below meaningless.
            r_ui = train_R[i, j]
            if np.isnan(r_ui): # skip entries that have no training rating
                continue
            e_ui = r_ui - np.dot(P[i, :], Q[j, :])
            if not regularization:
                P[i, :] = P[i, :] + lr*e_ui*Q[j, :]
                Q[j, :] = Q[j, :] + lr*e_ui*P[i, :]
                continue

            # For row i, step through every column j that has a training rating,
            # then move on to the next row and repeat.
            P[i, :] = P[i, :] + lr * (e_ui*Q[j, :] - _lambda*P[i, :]) # update P_i using Q_j
            Q[j, :] = Q[j, :] + lr * (e_ui*P[i, :] - _lambda*Q[j, :]) # update Q_j using the updated P_i

    if n_step % 10 == 0:
        print(f"{n_step} : RMSE =  ", np.round(calc_rmse(P, Q, train_R), 4))

print(f"Train RMSE =  ", np.round(calc_rmse(P, Q, train_R), 4))
print(f"Test RMSE =  ", np.round(calc_rmse(P, Q, test_R), 4))

In [None]:
# Full prediction matrix: entry [r, c] is the dot product of P[r] and Q[c].
pred_R = P @ Q.T

In [None]:
# Display names of the items; preferenece_rec() indexes this list with a row index of pred_R.
item_list = ["vacuum", 'sweeper', 'laptop', 'mouse', 'keyboard']
# Distinct user ids found in df, as a plain Python list.
user_list = df['user'].unique().tolist()

In [None]:
def preferenece_rec(user):
    """
    Recommend one item for a user.

    Reads column ``user`` of the global ``pred_R`` and returns the entry of the
    global ``item_list`` at the position of that column's largest value.

    parameters
    ---------
    user : int, user_id

    returns
    -------
    element of ``item_list`` whose predicted rating is highest for ``user``
    """
    scores = pred_R[:, user]
    best_idx = np.argmax(scores)
    return item_list[best_idx]

In [None]:
# Print the top recommendation for every user in user_list.
for user in user_list:
    print(f"user_{user} will like = {preferenece_rec(user)}")