In [33]:
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import scipy.sparse as sparse
import scipy.sparse.linalg as splg

def biased_als(data, iterations, lmbda, features):
    user_size, item_size = data.shape
    
    
    X = sparse.csr_matrix(np.random.normal(size = (user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size = (item_size, features)))
    
    beta = np.zeros((1, user_size))
    gamma = np.zeros((item_size, 1))
    
    for _ in tqdm(range(iterations)):
        for i in range(item_size):
            r_i = data[:, i].T.toarray()
            r_i_beta = r_i - beta
            
            X = (sparse.hstack([np.ones((user_size, 1)), X]))
            Y = sparse.csr_matrix(sparse.hstack([gamma, Y]))     
            
            Y[i] = splg.inv(X.T @ X + lmbda * sparse.eye(features+1)) @ X.T @ r_i_beta.T
            
        for u in range(user_size):
            
            print('test')
            
            r_u = data[u, :].toarray()
            
            r_u_gamma = r_u - gamma
            
            Y = sparse.hstack([np.ones((1, item_size)), Y])
            
            X = (sparse.hstack([beta, X.T])).T
            
            X[u] = (Y.T @ Y + lmbda*I)**-1 @ Y.T @ r_u_gamma
            
    return X, Y

In [35]:
def biased_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):

    assert type(sparse_data) == sparse.csr_matrix


    # Calculate the foncidence for each value in our data
    confidence = sparse_data * alpha_val
    
    # Get the size of user rows and item columns
    user_size, item_size = sparse_data.shape
    
    # We create the user vectors X of size users-by-features, the item vectors
    # Y of size items-by-features and randomly assign the values.
    X = sparse.csr_matrix(np.random.normal(size = (user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size = (item_size, features)))
    
    #Precompute I and lambda * I
    X_I = sparse.eye(user_size)
    Y_I = sparse.eye(item_size)
    
    I = sparse.eye(features)
    lI = lambda_val * I
    
    for i in tqdm(range(iterations)):
        
        # Precompute Y-transpose-Y and X-transpose-X
        yTy = Y.T @ Y
        xTx = X.T @ X

        # Loop through all users
        for u in range(user_size):

            # Get the user row.
            u_row = confidence[u,:].toarray() 

            # Calculate the binary preference p(u)
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0

            # Calculate Cu and Cu - I
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I

            # Put it all together and compute the final formula
            yT_CuI_y = Y.T @ CuI @ Y
            yT_Cu_pu = Y.T @ Cu @ p_u.T
            X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)

    
        for i in range(item_size):

            # Get the item column and transpose it.
            i_row = confidence[:,i].T.toarray()

            # Calculate the binary preference p(i)
            p_i = i_row.copy()
            p_i[p_i != 0] = 1.0

            # Calculate Ci and Ci - I
            CiI = sparse.diags(i_row, [0])
            Ci = CiI + X_I

            # Put it all together and compute the final formula
            xT_CiI_x = X.T @ CiI @ X
            xT_Ci_pi = X.T @ Ci @ p_i.T
            Y[i] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

    return X, Y

In [8]:
df_ratings = pd.read_csv('Final project/ml-1m/ratings.dat', sep="::", header=None)
df_ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

In [9]:
df_ratings.drop('timestamp', axis='columns', inplace=True)
df_ratings.dropna(inplace=True)

In [10]:
df_ratings['movie_num'] = df_ratings['movie_id'].astype("category").cat.codes
item_lookup = df_ratings[['movie_num', 'movie_id']].drop_duplicates()
item_lookup['movie_num'] = item_lookup.movie_num.astype(str)

In [11]:
users = list(np.sort(df_ratings.user_id.unique()))
movies = list(np.sort(df_ratings.movie_num.unique()))
ratings = list(df_ratings.rating)

In [12]:
rows = df_ratings.user_id.astype(int)
cols = df_ratings.movie_num.astype(int)

In [13]:
data_sparse = sparse.csr_matrix(
    (ratings, (rows-1, cols)),
    shape=(len(users), len(movies)))

In [34]:
user_vecs, item_vecs = biased_als(data_sparse, iterations=20, features=20, lmbda=10)

  0%|          | 0/20 [00:00<?, ?it/s]

6040


  warn('spsolve is more efficient when sparse b '
  self._set_arrayXarray(i, j, x)


ValueError: inconsistent shapes