In [1]:
import pandas as pd
import numpy as np
import random
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse.linalg as splin
import scipy.sparse as sparse
import json

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm

from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import ndcg_score as ndcg
from sklearn.metrics import recall_score as recall
# from ignite.metrics.recall import Recall as t_recall
import torch

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# 1 million MovieLens 

df_ratings = pd.read_csv('ml-1m/ratings.dat', sep="::", header=None)
df_ratings.columns = ['user_id', 'object_id', 'rating', 'timestamp']
df_ratings.drop('timestamp', axis='columns', inplace=True)
df_ratings.dropna(inplace=True)
df_ratings['user_id'] = df_ratings['user_id'].astype("category").cat.codes
df_ratings['object_id'] = df_ratings['object_id'].astype("category").cat.codes
df_ratings

Unnamed: 0,user_id,object_id,rating
0,0,1104,5
1,0,639,3
2,0,853,3
3,0,3177,4
4,0,2162,5
...,...,...,...
1000204,6039,1019,1
1000205,6039,1022,5
1000206,6039,548,5
1000207,6039,1024,4


In [4]:
def form_sparse_matrix(df, rows='user_id', columns='object_id'):
    rows = df[rows].astype("int")
    cols = df[columns].astype("int")

    ratings = df.rating.astype("int")

    data_sparse = sparse.csr_matrix(
        (ratings, (rows, cols)), 
        shape=(
            df.user_id.nunique(), 
            df.object_id.nunique() 
        ), 
        dtype='int32'
    )

    return data_sparse

In [5]:
def train_test_split(df, user_col, item_col='object_id', test_size=0.5):
    random.seed(123)
    test_indices = []
    for user in df[user_col].unique():
        df1 = df[df[user_col] == user]
        test_ind = random.sample(df[item_col].index.tolist(), k=int(len(df1)*test_size))
        test_indices.extend(test_ind)
    test_data = df.iloc[test_indices]
    train_data = df.drop(test_indices, axis=0)
    return train_data, test_data

In [6]:
df_ratings['user_id'].unique()

array([   0,    1,    2, ..., 6037, 6038, 6039], dtype=int16)

In [7]:
train_data, test_data = train_test_split(df_ratings, 'user_id', item_col='object_id', test_size=0.5)
train_data.shape, test_data.shape

((607814, 3), (498623, 3))

In [8]:
train_data.shape[0] + test_data.shape[0]

1106437

In [9]:
print(
    set(test_data.index) & set(train_data.index), 
    (sorted(list(set(test_data.index) | set(train_data.index))) == df_ratings.index).sum() == len(df_ratings)
)

set() True


In [10]:
def create_sparse(df, user_col, item_col, rating_col):
    print(df[rating_col].shape, df[user_col].nunique(), df_ratings[item_col].nunique())
    data_sparse = sparse.csr_matrix(
        (
            df[rating_col].values, 
            (df[user_col].values, df[item_col].values)
        ), 
#         shape=(
#             df[user_col].nunique(), df_ratings[item_col].nunique()
#         )
    ) # почему -1????????

    return data_sparse

In [11]:
df_sparse = create_sparse(df_ratings, 'user_id', 'object_id', 'rating')
df_sparse

(1000209,) 6040 3706


<6040x3706 sparse matrix of type '<class 'numpy.int64'>'
	with 1000209 stored elements in Compressed Sparse Row format>

In [12]:
# for u in df_ratings['user_id']:
#     if len(df_ratings[df_ratings['user_id'] == u]) < 2:
#         print(u)

In [37]:
train_sparse = create_sparse(train_data, 'user_id', 'object_id', 'rating')
test_sparse = create_sparse(test_data, 'user_id', 'object_id', 'rating')
train_sparse, test_sparse

(607814,) 6040 3706
(498623,) 6040 3706


(<6040x3706 sparse matrix of type '<class 'numpy.int64'>'
 	with 607814 stored elements in Compressed Sparse Row format>,
 <6040x3706 sparse matrix of type '<class 'numpy.int64'>'
 	with 392395 stored elements in Compressed Sparse Row format>)

In [26]:
class iALS:
    def __init__(self, alpha_val=40, iterations=10, lambda_val=0.1, features=10):
        self.alpha_val = alpha_val
        self.iterations = iterations
        self.lambda_val = lambda_val
        self.features = features
    
    def fit(self, train_sparse_data):
        """ Implementation of Alternating Least Squares with implicit data. We iteratively
        compute the user (x_u) and item (y_i) from tor vectors using the following formulas:

        x_u = ((Y.T*Y + Y.T*(Cu - I) * Y) + lambda*I)^-1 * (X.T * Cu * p(u))
        y_i = ((X.T*X + X.T*(Ci - I) * X) + lambda*I)^-1 * (Y.T * Ci * p(i))

        Args:
            train_sparse_data (sparse.csr_matrix): Our sparse user-by-item matrix
            self.alpha_val (int): The rate in which we'll increase our confidence in a preference with more interactions
            self.iterations (int): How many times we alternate between fixing and updating our user and item vectors
            self.lambda_val (float): Regularization value
            self.features (int): How many latent self.features we want to compute

        Returns:     
            X (sparse.csr_matrix): user vectors of size users-by-self.features
            Y (sparse.csr_matrix): item vectors of size items-by-self.features
         """

        assert type(train_sparse_data) == sparse.csr_matrix, "Matrix should be sparse in format of csr"


        # Calculate the foncidence for each value in our data
        confidence = train_sparse_data * self.alpha_val

        # Get the size of user rows and item columns
        user_size, item_size = train_sparse_data.shape

        # We create the user vectors X of size users-by-self.features, the item vectors
        # Y of size items-by-self.features and randomly assign the values.
        X = sparse.csr_matrix(np.random.normal(size = (user_size, self.features)))
        Y = sparse.csr_matrix(np.random.normal(size = (item_size, self.features)))

        # Precompute I and lambda * I
        X_I = sparse.eye(user_size)
        Y_I = sparse.eye(item_size)

        I = sparse.eye(self.features)
        lI = self.lambda_val * I

        for i in tqdm(range(self.iterations)):
            # Precompute Y-transpose-Y and X-transpose-X
            yTy = Y.T @ Y
            xTx = X.T @ X

            # Loop through all users
            for u in range(user_size):

                # Get the user row.
                u_row = confidence[u,:].toarray() 

                # Calculate the binary preference p(u)
                p_u = u_row.copy()
                p_u[p_u != 0] = 1.0

                # Calculate Cu and Cu - I
                CuI = sparse.diags(u_row, [0])
                Cu = CuI + Y_I

                # Put it all together and compute the final formula
                yT_CuI_y = Y.T @ CuI @ Y
                yT_Cu_pu = Y.T @ Cu @ p_u.T
                X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)


            for i in range(item_size):
                # Get the item column and transpose it.
                i_row = confidence[:,i].T.toarray()

                # Calculate the binary preference p(i)
                p_i = i_row.copy()
                p_i[p_i != 0] = 1.0

                # Calculate Ci and Ci - I
                CiI = sparse.diags(i_row, [0])
                Ci = CiI + X_I

                # Put it all together and compute the final formula
                xT_CiI_x = X.T @ CiI @ X
                xT_Ci_pi = X.T @ Ci @ p_i.T
                Y[i] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

        return X, Y
    
    def predict(self, user_id, data_sparse, user_vecs, item_vecs, item_lookup):
        """Recommend items for a given user given a trained model

        Args:
            user_id (int): The id of the user we want to create recommendations for
            data_sparse (sparse.csr_matrix): Our original training data
            user_vecs (sparse.csr_matrix): The trained user x self.features vectors
            item_vecs (sparse.csr_matrix): The trained item x self.features vectors
            item_lookup (pd.DataFrame): Used to map artist ids to artist names
            num_items (int): How many recommendations we want to return:

        Returns:
            recommendations (pandas.DataFrame): DataFrame with num_items artist names and scores

        """

        # Get all interactions by the user
        user_interactions = data_sparse[user_id,:].toarray()

        # We don't want to recommend items the user has consumed. So let's
        # set them all to 0 and the unknowns to 1.
        user_interactions = user_interactions.reshape(-1) + 1 #Reshape to turn into 1D array
        user_interactions[user_interactions > 1] = 0

        # This is where we calculate the recommendation by taking the 
        # dot-product of the user vectors with the item vectors.
        rec_vector = (user_vecs[user_id,:] @ item_vecs.T).toarray()

        # Let's scale our scores between 0 and 1 to make it all easier to interpret.
        min_max = MinMaxScaler()
        rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:,0]
        recommend_vector = user_interactions * rec_vector_scaled

        # Get all the artist indices in order of recommendations (descending) and
        # select only the top "num_items" items. 
        item_idx = np.argsort(recommend_vector)[::-1]

        movies = []
        scores = []

        # Loop through our recommended artist indicies and look up the actial artist name
        for idx in item_idx:
            movies.append(item_lookup.object_id.loc[item_lookup.movie_num == str(idx)].iloc[0])
            scores.append(recommend_vector[idx])

        # Create a new dataframe with recommended artist names and scores
        recommendations = pd.DataFrame({'movies': movies, 'score': scores})

        return recommendations

In [27]:
model = iALS(iterations=1)

In [28]:
user_vecs, item_vecs = model.fit(train_sparse)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [29]:
df_ratings['movie_num'] = df_ratings['object_id'].astype("category").cat.codes
item_lookup = df_ratings[['movie_num', 'object_id']].drop_duplicates()
item_lookup['movie_num'] = item_lookup.movie_num.astype(str)

In [30]:
# rec = model.predict(1, test_sparse, user_vecs, item_vecs, item_lookup)

In [32]:
def recall(actual, predicted, k):
    act_set = set(actual)
    pred_set = set(predicted[:k])
    result = len(act_set & pred_set) / float(len(act_set))
    return result

In [39]:
all_users = df_ratings.user_id.unique()

rec20_list = []
rec50_list = []
NDCG_list = []

for user_id in tqdm(all_users[:100]):
    recommendations = model.predict(user_id, test_sparse, user_vecs, item_vecs, item_lookup)
    dense_ratings_user = test_data[(test_data['user_id'] == user_id) & (test_data['rating'] > 0)]

    compilation = pd.merge(dense_ratings_user, recommendations, left_on='object_id', right_on='movies')
    compilation['score']  = compilation.score*5
    compilation['score_round'] = round(compilation.score).astype(int)

    rec20 = recall(compilation.rating.values, compilation.score_round.values, k=20)
    rec50 = recall(compilation.rating.values, compilation.score_round.values, k=50)
    NDCG = ndcg(compilation.rating.values.reshape((1, -1)), compilation.score_round.values.reshape((1, -1)), k=100)

    rec20_list.append(rec20)
    rec50_list.append(rec50)
    NDCG_list.append(NDCG)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [40]:
np.array(rec20_list).mean(), np.array(rec50_list).mean(), np.array(NDCG_list).mean()

(0.0, 0.0, 0.9063585988048031)