In [18]:
import random
import pandas as pd
import numpy as np

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import ndcg_score as ndcg
from sklearn.metrics import recall_score as recall
# from ignite.metrics.recall import Recall as t_recall
import torch


def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):
    """ Implementation of Alternating Least Squares with implicit data. We iteratively
    compute the user (x_u) and item (y_i)from tor vectors using the following formulas:
 
    x_u = ((Y.T*Y + Y.T*(Cu - I) * Y) + lambda*I)^-1 * (X.T * Cu * p(u))
    y_i = ((X.T*X + X.T*(Ci - I) * X) + lambda*I)^-1 * (Y.T * Ci * p(i))
 
    Args:
        sparse_data (csr_matrix): Our sparse user-by-item matrix
 
        alpha_val (int): The rate in which we'll increase our confidence
        in a preference with more interactions.
 
        iterations (int): How many times we alternate between fixing and 
        updating our user and item vectors
 
        lambda_val (float): Regularization value
 
        features (int): How many latent features we want to compute.
    
    Returns:     
        X (csr_matrix): user vectors of size users-by-features
        
        Y (csr_matrix): item vectors of size items-by-features
     """

    assert type(sparse_data) == sparse.csr_matrix, "Matrix should be sparse in format of csr"


    # Calculate the foncidence for each value in our data
    confidence = sparse_data * alpha_val
    
    # Get the size of user rows and item columns
    user_size, item_size = sparse_data.shape
    
    # We create the user vectors X of size users-by-features, the item vectors
    # Y of size items-by-features and randomly assign the values.
    X = sparse.csr_matrix(np.random.normal(size = (user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size = (item_size, features)))
    
    #Precompute I and lambda * I
    X_I = sparse.eye(user_size)
    Y_I = sparse.eye(item_size)
    
    I = sparse.eye(features)
    lI = lambda_val * I
    
    for i in tqdm(range(iterations)):
        
        # Precompute Y-transpose-Y and X-transpose-X
        yTy = Y.T @ Y
        xTx = X.T @ X

        # Loop through all users
        for u in range(user_size):

            # Get the user row.
            u_row = confidence[u,:].toarray() 

            # Calculate the binary preference p(u)
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0

            # Calculate Cu and Cu - I
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I

            # Put it all together and compute the final formula
            yT_CuI_y = Y.T @ CuI @ Y
            yT_Cu_pu = Y.T @ Cu @ p_u.T
            X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)

    
        for i in range(item_size):

            # Get the item column and transpose it.
            i_row = confidence[:,i].T.toarray()

            # Calculate the binary preference p(i)
            p_i = i_row.copy()
            p_i[p_i != 0] = 1.0

            # Calculate Ci and Ci - I
            CiI = sparse.diags(i_row, [0])
            Ci = CiI + X_I

            # Put it all together and compute the final formula
            xT_CiI_x = X.T @ CiI @ X
            xT_Ci_pi = X.T @ Ci @ p_i.T
            Y[i] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

    return X, Y

In [10]:
df_ratings = pd.read_csv("./Final project/ml-1m/ratings.dat", sep="::", header=None)
#df_ratings.columns = "UserID::MovieID::Rating::Timestamp".split("::") 
df_ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df_ratings.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
# !python -m pip install ignite

In [12]:
df_ratings.drop('timestamp', axis='columns', inplace=True)
df_ratings.dropna(inplace=True)

вот тут неочевидная херня с movie_num<br>
movie_id - это айдишник, но некоторых не хватает, и поэтому последний (3900+) получается больше, чем длина матрицы (3700+), и чтобы херни не было, мы записываем в num айдишник айдишника

In [13]:
df_ratings['movie_num'] = df_ratings['movie_id'].astype("category").cat.codes
item_lookup = df_ratings[['movie_num', 'movie_id']].drop_duplicates()
item_lookup['movie_num'] = item_lookup.movie_num.astype(str)

In [14]:
users = list(np.sort(df_ratings.user_id.unique()))
movies = list(np.sort(df_ratings.movie_num.unique()))
ratings = list(df_ratings.rating)

In [15]:
rows = df_ratings.user_id.astype(int)
cols = df_ratings.movie_num.astype(int)

In [16]:
data_sparse = sparse.csr_matrix((ratings, (rows-1, cols)), shape=(len(users), len(movies)))

In [20]:
user_vecs, item_vecs = implicit_als(data_sparse, iterations=1, features=20, alpha_val=40)

100%|██████████| 1/1 [01:12<00:00, 72.25s/it]


In [21]:
user_vecs

<6040x20 sparse matrix of type '<class 'numpy.float64'>'
	with 120800 stored elements in Compressed Sparse Row format>

In [22]:
import json

In [23]:
sparse.save_npz('user_vecs.npz', user_vecs)
sparse.save_npz('item_vecs.npz', item_vecs)
# with open('item_vecs.json','w') as f:
#     json.dump(item_vecs, f)

In [24]:
def recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup):
    """Recommend items for a given user given a trained model
    
    Args:
        user_id (int): The id of the user we want to create recommendations for.
        
        data_sparse (csr_matrix): Our original training data.
        
        user_vecs (csr_matrix): The trained user x features vectors
        
        item_vecs (csr_matrix): The trained item x features vectors
        
        item_lookup (pandas.DataFrame): Used to map artist ids to artist names
        
        num_items (int): How many recommendations we want to return:
        
    Returns:
        recommendations (pandas.DataFrame): DataFrame with num_items artist names and scores
    
    """
  
    # Get all interactions by the user
    user_interactions = data_sparse[user_id,:].toarray()

    # We don't want to recommend items the user has consumed. So let's
    # set them all to 0 and the unknowns to 1.
    user_interactions = user_interactions.reshape(-1) + 1 #Reshape to turn into 1D array
    user_interactions[user_interactions > 1] = 0

    # This is where we calculate the recommendation by taking the 
    # dot-product of the user vectors with the item vectors.
    rec_vector = (user_vecs[user_id,:] @ item_vecs.T).toarray()

    # Let's scale our scores between 0 and 1 to make it all easier to interpret.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:,0]
    recommend_vector = user_interactions * rec_vector_scaled
   
    # Get all the artist indices in order of recommendations (descending) and
    # select only the top "num_items" items. 
    item_idx = np.argsort(recommend_vector)[::-1]

    movies = []
    scores = []

    # Loop through our recommended artist indicies and look up the actial artist name
    for idx in item_idx:
        movies.append(item_lookup.movie_id.loc[item_lookup.movie_num == str(idx)].iloc[0])
        scores.append(recommend_vector[idx])

    # Create a new dataframe with recommended artist names and scores
    recommendations = pd.DataFrame({'movies': movies, 'score': scores})
    
    return recommendations

In [25]:
# Let's generate and print our recommendations
user_id = 2023
recommendations = recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup)
print(recommendations)

      movies     score
0       1193  0.786367
1       1270  0.782152
2       1196  0.740509
3       1127  0.739115
4       3175  0.732281
...      ...       ...
3701     356  0.000000
3702     357  0.000000
3703    2081  0.000000
3704    2080  0.000000
3705    1057  0.000000

[3706 rows x 2 columns]


In [26]:
dense_ratings_2023 = df_ratings[(df_ratings['user_id'] == 2023)& (df_ratings['rating'] > 0)]

In [27]:
compilation = dense_ratings_2023.join(recommendations, on='movie_id', how='inner')
compilation.score = compilation.score*5
compilation

Unnamed: 0,user_id,movie_id,rating,movie_num,movies,score
344705,2023,2987,4,2775,998,0.776334
344707,2023,2054,3,1873,2790,0.820144
344708,2023,720,5,689,222,1.115161
344709,2023,1257,4,1165,3299,0.924053
344710,2023,2997,5,2785,1087,0.775334
...,...,...,...,...,...,...
344986,2023,569,4,555,1633,1.231195
344987,2023,1240,3,1148,1216,0.927884
344988,2023,1242,5,1150,68,0.927292
344989,2023,2048,3,1867,343,0.820373


In [29]:
compilation['score_round'] = round(compilation.score).astype(int)
compilation

Unnamed: 0,user_id,movie_id,rating,movie_num,movies,score,score_round
344705,2023,2987,4,2775,998,0.776334,1
344707,2023,2054,3,1873,2790,0.820144,1
344708,2023,720,5,689,222,1.115161,1
344709,2023,1257,4,1165,3299,0.924053,1
344710,2023,2997,5,2785,1087,0.775334,1
...,...,...,...,...,...,...,...
344986,2023,569,4,555,1633,1.231195,1
344987,2023,1240,3,1148,1216,0.927884,1
344988,2023,1242,5,1150,68,0.927292,1
344989,2023,2048,3,1867,343,0.820373,1


In [47]:
print('NDCG rounded scores:', ndcg(compilation.rating.values.reshape((1, -1)), compilation.score_round.values.reshape((1, -1)), k=100))
print('NDCG not rounded scores:', ndcg(compilation.rating.values.reshape((1, -1)), compilation.score.values.reshape((1, -1)), k=100))

NDCG rounded scores: 0.8118322362733045
NDCG not rounded scores: 0.7995145118612135


In [48]:
def recall(actual, predicted, k):
    act_set = set(actual)
    pred_set = set(predicted[:k])
    result = len(act_set & pred_set) / float(len(act_set))
    return result

In [73]:
# print(
#     'Not rounded scores\nRecall@20: {:.3f}\nRecall@50: {:.3f}\nNDCG@100: {:.3f}'.format(
#         recall(compilation.rating.values, compilation.score.values, k=20),
#         recall(compilation.rating.values, compilation.score.values, k=50),
#         ndcg(compilation.rating.values.reshape((1, -1)), compilation.score.values.reshape((1, -1)), k=100)
#     )
# )

In [69]:
# print(
#     'Rounded scores\nRecall@20: {:.3f}\nRecall@50: {:.3f}\nNDCG@100: {:.3f}'.format(
#         recall(compilation.rating.values, compilation.score_round.values, k=20),
#         recall(compilation.rating.values, compilation.score_round.values, k=50),
#         ndcg(compilation.rating.values.reshape((1, -1)), compilation.score_round.values.reshape((1, -1)), k=100)
#     )
# )

Для **recall** надо брать **округлённые** скоры (иначе он будет выдавать 0),    
для **NDCG** - тоже **округлённые** (с не округлёнными скор хуже),    
для **MAE** - **<span style="color:red">НЕ</span> округлённые**

In [74]:
print(
    'MAE: {:.3f}\nRecall@20: {:.3f}\nRecall@50: {:.3f}\nNDCG@100: {:.3f}'.format(
        mae(compilation.rating, compilation.score),
        recall(compilation.rating.values, compilation.score_round.values, k=20),
        recall(compilation.rating.values, compilation.score_round.values, k=50),
        ndcg(compilation.rating.values.reshape((1, -1)), compilation.score_round.values.reshape((1, -1)), k=100)
    )
)

MAE: 2.949
Recall@20: 0.400
Recall@50: 0.400
NDCG@100: 0.812
