In [222]:
import random
import pandas as pd
import numpy as np

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import ndcg_score as ndcg
from sklearn.metrics import recall_score as recall
# from ignite.metrics.recall import Recall as t_recall
import torch


def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):
    """ Implementation of Alternating Least Squares with implicit data. We iteratively
    compute the user (x_u) and item (y_i)from tor vectors using the following formulas:
 
    x_u = ((Y.T*Y + Y.T*(Cu - I) * Y) + lambda*I)^-1 * (X.T * Cu * p(u))
    y_i = ((X.T*X + X.T*(Ci - I) * X) + lambda*I)^-1 * (Y.T * Ci * p(i))
 
    Args:
        sparse_data (csr_matrix): Our sparse user-by-item matrix
 
        alpha_val (int): The rate in which we'll increase our confidence
        in a preference with more interactions.
 
        iterations (int): How many times we alternate between fixing and 
        updating our user and item vectors
 
        lambda_val (float): Regularization value
 
        features (int): How many latent features we want to compute.
    
    Returns:     
        X (csr_matrix): user vectors of size users-by-features
        
        Y (csr_matrix): item vectors of size items-by-features
     """

    assert type(sparse_data) == sparse.csr_matrix, "Matrix should be sparse in format of csr"


    # Calculate the foncidence for each value in our data
    confidence = sparse_data * alpha_val
    
    # Get the size of user rows and item columns
    user_size, item_size = sparse_data.shape
    
    # We create the user vectors X of size users-by-features, the item vectors
    # Y of size items-by-features and randomly assign the values.
    X = sparse.csr_matrix(np.random.normal(size = (user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size = (item_size, features)))
    
    #Precompute I and lambda * I
    X_I = sparse.eye(user_size)
    Y_I = sparse.eye(item_size)
    
    I = sparse.eye(features)
    lI = lambda_val * I
    
    for i in tqdm(range(iterations)):
        
        # Precompute Y-transpose-Y and X-transpose-X
        yTy = Y.T @ Y
        xTx = X.T @ X

        # Loop through all users
        for u in range(user_size):

            # Get the user row.
            u_row = confidence[u,:].toarray() 

            # Calculate the binary preference p(u)
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0

            # Calculate Cu and Cu - I
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I

            # Put it all together and compute the final formula
            yT_CuI_y = Y.T @ CuI @ Y
            yT_Cu_pu = Y.T @ Cu @ p_u.T
            X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)

    
        for i in xrange(item_size):

            # Get the item column and transpose it.
            i_row = confidence[:,i].T.toarray()

            # Calculate the binary preference p(i)
            p_i = i_row.copy()
            p_i[p_i != 0] = 1.0

            # Calculate Ci and Ci - I
            CiI = sparse.diags(i_row, [0])
            Ci = CiI + X_I

            # Put it all together and compute the final formula
            xT_CiI_x = X.T @ CiI @ X
            xT_Ci_pi = X.T @ Ci @ p_i.T
            Y[i] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

    return X, Y

In [63]:
df_ratings = pd.read_csv('ml-1m/ratings.dat', sep="::", header=None)
#df_ratings.columns = "UserID::MovieID::Rating::Timestamp".split("::") 
df_ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df_ratings.head()

  return func(*args, **kwargs)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [217]:
!python -m pip install ignite

Defaulting to user installation because normal site-packages is not writeable
Collecting ignite
  Downloading ignite-1.1.0-py2.py3-none-any.whl (4.5 kB)
Installing collected packages: ignite
Successfully installed ignite-1.1.0




In [64]:
df_ratings.drop('timestamp', axis='columns', inplace=True)
df_ratings.dropna(inplace=True)

вот тут неочевидная херня с movie_num<br>
movie_id - это айдишник, но некоторых не хватает, и поэтому последний (3900+) получается больше, чем длина матрицы (3700+), и чтобы херни не было, мы записываем в num айдишник айдишника

In [65]:
df_ratings['movie_num'] = df_ratings['movie_id'].astype("category").cat.codes
item_lookup = df_ratings[['movie_num', 'movie_id']].drop_duplicates()
item_lookup['movie_num'] = item_lookup.movie_num.astype(str)

In [66]:
users = list(np.sort(df_ratings.user_id.unique()))
movies = list(np.sort(df_ratings.movie_num.unique()))
ratings = list(df_ratings.rating)

In [67]:
rows = df_ratings.user_id.astype(int)
cols = df_ratings.movie_num.astype(int)

In [68]:
data_sparse = sparse.csr_matrix((ratings, (rows-1, cols)), shape=(len(users), len(movies)))

In [71]:
user_vecs, item_vecs = implicit_als(data_sparse, iterations=20, features=20, alpha_val=40)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [17:10<00:00, 51.53s/it]


In [72]:
user_vecs

<6040x20 sparse matrix of type '<class 'numpy.float64'>'
	with 120800 stored elements in Compressed Sparse Row format>

In [73]:
import json

In [78]:
sparse.save_npz('user_vecs.npz', user_vecs)
sparse.save_npz('item_vecs.npz', item_vecs)
# with open('item_vecs.json','w') as f:
#     json.dump(item_vecs, f)

In [182]:
def recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup):
    """Recommend items for a given user given a trained model
    
    Args:
        user_id (int): The id of the user we want to create recommendations for.
        
        data_sparse (csr_matrix): Our original training data.
        
        user_vecs (csr_matrix): The trained user x features vectors
        
        item_vecs (csr_matrix): The trained item x features vectors
        
        item_lookup (pandas.DataFrame): Used to map artist ids to artist names
        
        num_items (int): How many recommendations we want to return:
        
    Returns:
        recommendations (pandas.DataFrame): DataFrame with num_items artist names and scores
    
    """
  
    # Get all interactions by the user
    user_interactions = data_sparse[user_id,:].toarray()

    # We don't want to recommend items the user has consumed. So let's
    # set them all to 0 and the unknowns to 1.
    user_interactions = user_interactions.reshape(-1) + 1 #Reshape to turn into 1D array
    user_interactions[user_interactions > 1] = 0

    # This is where we calculate the recommendation by taking the 
    # dot-product of the user vectors with the item vectors.
    rec_vector = (user_vecs[user_id,:] @ item_vecs.T).toarray()

    # Let's scale our scores between 0 and 1 to make it all easier to interpret.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:,0]
    recommend_vector = user_interactions * rec_vector_scaled
   
    # Get all the artist indices in order of recommendations (descending) and
    # select only the top "num_items" items. 
    item_idx = np.argsort(recommend_vector)[::-1]

    movies = []
    scores = []

    # Loop through our recommended artist indicies and look up the actial artist name
    for idx in item_idx:
        movies.append(item_lookup.movie_id.loc[item_lookup.movie_num == str(idx)].iloc[0])
        scores.append(recommend_vector[idx])

    # Create a new dataframe with recommended artist names and scores
    recommendations = pd.DataFrame({'movies': movies, 'score': scores})
    
    return recommendations

In [183]:
# Let's generate and print our recommendations
user_id = 2023
recommendations = recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup)
print(recommendations)

      movies     score
0       2111  1.000000
1       3536  0.975370
2       2195  0.971656
3       3189  0.966057
4       1425  0.963888
...      ...       ...
3701    3129  0.000000
3702    3516  0.000000
3703     330  0.000000
3704    3524  0.000000
3705    2858  0.000000

[3706 rows x 2 columns]


In [186]:
dense_ratings_2023 = df_ratings[(df_ratings['user_id'] == 2023)& (df_ratings['rating'] > 0)]

In [190]:
compilation = dense_ratings_2023.join(recommendations, on='movie_id', how='inner')
compilation.score = compilation.score*5
compilation

Unnamed: 0,user_id,movie_id,rating,movie_num,movies,score
344705,2023,2987,4,2775,980,1.518134
344707,2023,2054,3,1873,1426,2.175180
344708,2023,720,5,689,3593,3.676643
344709,2023,1257,4,1165,2057,3.005665
344710,2023,2997,5,2785,3724,1.508944
...,...,...,...,...,...,...
344986,2023,569,4,555,3929,3.863135
344987,2023,1240,3,1148,3483,3.021868
344988,2023,1242,5,1150,2607,3.020942
344989,2023,2048,3,1867,519,2.180248


In [191]:
mae(compilation.rating, compilation.score)

1.6518961366489642

In [208]:
compilation['true'] = compilation.rating > 3
compilation['pred'] = compilation.score > 3
compilation['score_round'] = round(compilation.score).astype(int)
compilation

Unnamed: 0,user_id,movie_id,rating,movie_num,movies,score,true,pred,score_round
344705,2023,2987,4,2775,980,1.518134,True,False,2
344707,2023,2054,3,1873,1426,2.175180,False,False,2
344708,2023,720,5,689,3593,3.676643,True,True,4
344709,2023,1257,4,1165,2057,3.005665,True,True,3
344710,2023,2997,5,2785,3724,1.508944,True,False,2
...,...,...,...,...,...,...,...,...,...
344986,2023,569,4,555,3929,3.863135,True,True,4
344987,2023,1240,3,1148,3483,3.021868,False,True,3
344988,2023,1242,5,1150,2607,3.020942,True,True,3
344989,2023,2048,3,1867,519,2.180248,False,False,2


In [236]:
def recallTop(y_true, y_pred, rank=[10, 20, 30]):
    outer = []
    for x in range(len(y_pred)):
        pred_value = torch.tensor(y_pred[x])
        true_value = torch.tensor(y_true[x])
        pred_value = torch.round(pred_value)
        TP = torch.sum(torch.logical_and(true_value == 1, pred_value))   # True positives (predictions)
        inner = []
        for i in rank:
            TP_k = torch.sum(torch.logical_and(pred_value[:, :i] == 1, true_value[:, :i]))  # True positives @top 10, 20, 30
            inner.append(TP_k)
        avg = torch.div(torch.tensor(inner), TP)
        avg[torch.isnan(avg)] = 0
        outer.append(avg.tolist())

    return (np.array(outer)).mean(axis=0)

In [237]:
# torch.sum(torch.logical_and(pred_value[:, :i] == 1, true_value[:, :i]))  # True positives @top 10, 20, 30
true = list(compilation.true)
pred = list(compilation.pred)


print('recall', recallTop(true, pred))
print('ndcg', ndcg(compilation.rating, compilation.score_round))

RuntimeError: "round" "_vml_cpu" not implemented for 'Bool'

In [202]:
compilation[compilation['score']==0]

Unnamed: 0,user_id,movie_id,rating,movie_num,movies,score,true,pred
344802,2023,3471,2,3238,2335,0.0,False,False
344804,2023,3481,5,3248,2040,0.0,True,False
344806,2023,3489,3,3256,2012,0.0,False,False
344809,2023,3499,3,3266,3244,0.0,False,False
344814,2023,3668,2,3426,345,0.0,False,False
344816,2023,3672,3,3430,1269,0.0,False,False
344817,2023,3676,3,3434,357,0.0,False,False
344823,2023,3698,4,3456,1380,0.0,True,False
344941,2023,3578,4,3341,562,0.0,True,False


In [125]:
test_for_user(1517, 100, mse)

[0.08831605228206749, 32]

In [141]:
rand_users = random.sample(range(df_ratings.user_id.max()), 10)
print(rand_users)

[963, 4198, 2912, 3792, 4670, 3232, 1688, 45, 34, 2442]


In [178]:
def get_error(user_id, movie_id, rating_pred, error_func):
    rating_true = df_ratings[(df_ratings['user_id']==user_id) & (df_ratings['movie_id']==movie_id)]['rating'].iloc[0]
    print('rating from data:', rating_true)
    if len(rating_true) == 0:
        return
    else:
        try:
            return error_func(rating_true, rating_pred)
        except ValueError:
            return int((rating_true > 3 and rating_pred > 3) or (rating_true > 3 and rating_pred < 3))

In [179]:
# scores = []
# for u in rand_users:
#     scores.append(test_for_user(u, 250, ndcg))
# scores
# print('average err:', np.mean([s[0] for s in scores]))
# print('average hits per 250:', np.mean([s[1] for s in scores]))

In [180]:
user_id = 2912
df = recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup, num_items=100)
df

Unnamed: 0,movies,score
0,321,1.000000
1,1794,0.973006
2,568,0.971898
3,523,0.971413
4,30,0.971007
...,...,...
95,2442,0.915295
96,231,0.914656
97,1289,0.914624
98,2725,0.914349


In [181]:
errors = []
for movie, score in df.iterrows():
    errors.append(get_error(user_id, movie, score*5, mse))
errors

IndexError: single positional indexer is out-of-bounds

In [None]:
df_ratings[(df_ratings['user_id']==2912) & (df_ratings['movie_id']==movie_id)]['rating']