In [70]:
import random
import pandas as pd
import numpy as np

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm


def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):
    """ Implementation of Alternating Least Squares with implicit data. We iteratively
    compute the user (x_u) and item (y_i) vectors using the following formulas:
 
    x_u = ((Y.T*Y + Y.T*(Cu - I) * Y) + lambda*I)^-1 * (X.T * Cu * p(u))
    y_i = ((X.T*X + X.T*(Ci - I) * X) + lambda*I)^-1 * (Y.T * Ci * p(i))
 
    Args:
        sparse_data (csr_matrix): Our sparse user-by-item matrix
 
        alpha_val (int): The rate in which we'll increase our confidence
        in a preference with more interactions.
 
        iterations (int): How many times we alternate between fixing and 
        updating our user and item vectors
 
        lambda_val (float): Regularization value
 
        features (int): How many latent features we want to compute.
    
    Returns:     
        X (csr_matrix): user vectors of size users-by-features
        
        Y (csr_matrix): item vectors of size items-by-features
     """

    assert type(sparse_data) == sparse.csr_matrix, "Matrix should be sparse in format of csr"


    # Calculate the foncidence for each value in our data
    confidence = sparse_data * alpha_val
    
    # Get the size of user rows and item columns
    user_size, item_size = sparse_data.shape
    
    # We create the user vectors X of size users-by-features, the item vectors
    # Y of size items-by-features and randomly assign the values.
    X = sparse.csr_matrix(np.random.normal(size = (user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size = (item_size, features)))
    
    #Precompute I and lambda * I
    X_I = sparse.eye(user_size)
    Y_I = sparse.eye(item_size)
    
    I = sparse.eye(features)
    lI = lambda_val * I
    
    for i in tqdm(range(iterations)):
        
        # Precompute Y-transpose-Y and X-transpose-X
        yTy = Y.T @ Y
        xTx = X.T @ X

        # Loop through all users
        for u in range(user_size):

            # Get the user row.
            u_row = confidence[u,:].toarray() 

            # Calculate the binary preference p(u)
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0

            # Calculate Cu and Cu - I
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I

            # Put it all together and compute the final formula
            yT_CuI_y = Y.T @ CuI @ Y
            yT_Cu_pu = Y.T @ Cu @ p_u.T
            X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)

    
        for i in range(item_size):

            # Get the item column and transpose it.
            i_row = confidence[:,i].T.toarray()

            # Calculate the binary preference p(i)
            p_i = i_row.copy()
            p_i[p_i != 0] = 1.0

            # Calculate Ci and Ci - I
            CiI = sparse.diags(i_row, [0])
            Ci = CiI + X_I

            # Put it all together and compute the final formula
            xT_CiI_x = X.T @ CiI @ X
            xT_Ci_pi = X.T @ Ci @ p_i.T
            Y[i] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

    return X, Y

In [62]:
!dir

 Volume in drive C is OS
 Volume Serial Number is C4B8-271E

 Directory of C:\Users\Andrey\Desktop\NLA\Final\NLA-Final-Project\Final project

16.12.2021  00:54    <DIR>          .
16.12.2021  00:54    <DIR>          ..
15.12.2021  23:16    <DIR>          .ipynb_checkpoints
16.12.2021  00:54            12ÿ821 ALS.ipynb
15.12.2021  16:33    <DIR>          ml-100k
15.12.2021  16:33    <DIR>          ml-1m
15.12.2021  16:43            19ÿ132 Test-file.ipynb
15.12.2021  23:16             4ÿ468 Untitled.ipynb
               3 File(s)         36ÿ421 bytes
               5 Dir(s)  18ÿ055ÿ634ÿ944 bytes free


In [63]:
df_ratings = pd.read_csv('ml-1m/ratings.dat', sep="::", header=None)
#df_ratings.columns = "UserID::MovieID::Rating::Timestamp".split("::") 
df_ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df_ratings.head()

  return func(*args, **kwargs)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [64]:
df_ratings.drop('timestamp', axis='columns', inplace=True)
df_ratings.dropna(inplace=True)

вот тут неочевидная херня с movie_num<br>
movie_id - это айдишник, но некоторых не хватает, и поэтому последний (3900+) получается больше, чем длина матрицы (3700+), и чтобы херни не было, мы записываем в num айдишник айдишника

In [65]:
df_ratings['movie_num'] = df_ratings['movie_id'].astype("category").cat.codes
item_lookup = df_ratings[['movie_num', 'movie_id']].drop_duplicates()
item_lookup['movie_num'] = item_lookup.movie_num.astype(str)

In [66]:
users = list(np.sort(df_ratings.user_id.unique()))
movies = list(np.sort(df_ratings.movie_num.unique()))
ratings = list(df_ratings.rating)

In [67]:
rows = df_ratings.user_id.astype(int)
cols = df_ratings.movie_num.astype(int)

In [68]:
data_sparse = sparse.csr_matrix((ratings, (rows-1, cols)), shape=(len(users), len(movies)))

In [71]:
user_vecs, item_vecs = implicit_als(data_sparse, iterations=20, features=20, alpha_val=40)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [17:10<00:00, 51.53s/it]


In [72]:
user_vecs

<6040x20 sparse matrix of type '<class 'numpy.float64'>'
	with 120800 stored elements in Compressed Sparse Row format>

In [73]:
import json

In [78]:
sparse.save_npz('user_vecs.npz', user_vecs)
sparse.save_npz('item_vecs.npz', item_vecs)
# with open('item_vecs.json','w') as f:
#     json.dump(item_vecs, f)

In [83]:
def recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup, num_items=10):
    """Recommend items for a given user given a trained model
    
    Args:
        user_id (int): The id of the user we want to create recommendations for.
        
        data_sparse (csr_matrix): Our original training data.
        
        user_vecs (csr_matrix): The trained user x features vectors
        
        item_vecs (csr_matrix): The trained item x features vectors
        
        item_lookup (pandas.DataFrame): Used to map artist ids to artist names
        
        num_items (int): How many recommendations we want to return:
        
    Returns:
        recommendations (pandas.DataFrame): DataFrame with num_items artist names and scores
    
    """
  
    # Get all interactions by the user
    user_interactions = data_sparse[user_id,:].toarray()

    # We don't want to recommend items the user has consumed. So let's
    # set them all to 0 and the unknowns to 1.
    user_interactions = user_interactions.reshape(-1) + 1 #Reshape to turn into 1D array
    user_interactions[user_interactions > 1] = 0

    # This is where we calculate the recommendation by taking the 
    # dot-product of the user vectors with the item vectors.
    rec_vector = (user_vecs[user_id,:] @ item_vecs.T).toarray()

    # Let's scale our scores between 0 and 1 to make it all easier to interpret.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:,0]
    recommend_vector = user_interactions  *rec_vector_scaled
   
    # Get all the artist indices in order of recommendations (descending) and
    # select only the top "num_items" items. 
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    movies = []
    scores = []

    # Loop through our recommended artist indicies and look up the actial artist name
    for idx in item_idx:
        movies.append(item_lookup.movie_id.loc[item_lookup.movie_num == str(idx)].iloc[0])
        scores.append(recommend_vector[idx])

    # Create a new dataframe with recommended artist names and scores
    recommendations = pd.DataFrame({'movies': movies, 'score': scores})
    
    return recommendations

   movies     score
0    2111  1.000000
1    3536  0.975370
2    2195  0.971656
3    3189  0.966057
4    1425  0.963888
5    2978  0.960507
6    1963  0.957773
7    3326  0.954957
8     449  0.951931
9    2463  0.951266


In [None]:
# Let's generate and print our recommendations
user_id = 2023
recommendations = recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup)
print(recommendations)

In [89]:
df_ratings[df_ratings['user_id'] == 2023].sort_values('rating', ascending=False)[:10][]

Unnamed: 0,user_id,movie_id,rating,movie_num
344877,2023,1573,5,1445
344766,2023,1610,5,1478
344798,2023,231,5,224
344868,2023,3160,5,2944
344796,2023,223,5,216
344795,2023,3439,5,3207
344870,2023,1704,5,1563
344792,2023,3437,5,3205
344872,2023,3317,5,3091
344790,2023,1682,5,1545


In [95]:
recommendations['rating'] = recommendations['score'] * 5
recommendations.head()

Unnamed: 0,movies,score,rating
0,2111,1.0,5.0
1,3536,0.97537,4.87685
2,2195,0.971656,4.858281
3,3189,0.966057,4.830285
4,1425,0.963888,4.819438


In [105]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

mae([5]*10, recommendations.rating)

0.17329775129668928

In [121]:
def test_for_user(user_id, item_num, err_function):
    """
    Estimate ALS accuracy
    
    Args:
        user_id (int) - user to predict movies to
        item_num(int) - number of items to predict
        err_function(function(list, list)->float) - a function to count an error, MAE, MSE, anything you like
    
    Returns: 
        mae error for als predictions
        length intersection of sets of movies user liked the most and the predicted ones
    """
    
    recommendations = recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup, num_items=item_num)
    recommendations['rating'] = recommendations['score'] * 5
    y_true = df_ratings[df_ratings['user_id'] == 2023].sort_values('rating', ascending=False)[:item_num]
    
    
    
    return [err_function(y_true['rating'], recommendations.rating), \
        len(set(y_true['movie_id']).intersection(recommendations['movies']))]

In [125]:
test_for_user(1517, 100, mse)

[0.08831605228206749, 32]

In [126]:
rand_users = random.sample(range(df_ratings.user_id.max()), 10)
print(rand_users)

[2610, 3972, 4797, 3418, 2625, 1398, 4132, 1680, 4344, 5075]


In [139]:
scores = []
for u in rand_users:
    scores.append(test_for_user(u, 250, mse))
scores
print('average err:', np.mean([s[0] for s in scores]))
print('average hits per 250:', np.mean([s[1] for s in scores]))

average err: 0.45542303278359625
average hits per 250: 70.5


In [140]:
70.5*100/250

28.2

In [None]:
percent of hits