In [1]:
# Task:
# An SVD Recommender that predicts the rating a user will give to a movie
# based on the user's own ratings and other users' rating data.

# Use only 'rating' as the data, avoid 'tags' and 'genre'

# 80/20, train/test split. Additionally, do a temporal split. 


In [16]:
# imports
import pandas as pd
from numpy.linalg import svd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
import math
import random
import copy

In [17]:
def draw_ascii_percentage_bar(value):
    filled_length = int(value *  100)
    empty_length = 100 - filled_length

    bar = '[' + '%' * filled_length + ']'
    #  + '_' * empty_length
    
    print(bar, end="")

In [18]:
# read data
ratings = 'data/movielens-latest-small/ratings.csv'

# to dataframes
df_ratings = pd.read_csv(ratings)

# inspect them
display('Ratings')
display(df_ratings.head())

'Ratings'

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [173]:
# 80/20, train/test split
df_ratings_x = df_ratings[['userId', 'movieId', 'timestamp']]
df_ratings_y = df_ratings[['rating', 'timestamp']]

x_train, x_test, y_train, y_test = train_test_split(df_ratings_x, df_ratings_y, test_size=0.2, random_state=1)
print(f"Training rows = {x_train.shape[0]}")
print(f"Testing rows = {x_test.shape[0]}")

print('x_train')
display(x_train.head())
print('x_test')
display(x_test.head())
print('y_train')
display(y_train.head())
print('y_test')
display(y_test.head())

# temporal split
tscv = TimeSeriesSplit(n_splits=2, test_size=20000)
for i, (train_index, test_index) in enumerate(tscv.split(x_train, y_train)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

print(f"Training rows temporal split = {train_index.shape[0]}")
print(f"Testing rows temporal split = {test_index.shape[0]}")



Training rows = 80668
Testing rows = 20168
x_train


Unnamed: 0,userId,movieId,timestamp
78834,489,3827,1333232046
7523,51,1639,1230929619
66122,425,5349,1106482551
69250,448,3793,1019125762
95606,600,3821,1237762470


x_test


Unnamed: 0,userId,movieId,timestamp
32974,225,380,949111600
83568,533,103688,1424754155
19537,125,149902,1476224175
38287,263,1183,941591735
59543,387,1982,1187927324


y_train


Unnamed: 0,rating,timestamp
78834,2.0,1333232046
7523,5.0,1230929619
66122,3.0,1106482551
69250,4.0,1019125762
95606,1.0,1237762470


y_test


Unnamed: 0,rating,timestamp
32974,3.0,949111600
83568,5.0,1424754155
19537,2.0,1476224175
38287,5.0,941591735
59543,3.0,1187927324


Fold 0:
  Train: index=[    0     1     2 ... 40665 40666 40667]
  Test:  index=[40668 40669 40670 ... 60665 60666 60667]
Fold 1:
  Train: index=[    0     1     2 ... 60665 60666 60667]
  Test:  index=[60668 60669 60670 ... 80665 80666 80667]
Training rows temporal split = 60668
Testing rows temporal split = 20000


Unnamed: 0,userId,movieId,rating,timestamp
78834,489,3827,2.0,1333232046
7523,51,1639,5.0,1230929619
66122,425,5349,3.0,1106482551
69250,448,3793,4.0,1019125762
95606,600,3821,1.0,1237762470


Unnamed: 0,userId,movieId,rating,timestamp
32974,225,380,3.0,949111600
83568,533,103688,5.0,1424754155
19537,125,149902,2.0,1476224175
38287,263,1183,5.0,941591735
59543,387,1982,3.0,1187927324


In [19]:
# 80/20, train/test split
df_train, df_test = train_test_split(df_ratings, test_size=0.2, random_state=1)
display(df_train.head())
display(df_test.head())

all_movie_ids = np.sort(df_ratings['movieId'].unique())

display(all_movie_ids)
display(all_movie_ids.shape)

Unnamed: 0,userId,movieId,rating,timestamp
78834,489,3827,2.0,1333232046
7523,51,1639,5.0,1230929619
66122,425,5349,3.0,1106482551
69250,448,3793,4.0,1019125762
95606,600,3821,1.0,1237762470


Unnamed: 0,userId,movieId,rating,timestamp
32974,225,380,3.0,949111600
83568,533,103688,5.0,1424754155
19537,125,149902,2.0,1476224175
38287,263,1183,5.0,941591735
59543,387,1982,3.0,1187927324


array([     1,      2,      3, ..., 193585, 193587, 193609], dtype=int64)

(9724,)

In [20]:
# Consider reviews from users with more than 50 reviews
#usercount = df_ratings[['movieId','userId']].groupby('userId').count()
#display(usercount.head())

In [21]:
# Build a pivot table with movieIds as columns 
# and users as rows

def create_pivot_from_df(df, index, columns, values, reindex_columns):
    # Create the pivot matrix with the training data
    rating_matrix = df.pivot(index=index, columns=columns, values=values)

    # Reindex the columns of the pivot matrix to include all movie IDs
    rating_matrix = rating_matrix.reindex(columns=reindex_columns, fill_value=0)

    # Fill NaN values with 0
    rating_matrix = rating_matrix.fillna(0)
    
    display(rating_matrix.head())
    matrix = rating_matrix.values
    matrix = np.matrix(matrix)
    return matrix

matrix = create_pivot_from_df(df_train, "userId", "movieId", "rating", all_movie_ids)
test_matrix = create_pivot_from_df(df_test, "userId", "movieId", "rating", all_movie_ids)

display(matrix)
display(test_matrix)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0


matrix([[4., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 2., 2., ..., 0., 0., 0.],
        [3., 0., 0., ..., 0., 0., 0.],
        [5., 0., 0., ..., 0., 0., 0.]])

matrix([[0. , 0. , 4. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        ...,
        [2.5, 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [22]:
# Singular value decomposition. Source: https://www.kaggle.com/code/vincentman0403/recommendation-example-by-svd
U, S, VT = np.linalg.svd(matrix, full_matrices=False)

V = VT.T
Sigma = np.diag(S)
print('matrix shape = ', matrix.shape)
print('U shape = ', U.shape)
print('VT shape = ', VT.shape)
print('S shape = ', S.shape)
print('Sigma = \n', Sigma)

# Use first 2 singular values
r = 2

# Get approximate U, Sigma, VT
Ur = U[:, :r]
Sr = Sigma[:r, :r]
Vr = V[:, :r]
print('Ur(matrix of user to latent factor), shape = ', Ur.shape)
print('Sr(matrix of singular values), shape = ', Sr.shape)
print('Vr(matrix of item to latent factor), shape = ', Vr.shape)

Sr = Sigma[:r, :r]
Vr = V[:, :r]

matrix shape =  (610, 9724)
U shape =  (610, 610)
VT shape =  (610, 9724)
S shape =  (610,)
Sigma = 
 [[430.73584462   0.           0.         ...   0.           0.
    0.        ]
 [  0.         189.33366799   0.         ...   0.           0.
    0.        ]
 [  0.           0.         156.70901141 ...   0.           0.
    0.        ]
 ...
 [  0.           0.           0.         ...   3.43523313   0.
    0.        ]
 [  0.           0.           0.         ...   0.           3.2973898
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    3.06286843]]
Ur(matrix of user to latent factor), shape =  (610, 2)
Sr(matrix of singular values), shape =  (2, 2)
Vr(matrix of item to latent factor), shape =  (9724, 2)


In [23]:
eps = 1.0e-6

def cosine_similarity(v,u):
    return (v @ u.T)/ (np.linalg.norm(v) * np.linalg.norm(u) + eps)

def rmse_function(v, u):
    return np.sqrt(np.mean((v - u) ** 2))
    
def squared_distance(v, u):
    v = np.array(v)
    u = np.array(u)
    return np.sum((v[0] - u[0]) ** 2)

def manhattan_distance(v, u):
    return np.sum(np.abs(v - u))

In [24]:
# K-means RMSE Clustering 
# select k random data points (user indexes in u) as initial cluster centers C_1, ..., C_k

def initialize_centroids(data_points, k, seed = None):
    if seed is not None:
        random.seed(seed)
        
    # Select k random indexes from the data points list
    centroid_indexes = random.sample(range(data_points.shape[0]), k)
    
    # Construct the list of centroids using the selected indexes
    centroids = [copy.deepcopy(data_points[i]) for i in centroid_indexes]
    
    return centroids

def assign_point_to_centroid(user_index, point, centroid_index, assignments):
    if centroid_index not in assignments:
        assignments[centroid_index] = [(user_index, point)]
    else:
        assignments[centroid_index].append((user_index, point))

def update_centroid_position(centroid, new_position):
    centroid[:] = new_position
    
def average_point(points):
    # Extract the points (user points) from the tuples
    user_points = [point[1] for point in points]
    
    # Convert the list of points to a NumPy array
    points_array = np.array(user_points)

    # Compute the mean along each dimension
    average = np.mean(points_array, axis=0)
    
    return average

def k_means(u, k, max_iterations, seed, threshold):
    assignments = {}
    num_datapoints = u.shape[0]

    centroids = initialize_centroids(u, k, seed)
    
    for iteration in range(1, max_iterations):
        assignments = {}
        
        for index, _ in enumerate(centroids):
            assignments[index] = []
            
        non_converged_centroids = copy.deepcopy(k)
        
        # for each p (user) in u, map p_i to its nearest cluster center C_j 
        for user_index, p in enumerate(u):
            # Find the closest centroid to the current data point            
            closest_centroid_index, _ = max(enumerate(centroids), key=lambda c: cosine_similarity(p, c[1]))
            
            assign_point_to_centroid(user_index, p, closest_centroid_index, assignments)
        
        for index, centroid in enumerate(centroids):
            # Update centroid position by taking the mean of assigned data points
            
            assigned_points = assignments[index]  # get all data points assigned to a centroid
                
            if assigned_points:
                new_position = average_point(assigned_points)
                 # calculate Euclidean distance

                distance = np.linalg.norm(centroid - new_position)
                
                if distance < threshold:
                    non_converged_centroids -= 1

                update_centroid_position(centroid, new_position)
            
        if (non_converged_centroids == 0):
            print("iter: ", iteration, " ******* CONVERGED ******* ", end="")
            break
    
    total_rmse = 0
    for centroid_index, data_points in assignments.items():
        squared_distances = 0

        for point_index, point in enumerate(data_points):
            # calculate squared distancs of each point to its centroid
            squared_distances += squared_distance(point[1], centroids[centroid_index])
        
        # divide the sum of squared distances with the number of points in the centroid
        mse = squared_distances / len(centroids[centroid_index])

        # take the square root of mse to get rmse
        rmse = np.sqrt(mse)
        total_rmse += rmse
    
    # Calculate the average RMSE
    average_rmse = total_rmse / len(assignments)
    print("average_rmse: ", average_rmse)
    
    centroids_and_assignments = np.array([(centroids[i], assignments[i]) for i in range(len(centroids))], dtype=object)

    return centroids_and_assignments

In [25]:
max_iterations = 50
threshold = 1e-5

# use specific start conditions
k = 7
seed = 0

centroids_and_assignments = k_means(Ur, k, max_iterations, seed, threshold)

iter:  34  ******* CONVERGED ******* average_rmse:  0.3735153729727942


In [26]:
# Generate similarities to all others users for a single user
def get_similarities(target_point, neighbors):
    similarities = []

    for index, neighbor in enumerate(neighbors):
        similarity = cosine_similarity(target_point, neighbor[1])
        similarities.append((neighbor, similarity))
    
    return similarities

def predict_rating(centroid_and_assignment, movie_index, ratings_matrix, k=5):
    
    # Calculate cosine similarity between the centroid and its neighbors
    similarities = get_similarities(centroid_and_assignment[0], centroid_and_assignment[1])

    # Calculate weighted average of ratings from nearest neighbors
    weighted_ratings = 0
    total_similarity = 0
    for neighbor_index, similarity in enumerate(similarities):

        neighbor_similarity = similarity[1]
        neighbor_rating = ratings_matrix[similarity[0][0], movie_index]
        
        if neighbor_rating != 0:  # Ignore if neighbor hasn't rated the movie
            weighted_ratings += neighbor_similarity * neighbor_rating
            total_similarity += neighbor_similarity
            
            #print("neighbor_similarity: ", neighbor_similarity)
            #print("neighbor_rating: ", neighbor_rating)
            #print("weighted rating: ", neighbor_similarity * neighbor_rating)
            #print("total_similarity: ", total_similarity)
            #print()
            #break
    
    # Predict the rating for the target user
    if total_similarity != 0:
        predicted_rating = weighted_ratings / total_similarity
    else:
        predicted_rating = 0  # In case none of the nearest neighbors have rated the movie
    
    return float(predicted_rating)

In [27]:
# Evaluate predictions

# for each user, predict rating for every movie

# first, find closest centroid
# then give prediction based on the centroid

# check centroid's distance from actual, if review exists

def find_closest_centroid(data_point, centroids):
    closest_centroid_distance = np.inf
    closest_centroid_index = None
    
    for centroid_index, centroid in enumerate(centroids):
        distance = squared_distance(data_point, centroid)

        if distance < closest_centroid_distance:
            closest_centroid_distance = distance
            closest_centroid_index = centroid_index
            
    return closest_centroid_index


In [59]:
# evaluate 
def evaluate_set(set, centroids_and_assignments):
    total_error = 0
    movie_index = 0 
    rating_array = np.array(rating_matrix)
    
    user_total_squared_error = 0
    user_evaluations = 0
    
    centroids = [centroid for centroid, _ in centroids_and_assignments]
    
    for p_index, p in enumerate(set):

        p_result = p * Vr * np.linalg.inv(Sr)
        #print('Vector of new user to latent factor = ', p_result)
        
        closest_centroid_index = find_closest_centroid(p_result, centroids)
        #print("closest_centroid_index:", closest_centroid_index)

        predicted_rating = predict_rating(centroids_and_assignments[closest_centroid_index], movie_index = movie_index, ratings_matrix = matrix)
        #print("Predicted rating:", predicted_rating)
        
        actual_rating = rating_array[p_index][movie_index]
        #print("actual_rating:", actual_rating)
        
        if actual_rating != 0:
            user_evaluations += 1
            user_total_squared_error += (predicted_rating - actual_rating)**2
            
    mse = user_total_squared_error / user_evaluations
    
    rmse = np.sqrt(mse)
    
    print("rmse:", rmse)
    print("User evaluations:", user_evaluations)
            
evaluate_set(matrix, centroids_and_assignments)
evaluate_set(test_matrix, centroids_and_assignments)

rmse: 0.8412821610950078
User evaluations: 171
rmse: 0.8544288294656619
User evaluations: 170


In [165]:

def evaluate_one():
    # Vector of new user to review
    #random.seed()
    new = np.full((1, matrix.shape[1]), 0)
    new = np.matrix(new)

    newresult = new * Vr * np.linalg.inv(Sr)
    print('Vector of new user to latent factor = ', newresult)
    
    centroids = [centroid for centroid, _ in centroids_and_assignments]

    closest_centroid_index = find_closest_centroid(newresult, centroids)
    print("closest_centroid_index:", closest_centroid_index)

    predicted_rating = predict_rating(centroids_and_assignments[closest_centroid_index], movie_index = 0, ratings_matrix = matrix)
    print("Predicted rating:", predicted_rating)    

evaluate_one()

Vector of new user to latent factor =  [[0. 0.]]


[matrix([[-0.02903582, -0.00939806]]),
 matrix([[-0.01701947, -0.03251141]]),
 matrix([[-0.04072543,  0.00136136]]),
 matrix([[-0.03797469,  0.02126807]]),
 matrix([[-0.02161634, -0.02758304]]),
 matrix([[-0.02859334, -0.02220109]]),
 matrix([[-0.02863435,  0.04412635]])]

closest_centroid_index: 0
Predicted rating: 3.834541041428917


In [None]:
max_iterations = 50
threshold = 1e-5

# Find converging start conditions, and lowest rmse
for seed in range(0, 1):
    print("seed:",seed," ",end="")
    
    for k in range(3, 10): 
        print("k:",k," ",end="")
        k_means(u, k, max_iterations, seed, threshold)