In [1]:
# Task:
# An SVD Recommender that predicts the rating a user will give to a movie
# based on the user's own ratings and other users' rating data.

# Use only 'rating' as the data, avoid 'tags' and 'genre'

# 80/20, train/test split. Additionally, do a temporal split. 


In [20]:
# imports
import pandas as pd
from numpy.linalg import svd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
import math
import random
import copy

In [55]:
def draw_ascii_percentage_bar(value):
    filled_length = int(value *  100)
    empty_length = 100 - filled_length

    bar = '[' + '%' * filled_length + ']'
    #  + '_' * empty_length
    
    print(bar, end="")

In [4]:
# read data
movies = 'data/movielens-latest-small/movies.csv'
ratings = 'data/movielens-latest-small/ratings.csv'

# to dataframes
df_movies = pd.read_csv(movies)
df_ratings = pd.read_csv(ratings)

# inspect them
display('Movies')
display(df_movies.head())
display('Ratings')
display(df_ratings.head())

'Movies'

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


'Ratings'

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# 80/20, train/test split
df_ratings_x = df_ratings[['userId', 'movieId', 'timestamp']]
df_ratings_y = df_ratings[['rating', 'timestamp']]

x_train, x_test, y_train, y_test = train_test_split(df_ratings_x, df_ratings_y, test_size=0.2, random_state=1)
print(f"Training rows = {x_train.shape[0]}")
print(f"Testing rows = {x_test.shape[0]}")

#display(x_train.head())
#display(x_test.head())
#display(y_train.head())
#display(y_test.head())

# temporal split
tscv = TimeSeriesSplit(n_splits=2, test_size=20000)
for i, (train_index, test_index) in enumerate(tscv.split(x_train, y_train)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

print(f"Training rows temporal split = {train_index.shape[0]}")
print(f"Testing rows temporal split = {test_index.shape[0]}")

Training rows = 80668
Testing rows = 20168
Fold 0:
  Train: index=[    0     1     2 ... 40665 40666 40667]
  Test:  index=[40668 40669 40670 ... 60665 60666 60667]
Fold 1:
  Train: index=[    0     1     2 ... 60665 60666 60667]
  Test:  index=[60668 60669 60670 ... 80665 80666 80667]
Training rows temporal split = 60668
Testing rows temporal split = 20000


In [6]:
# Consider reviews from users with more than 50 reviews
#usercount = df_ratings[['movieId','userId']].groupby('userId').count()
#display(usercount.head())

In [7]:
# Source for SVD stuff: https://machinelearningmastery.com/using-singular-value-decomposition-to-build-a-recommender-system/
# Build a pivot table with movieIds as columns 
# and users and their ratings as rows
rating_matrix = df_ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)
display(rating_matrix.head())
matrix = rating_matrix.values
display(matrix)


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [8]:
# Singular value decomposition
u, s, vh = np.linalg.svd(matrix, full_matrices=False)
# We know that the columns of vh are movies
# The rows of u are users

In [179]:
def cosine_similarity(v,u):
    return (v @ u)/ (np.linalg.norm(v) * np.linalg.norm(u))

def rmse_function(v, u):
    return np.sqrt(np.mean((v - u) ** 2))

def squared_distance(v, u):
    return np.sum((v - u) ** 2)

In [225]:
# K-means RMSE Clustering 
# select k random data points (user indexes in u) as initial cluster centers C_1, ..., C_k

def initialize_centroids(data_points, k, seed = None):
    if seed is not None:
        random.seed(seed)
        
    # Select k random indexes from the data points list
    centroid_indexes = random.sample(range(data_points.shape[0]), k)
    
    # Construct the list of centroids using the selected indexes
    centroids = [copy.deepcopy(data_points[i]) for i in centroid_indexes]
    
    return centroids

def assign_point_to_centroid(point, centroid_index, assignments):
    if centroid_index not in assignments:
        assignments[centroid_index] = [point]
    else:
        assignments[centroid_index].append(point)

def update_centroid_position(centroid, new_position):
    centroid[:] = new_position

def k_means(u, k, max_iterations, seed, threshold):
    assignments = {}
    num_datapoints = u.shape[0]
    
    centroids = initialize_centroids(u, k, seed)

    for iteration in range(1, max_iterations):
        #print()
        #print(" Iteration ", i, end="")

        assignments = {}
        non_converged_centroids = k
        
        # for each p (user) in u, map p_i to its nearest cluster center C_j 
        for i, p in enumerate(u):
            # Find the closest centroid to the current data point
            #closest_centroid_index, _ = max(enumerate(centroids), key=lambda c: cosine_similarity(p, c[1]))
            closest_centroid_index, _ = min(enumerate(centroids), key=lambda c: rmse_function(p, c[1]))

            assign_point_to_centroid(p, closest_centroid_index, assignments)
                
        for index, centroid in enumerate(centroids):
            # Update centroid position by taking the mean of assigned data points
            
            assigned_points = assignments[index]  # get all data points assigned to a centroid
            if assigned_points:
                
                #draw_ascii_percentage_bar(len(assigned_points) / num_datapoints)
                #print(" C", index, ", ", len(assigned_points), " points")
                                
                new_position = np.mean(assigned_points)
                 # calculate Euclidean distance
                distance = np.linalg.norm(centroid - new_position)
                
                if distance < threshold:
                    non_converged_centroids -= 1

                update_centroid_position(centroid, new_position)
            
        if (non_converged_centroids == 0):
            print("iter: ", iteration, " ******* CONVERGED ******* ", end="")
            break
    
    total_rmse = 0
    for centroid_index, data_points in assignments.items():
        squared_distances = 0

        for point_index, point in enumerate(data_points):
            # calculate squared distancs of each point to its centroid
            squared_distances += squared_distance(point, centroids[centroid_index])
        
        # divide the sum of squared distances with the number of points in the centroid
        mse = squared_distances / len(centroids[centroid_index])

        # take the square root of mse to get rmse
        rmse = np.sqrt(mse)
        total_rmse += rmse
    
    # Calculate the average RMSE
    average_rmse = total_rmse / len(assignments)
    print("average_rmse: ", average_rmse)
    
    return centroids

In [230]:
max_iterations = 50
threshold = 1e-5

# Find converging start conditions, and lowest rmse
for seed in range(0, 1):
    print("seed:",seed," ",end="")
    
    for k in range(3, 10): 
        print("k:",k," ",end="")
        k_means(u, k, max_iterations, seed, threshold)

seed: 0  k: 3  iter:  14  ******* CONVERGED ******* average_rmse:  0.5716044437630686
k: 4  iter:  23  ******* CONVERGED ******* average_rmse:  0.4907281521163362
k: 5  iter:  14  ******* CONVERGED ******* average_rmse:  0.4379535751328219
k: 6  iter:  18  ******* CONVERGED ******* average_rmse:  0.40126028937180963
k: 7  iter:  31  ******* CONVERGED ******* average_rmse:  0.3695225069029808
k: 8  iter:  21  ******* CONVERGED ******* average_rmse:  0.34903025500229257
k: 9  iter:  40  ******* CONVERGED ******* average_rmse:  0.3254265732691311


In [231]:
max_iterations = 50
threshold = 1e-5

# use specific start conditions
k = 7
seed = 0
centroids = k_means(u, k, max_iterations, seed, threshold)


iter:  31  ******* CONVERGED ******* average_rmse:  0.3695225069029808


In [None]:
# Evaluate predictions

# for each user, predict rating for every movie

# first, find closest centroid
# then give prediction based on the centroid

# check centroid's distance from actual, if review exists

def find_closest_centroid(data_point, centroids):
    closest_centroid_distance = pd.inf
    closest_centroid_index = None
    
    for centroid_index, centroid in enumerate(centroids):
        distance = squared_distance(data_point, centroid)
        if distance < closest_centroid_distance:
            closest_centroid_distance = distance
            closest_centroid_index = centroid_index
            
    return closest_centroid_index

for row in range(0, u.shape[0]):
    closest_centroid = find_closest_centroid(u[row])
    
    for col in range(0, u.shape[1]):
        


In [None]:
# Evaluate predictions

# for each user, predict rating for every movie
# check distance from actual, if review exists
total_error = 0

for row in range(0, u.shape[0]):
    user_total_error_squared = 0
    user_evaluations = 0
    
    for col in range(0, vh.shape[1]):
        predicted_rating = predict_rating(user_index = row, movie_index = col, ratings_matrix = matrix, u = u)
        actual_rating = matrix[row][col]
        if actual_rating != 0:
            user_evaluations += 1
            user_total_error_squared += (predicted_rating - actual_rating)**2
    
    user_total_error = math.sqrt(user_total_error_squared)
    print("User total error:", user_total_error)
    print("User evaluations:", user_evaluations)
    
print("Total error:", total_error)

In [None]:
# Generate similarities to all others users for a single user
def get_similarities(user_index, u):
    similarities = []
    
    for row in range(0, u.shape[0]):
        #if row != user_index:
        similarity  = cosine_similarity(u[user_index, :], u[row, :])
        similarities.append(similarity)
    
    return similarities

similarities = get_similarities(0, u)
#display(similarities)

# Predict a rating using the U matrix from the SVD operation
def predict_rating(user_index, movie_index, ratings_matrix, u, k=5):
    # Get the ratings for the target user and the movie
    target_user_ratings = ratings_matrix[user_index, :]
    movie_ratings = ratings_matrix[:, movie_index]

    # Calculate cosine similarity between the target user and all other users
    # 1-d matrix of length users
    similarities  = get_similarities(user_index, u)
    
    # Sort the similarities and get the indices of the nearest neighbors
    nearest_neighbors = np.argsort(similarities)[::-1][1::]  # Exclude the target user
    
    # Calculate weighted average of ratings from nearest neighbors
    weighted_ratings = 0
    total_similarity = 0
    for neighbor_index in nearest_neighbors:
        neighbor_similarity = similarities[neighbor_index]
        neighbor_rating = ratings_matrix[neighbor_index, movie_index]
        if neighbor_rating != 0:  # Ignore if neighbor hasn't rated the movie
            weighted_ratings += neighbor_similarity * neighbor_rating
            total_similarity += neighbor_similarity
    
    # Predict the rating for the target user
    if total_similarity != 0:
        predicted_rating = weighted_ratings / total_similarity
    else:
        predicted_rating = 0  # In case none of the nearest neighbors have rated the movie
    
    return predicted_rating

predicted_rating = predict_rating(user_index = 0, movie_index = 0, ratings_matrix = matrix, u = u)
print("Predicted rating:", predicted_rating)

User total error: 16.05560880831214
User evaluations: 42


KeyboardInterrupt: 

In [None]:
# Find the highest similarity

lowest_similarity = np.inf
highest_similarity = -np.inf
highest_sim_col = -1
for col in range(1,vh.shape[1]):
    similarity = cosine_similarity(vh[:,0], vh[:,col])
    if similarity > highest_similarity:
        highest_similarity = similarity
        highest_sim_col = col
    if similarity < lowest_similarity:
        lowest_similarity = similarity

print("highest_similarity is %s" % (highest_similarity))
print("lowest_similarity is %s" % (lowest_similarity))

print("Column %d (movie id %s) is most similar to column 0 (movie id %s)" %
        (highest_sim_col, rating_matrix.columns[col], rating_matrix.columns[0])
)

In [None]:
# Predict a rating
def predict_rating(user_id, movie_id, ratings_matrix, k=5):
    # Get the ratings for the target user and the movie
    target_user_ratings = ratings_matrix[user_id, :]
    movie_ratings = ratings_matrix[:, movie_id]
    
    predicted_rating = 0
    total_similarity = 0
    
     # Calculate cosine similarity between the target user and all other users
    for i, user_ratings in enumerate(ratings_matrix):
        if i == user_id:
            continue  # Skip the target user
        
        similarity = cosine_similarity(target_user_ratings, user_ratings)
        neighbor_rating = ratings_matrix[i, movie_id]
        
        if neighbor_rating != 0:  # Ignore if neighbor hasn't rated the movie
            predicted_rating += similarity * neighbor_rating
            total_similarity += similarity

    if total_similarity != 0:
        predicted_rating /= total_similarity
    
    return predicted_rating

# Example usage
# Assuming ratings_matrix is your pivot matrix where rows represent users and columns represent movies
# and user_id and movie_id are valid indices in your matrix
predicted_rating = predict_rating(user_id = 0, movie_id = 0, ratings_matrix = matrix)
print("Predicted rating:", predicted_rating)