In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

# Reading dataset (MovieLens 1M movie ratings dataset: downloaded from https://grouplens.org/datasets/movielens/1m/)
# The files are read without a header row; column names are supplied explicitly
# and the multi-character '::' separator is parsed by the python engine.
data = pd.read_csv('data/ratings.dat',
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter='::', encoding='ISO-8859-1')
movie_data = pd.read_csv('data/movies.dat',
    names=['movie_id', 'title', 'genre'],
    engine='python', delimiter='::', encoding='ISO-8859-1')

# Creating the rating matrix (rows as movies, columns as users)
# np.zeros (not np.ndarray, which leaves memory uninitialised) so that every
# movie/user pair without a rating is exactly 0.
ratings_mat = np.zeros(
    shape=(np.max(data.movie_id.values), np.max(data.user_id.values)),
    dtype=np.uint8)
# Ids are 1-based in the dataset, matrix indexes are 0-based
ratings_mat[data.movie_id.values-1, data.user_id.values-1] = data.rating.values

# Normalizing the matrix (subtract each movie's mean rating off its row)
mean_ratings = np.mean(ratings_mat, axis=1)
normalised_mat = ratings_mat - mean_ratings[:, np.newaxis]

# Performance Optimization: Using Truncated SVD
# random_state is fixed because the default solver is randomized; without it
# the factors (and the recommendations) change from run to run.
svd = TruncatedSVD(n_components=50, random_state=42)  # We can adjust n_components for better performance
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)  # Transpose for SVD (rows as users, columns as movies)
# NOTE: fit_transform returns the reduced data, i.e. the left singular vectors
# already multiplied by the singular values (U * Sigma), not the bare U.
U = svd.fit_transform(A)
S = svd.singular_values_
V = svd.components_  # Right singular vectors, shape (n_components, number of movies)

# Function to find the movie_id from the movie name
def get_movie_id(movie_name):
    """Return the id of the first movie whose title contains ``movie_name``.

    The lookup is a case-insensitive *literal* substring search against the
    ``title`` column of the module-level ``movie_data`` frame.

    Parameters
    ----------
    movie_name : str
        Full or partial movie title. Surrounding whitespace is ignored.

    Returns
    -------
    The ``movie_id`` value of the first matching row, or ``None`` (after
    printing "Movie not found!") when nothing matches or the name is blank.
    """
    query = movie_name.strip()
    if query:
        # regex=False: titles contain regex metacharacters such as "(1994)" or
        # "M*A*S*H"; interpreted as a pattern they would fail to match the
        # literal title or match unrelated ones.
        is_match = movie_data.title.str.contains(query, case=False, na=False, regex=False)
        movie_id_row = movie_data[is_match]
    else:
        # A blank query is a substring of every title; treat it as "no match"
        # instead of silently returning the first movie in the table.
        movie_id_row = movie_data.iloc[0:0]

    if not movie_id_row.empty:
        return movie_id_row.movie_id.values[0]  # Return the first match

    print("Movie not found!")
    return None

# Function to calculate the cosine similarity (sorting by most similar and returning the top N)
def top_cosine_similarity(data, movie_id, top_n=10):
    """Return the row indexes of the ``top_n`` rows most similar to a movie.

    Parameters
    ----------
    data : 2-D numpy array
        One row per movie; row ``i`` is used as the vector of movie id ``i + 1``.
    movie_id : int
        1-based movie id of the query movie.
    top_n : int, default 10
        Number of indexes to return.

    Returns
    -------
    numpy.ndarray
        0-based row indexes ordered by decreasing cosine similarity with the
        query row. An empty integer array is returned (after printing a
        message) when the query row has zero magnitude, so callers can always
        do array arithmetic such as ``result + 1`` on the return value.
    """
    index = movie_id - 1  # Movie id starts from 1 in the dataset
    movie_row = data[index, :]
    # Row-wise Euclidean norms: sqrt of the sum of squares along each row
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))

    # Cosine similarity is undefined for a zero vector
    if magnitude[index] == 0:
        print("Magnitude is zero for the movie, cannot compute similarity.")
        # Same type as the normal return value (np.argsort yields intp indexes)
        return np.array([], dtype=np.intp)

    # Other rows may still have zero magnitude; silence the resulting
    # divide-by-zero warnings and clean the values up afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)

    # Replace NaN with zeros for similarity values
    similarity = np.nan_to_num(similarity)

    # Negate so that argsort's ascending order becomes "most similar first"
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]


# Function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    """Print the title of a movie followed by the titles of similar movies.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Frame with at least the columns ``movie_id`` and ``title``.
    movie_id : int
        Id of the movie the recommendations were computed for.
    top_indexes : sequence or numpy array of int
        0-based indexes of the recommended movies; each index ``i`` is
        printed as the movie with id ``i + 1``. May be empty.

    Indexes whose id has no row in ``movie_data`` are skipped.
    """
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    # np.asarray so that plain lists (including an empty one) support the
    # vectorised "+ 1" that converts 0-based indexes to 1-based movie ids.
    for similar_id in np.asarray(top_indexes, dtype=int) + 1:
        titles = movie_data.loc[movie_data.movie_id == similar_id, 'title'].values
        if len(titles) == 0:
            # No row for this id in movie_data: nothing to print.
            continue
        print(titles[0])

# User Input Flexibility: the movie to look up is typed in by the user
input_movie_name = input("Enter the movie name for recommendations: ")
movie_id = get_movie_id(input_movie_name)

# get_movie_id returns None when no title matched, so only continue on a hit
if movie_id is not None:
    top_n = 10
    # The rows of V.T are handed over as the per-movie vectors
    indexes = top_cosine_similarity(data=V.T, movie_id=movie_id, top_n=top_n)

    # Show the titles behind the returned indexes
    print_similar_movies(movie_data, movie_id, top_indexes=indexes)


Recommendations for Shawshank Redemption, The (1994): 

Shawshank Redemption, The (1994)
Good Will Hunting (1997)
Silence of the Lambs, The (1991)
Dead Man Walking (1995)
Schindler's List (1993)
October Sky (1999)
Some Folks Call It a Sling Blade (1993)
Fall (1997)
Misérables, Les (1998)
Sling Blade (1996)


Calculating reconstruction error (MAE), accuracy, precision and recall

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Binarize the ratings: Relevant (1) if rating >= threshold, else not relevant (0)
def binarize(matrix, threshold=4):
    """Map every entry of ``matrix`` to 1 when it is at least ``threshold``, else 0.

    ``matrix`` is a numpy array; the result is an integer array of the same shape.
    """
    is_relevant = matrix >= threshold
    return is_relevant.astype(int)

# Calculate metrics
def calculate_metrics(original_matrix, reconstructed_matrix, threshold=4):
    """Compare a reconstructed rating matrix with the original ratings.

    Entries are compared position by position after flattening, so both
    matrices must have the same shape and orientation. Only positions whose
    original value is greater than 0 are evaluated.

    Parameters
    ----------
    original_matrix : array-like
        Observed ratings; values <= 0 are excluded from the evaluation.
    reconstructed_matrix : array-like
        Predicted ratings with the same layout as ``original_matrix``.
    threshold : number, default 4
        A rating counts as relevant (1) when it is >= ``threshold``.

    Returns
    -------
    tuple
        ``(error, accuracy, precision, recall)`` where ``error`` is the mean
        absolute error on the evaluated entries and the other three are the
        scikit-learn scores of the binarized ratings.
    """
    # Flatten both matrices to compare all entries
    original_flat = np.asarray(original_matrix).ravel()
    reconstructed_flat = np.asarray(reconstructed_matrix).ravel()

    # Filter only non-zero ratings in the original matrix.
    # The cast to float happens after filtering (to keep the copy small) and
    # prevents wrap-around in the subtraction below when both inputs are
    # unsigned integers (e.g. uint8: 3 - 5 would become 254).
    non_zero_indices = original_flat > 0
    original_flat = original_flat[non_zero_indices].astype(np.float64)
    reconstructed_flat = reconstructed_flat[non_zero_indices].astype(np.float64)

    # Binarize the ratings
    original_binary = binarize(original_flat, threshold)
    reconstructed_binary = binarize(reconstructed_flat, threshold)

    # Calculate metrics
    error = np.mean(np.abs(original_flat - reconstructed_flat))  # Mean Absolute Error (MAE)
    accuracy = accuracy_score(original_binary, reconstructed_binary)
    # zero_division=0: report 0 instead of warning when a score is undefined
    precision = precision_score(original_binary, reconstructed_binary, zero_division=0)
    recall = recall_score(original_binary, reconstructed_binary, zero_division=0)

    return error, accuracy, precision, recall

# Reconstruct the ratings matrix
# U comes from svd.fit_transform and is already scaled by the singular values,
# so the low-rank approximation of A is U @ V; multiplying by diag(S) again
# would apply the singular values twice.
approx_scaled = np.dot(U, V)  # rows as users, columns as movies (same layout as A)
# Undo the preprocessing that produced A so the result is comparable with
# ratings_mat: transpose back to movies x users, revert the
# 1 / sqrt(n_movies - 1) scaling and add the per-movie mean back.
reconstructed_matrix = (approx_scaled.T * np.sqrt(ratings_mat.shape[0] - 1)
                        + mean_ratings[:, np.newaxis])

# Calculate metrics
error, accuracy, precision, recall = calculate_metrics(ratings_mat, reconstructed_matrix)

# Print results
print(f"Error (MAE): {error:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


NameError: name 'U' is not defined