# Recommendation System

---

In [1]:
# to auto reload any updated py files
%load_ext autoreload
%autoreload 2

1. Download dataset

In [2]:
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q ml-100k.zip

--2025-06-07 03:15:52--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2025-06-07 03:15:52 (24.8 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [3]:
%%writefile data_loader.py

# load ratings & movie data
import pandas as pd


def load_ratings(path="ml-100k/u.data"):
  '''load file with user ratings'''
  return pd.read_csv(path, sep='\t', header=None,
                     names=["user_id", "movie_id", "rating", "timestamp"])

def load_movies(path='ml-100k/u.item'):
  '''load file with movie metadata'''
  return pd.read_csv(path, sep='|', encoding='latin-1', header=None,
                     names=["movie_id", "title", "release_date", "video_release_date",
                            "IMDb_URL"] + [f"genre_{i}" for i in range(19)])

def build_user_item_matrix(ratings_df):
  ''' pivot to user-item matrix with NaNs for missing values'''
  return ratings_df.pivot_table(index='user_id', columns='movie_id', values='rating')

def fill_missing_zero(matrix):
  '''filling NaNs with 0 for cosine similarity'''
  return matrix.fillna(0)

def center_ratings(matrix):
  '''returning mean centered ratings matrix (for pearson similarity)'''
  user_means = matrix.mean(axis=1)
  return matrix.sub(user_means, axis=0), user_means


Writing data_loader.py


In [4]:

#usage reference
from data_loader import load_ratings, load_movies, build_user_item_matrix, fill_missing_zero, center_ratings

ratings = load_ratings()
movies = load_movies()
user_item = build_user_item_matrix(ratings)
user_item_filled = fill_missing_zero(user_item)
user_item_centered, user_means = center_ratings(user_item)

In [5]:
# creating similarity.py

%%writefile similarity.py

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(matrix):
  ''' Computing the cosine similarity between users based on rating vectors.
      The input is a matrix with users as rows, movies as columns and no NaNs (filled with zeroes)'''

  similarity = cosine_similarity(matrix.values)
  return pd.DataFrame(similarity, index=matrix.index, columns=matrix.index)

def compute_pearson_similarity(centered_matrix):
    '''Computes the Pearson correlation between users on mean centered data..
       The input is a matrix with mean-centered ratings (NaNs allowed)'''
    return centered_matrix.T.corr(method='pearson')


def get_top_k_neighbors(similarity_matrix, user_id, k=5):
  ''' Get top k most similar users to a given user, excluding themselves.'''
  user_similarities = similarity_matrix.loc[user_id]
  top_k = user_similarities.drop(index=user_id).nlargest(k)
  return top_k


Writing similarity.py


In [6]:
# usage reference

from similarity import compute_cosine_similarity, compute_pearson_similarity, get_top_k_neighbors

# user cosine similarity
user_similarity_cosine = compute_cosine_similarity(user_item_filled)

# user pearson similarity
user_similarity_pearson = compute_pearson_similarity(user_item_centered)

# using pearson to get similar users to a certain other user
top_users = get_top_k_neighbors(user_similarity_pearson, user_id=1, k=5)

In [13]:
# creating predictor.py
%%writefile predictor.py

import numpy as np
from similarity import get_top_k_similar_users

def predict_rating_cosine(user_id, movie_id, rating_matrix, similarity_matrix):
  ''' preduct a user's rating for a movie'''
  if movie_id not in rating_matrix.columns:
    return np.nan

  movie_ratings = rating_matrix[movie_id]
  rated_users = movie_ratings[movie_ratings > 0].index

  if user_id not in similarity_matrix.index:
    return np.nan


  similarities = similarity_matrix.loc[user_id, rated_users]
  ratings = movie_ratings[rated_users]

  numerator = np.dot(similarities, ratings)
  denominator = np.sum(np.abs(similarities))

  return numerator / denominator if denominator != 0 else np.nan


def predict_rating_pearson(user_id, movie_id, centered_matrix, similarity_matrix, user_means):
  ''' predict user's rating fora  movie using pearson & centered matrix'''
  if movie_id not in centered_matrix.columns or user_id not in similarity_matrix.index:
    return np.nan

  movie_ratings = centered_matrix[movie_id]
  rated_users = movie_ratings[movie_ratings.notna()].index
  similarities = similarity_matrix.loc[user_id, rated_users]
  ratings = centered_matrix.loc[rated_users, movie_id]

  #filter out NaNs
  valid_mask = ratings.notna() & similarities.notna()
  similarities = similarities[valid_mask]
  ratings = ratings[valid_mask]

  #check denominator
  denominator = np.sum(np.abs(similarities))
  if len(similarities) ==0 or denominator == 0:
    return np.nan

  numerator = np.dot(similarities, ratings)
  return user_means.loc[user_id] + (numerator / denominator)


# addig top k neighbor filtering
def get_top_k_similar_users(user_id, similarity_matrix, k=10, min_similarity=0.0):
  ''' returning the top k most similar users to the target user'''
  if user_id not in similarity_matrix.index:
    return []

  similarities = similarity_matrix.loc[user_id].drop(user_id)
  similarities = similarities[similarities >= min_similarity]
  top_k = similarities.sort_values(ascending=False).head(k)
  return top_k.index

#predict rating top k cosine
def predict_rating_top_k_cosine(user_id, movie_id, rating_matrix, similarity_matrix, k=10, min_similarity=0.0):
  ''' predict rating using cosine similarity and top-k neighbors'''
  if movie_id not in rating_matrix.columns or user_id not in similarity_matrix.index:
    return np.nan

  movie_ratings = rating_matrix[movie_id]
  rated_users = movie_ratings[movie_ratings > 0].index

  # finding overlap between rated users and top k similar ones
  top_k_users = get_top_k_similar_users(user_id, similarity_matrix, k, min_similarity)
  neighbors = [u for u in top_k_users if u in rated_users]

  if not neighbors:
    return np.nan

  similarities = similarity_matrix.loc[user_id, neighbors]
  ratings = rating_matrix.loc[neighbors, movie_id]

  numerator = np.dot(similarities, ratings)
  denominator = np.sum(np.abs(similarities))

  return numerator / denominator if denominator != 0 else np.nan


#predict rating top k pearson
def predict_rating_top_k_pearson(user_id, movie_id, centered_matrix, user_means, k=10, min_similarity=0.0):
  '''predict using pearson & top k neighbors'''
  if movie_id not in centered_matrix.columns or user_id not in similarity_matrix.index:
    return np.nan

  movie_ratings = centered_matrix[movie_id]
  rated_users = movie_ratings[movie_ratings.notna()].index

  top_k_users = get_top_k_similar_users(user_id, similarity_matrix, k, min_similarity)
  neighbors = [u for u in top_k_users if u in rated_users]

  if not neighbors:
    return np.nan

  similarities = similarity_matrix.loc[user_id, neighbors]
  ratings = centered_matrix.loc[neighbors, movie_id]

  valid_mask = ratings.notna() & similarities.notna()
  similarities = similarities[valid_mask]
  ratings = ratings[valid_mask]

  denominator = np.sum(np.abs(similarities))
  if len(similarities) == 0 or denominator == 0:
    return np.nan

  numerator = np.dot(similarities,ratings)
  return user_means.loc[user_id] + (numerator / denominator)



Overwriting predictor.py


In [16]:
# update predictor with top k
from predictor import predict_rating_top_k_cosine, predict_rating_top_k_pearson


topk_cosine_pred = predict_rating_top_k_cosine(
    user_id=1,
    movie_id=50,
    rating_matrix= user_item_filled,
    similarity_matrix= user_similarity_cosine,
    k=30
)

topk_pearson_pred = predict_rating_top_k_pearson(
    user_id=1,
    movie_id=50,
    centered_matrix= user_item_centered,
    similarity_matrix= user_similarity_pearson,
    user_means=user_means,
    k=30
)

print(f"top k Cosine Prediction: {cosine_pred: .2f}")
print(f"top kPearson prediction: {pearson_pred:.2f}")


ImportError: cannot import name 'get_top_k_similar_users' from 'similarity' (/content/similarity.py)

In [None]:
# creating evaluator.py
%%writefile evaluator.py

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

def train_test_split(ratings_df, test_size=0.2, seed=42):
    """
    splitting ratings into train/test sets (randomly)
    """
    np.random.seed(seed)
    shuffled = ratings_df.sample(frac=1, random_state=seed)
    test_count = int(len(shuffled) * test_size)


    test_df = shuffled.iloc[:test_count]
    train_df = shuffled.iloc[test_count:]

    return train_df, test_df

def evaluate(predict_fn, test_df, *predict_args):
  ''' eval prediction function on test set '''

  y_true = []
  y_pred = []

  for row in test_df.itertuples():
    pred = predict_fn(row.user_id, row.movie_id, *predict_args)
    if not np.isnan(pred):
      y_true.append(row.rating)
      y_pred.append(pred)

  rmse= np.sqrt(mean_squared_error(y_true, y_pred))
  mae = mean_absolute_error(y_true, y_pred)
  return rmse, mae


Overwriting evaluator.py


In [None]:
# usage reference
from data_loader import build_user_item_matrix, fill_missing_zero, center_ratings
from similarity import compute_cosine_similarity, compute_pearson_similarity
from predictor import predict_rating_cosine, predict_rating_pearson
from evaluator import train_test_split, evaluate

In [None]:
# split the original ratings
train_df, test_df = train_test_split(ratings)

# build matrices from training data only
train_user_item = build_user_item_matrix(train_df)

train_user_item_filled = fill_missing_zero(train_user_item)

train_user_item_centered, train_user_means = center_ratings(train_user_item)

# get similarities from training data
user_sim_cosine = compute_cosine_similarity(train_user_item_filled)

user_sim_pearson = compute_pearson_similarity(train_user_item_centered)


In [None]:
# evaluate cosine

rmse_cos, mae_cos = evaluate(
    predict_rating_cosine,
    test_df,
    train_user_item_filled,
    user_sim_cosine
)

print(f"Cosine RMSE: {rmse_cos:4f}, Cosine MAE: {mae_cos:4f}")


Cosine RMSE: 1.014553, Cosine MAE: 0.806039


In [None]:
# evaluate pearson

rmse_pear, mae_pear = evaluate(
    predict_rating_pearson,
    test_df,
    train_user_item_centered,
    user_sim_pearson,
    train_user_means
)

print(f"Pearson RMSE: {rmse_pear:4f}, Pearson MAE:{mae_pear:4f}")

Pearson RMSE: 0.947520, Pearson MAE:0.747008
