# Recommendation System

---

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# .py files this notebook writes with %%writefile are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

1. Download dataset

In [2]:
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q ml-100k.zip

--2025-06-11 18:23:35--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2025-06-11 18:23:36 (10.7 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [3]:
%%writefile data_loader.py

# load ratings & movie data
import pandas as pd


def load_ratings(path="ml-100k/u.data"):
  '''Read the tab-separated user ratings file into a DataFrame.

  The file has no header row, so the four columns are named here:
  user_id, movie_id, rating, timestamp.
  '''
  column_names = ["user_id", "movie_id", "rating", "timestamp"]
  ratings_df = pd.read_csv(path, sep='\t', header=None, names=column_names)
  return ratings_df

def load_movies(path='ml-100k/u.item'):
  '''Read the pipe-separated movie metadata file into a DataFrame.

  The file is read as latin-1 and has no header row; the columns are the five
  descriptive fields followed by 19 genre columns named genre_0 .. genre_18.
  '''
  info_columns = ["movie_id", "title", "release_date", "video_release_date", "IMDb_URL"]
  genre_columns = ["genre_" + str(i) for i in range(19)]
  return pd.read_csv(
      path,
      sep='|',
      encoding='latin-1',
      header=None,
      names=info_columns + genre_columns,
  )

def build_user_item_matrix(ratings_df):
  '''Pivot long-format ratings into a user x movie matrix.

  Rows are user_id, columns are movie_id and cells hold the rating;
  user/movie pairs that have no rating come out as NaN.
  '''
  user_item = ratings_df.pivot_table(values='rating', index='user_id', columns='movie_id')
  return user_item

def fill_missing_zero(matrix):
  '''Return the matrix with every NaN replaced by 0 (the input cosine similarity needs).'''
  zero_filled = matrix.fillna(value=0)
  return zero_filled

def center_ratings(matrix):
  '''Subtract each user's mean rating from that user's row (for Pearson similarity).

  Returns a tuple (centered_matrix, user_means). Cells that are NaN in the
  input are still NaN in the centered matrix.
  '''
  user_means = matrix.mean(axis="columns")
  centered = matrix.sub(user_means, axis="index")
  return centered, user_means


Writing data_loader.py


In [4]:

#usage reference
from data_loader import load_ratings, load_movies, build_user_item_matrix, fill_missing_zero, center_ratings

# Raw ratings and movie metadata, read from the unzipped ml-100k folder.
ratings, movies = load_ratings(), load_movies()

# User x movie matrix (NaN = not rated) and the two variants the similarity
# functions expect: zero-filled for cosine, mean-centered for Pearson.
user_item = build_user_item_matrix(ratings)
user_item_filled = fill_missing_zero(user_item)
user_item_centered, user_means = center_ratings(user_item)

In [5]:
# creating similarity.py

%%writefile similarity.py

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(matrix):
  '''Pairwise cosine similarity between the rows (users) of a rating matrix.

  Expects users as rows, movies as columns and no NaNs (fill them with zeros
  first). Returns a square DataFrame labelled with the input's index on both axes.
  '''
  user_ids = matrix.index
  pairwise = cosine_similarity(matrix.values)
  return pd.DataFrame(pairwise, index=user_ids, columns=user_ids)

def compute_pearson_similarity(centered_matrix, min_periods=1):
  '''Pearson correlation between users (rows) of a mean-centered rating matrix.

  Args:
    centered_matrix: users as rows, movies as columns; NaNs are allowed and
      each pair of users is correlated over the movies both have a value for.
    min_periods: minimum number of co-rated movies a pair of users needs for
      a similarity to be reported; pairs with fewer get NaN. The default of 1
      matches the pandas default. Raising it suppresses the spurious +/-1
      correlations that come from users who share only two or three movies.

  Returns:
    Square user x user DataFrame of correlations (NaN where undefined).
  '''
  # DataFrame.corr correlates columns, so transpose to put users on the columns.
  return centered_matrix.T.corr(method='pearson', min_periods=min_periods)


def get_top_k_neighbors(similarity_matrix, user_id, k=5):
  '''Return the k largest similarity scores for user_id, with the user's own entry removed.

  The result is a Series indexed by neighbor id, ordered from most to least similar.
  '''
  others = similarity_matrix.loc[user_id].drop(index=user_id)
  return others.nlargest(k)


Writing similarity.py


In [6]:
# usage reference

from similarity import compute_cosine_similarity, compute_pearson_similarity, get_top_k_neighbors

# Cosine similarity between users, computed on the zero-filled rating matrix.
user_similarity_cosine = compute_cosine_similarity(user_item_filled)

# Pearson similarity between users, computed on the mean-centered matrix (NaNs kept).
user_similarity_pearson = compute_pearson_similarity(user_item_centered)

# The 5 users with the highest Pearson similarity to user 1 (user 1 itself excluded).
top_users = get_top_k_neighbors(user_similarity_pearson, user_id=1, k=5)

In [7]:
# creating predictor.py
%%writefile predictor.py

import numpy as np

def predict_rating_cosine(user_id, movie_id, rating_matrix, similarity_matrix):
  '''Predict a user's rating for a movie as a similarity-weighted average.

  Every *other* user who rated the movie votes with their rating, weighted by
  their similarity to the target user.

  Args:
    user_id: target user.
    movie_id: movie to predict.
    rating_matrix: user x movie ratings where 0 means "not rated".
    similarity_matrix: user x user similarity scores.

  Returns:
    The predicted rating, or NaN when the movie or user is unknown, nobody
    else rated the movie, or all weights are zero.
  '''
  if movie_id not in rating_matrix.columns or user_id not in similarity_matrix.index:
    return np.nan

  movie_ratings = rating_matrix[movie_id]
  rated_users = movie_ratings[movie_ratings > 0].index
  # The target user must not vote for themselves: their self-similarity of 1.0
  # would leak their own rating into the prediction.
  rated_users = rated_users[rated_users != user_id]
  if len(rated_users) == 0:
    return np.nan

  similarities = similarity_matrix.loc[user_id, rated_users]
  ratings = movie_ratings[rated_users]

  numerator = np.dot(similarities, ratings)
  denominator = np.sum(np.abs(similarities))

  return numerator / denominator if denominator != 0 else np.nan


def predict_rating_pearson(user_id, movie_id, centered_matrix, similarity_matrix, user_means):
  '''Predict a user's rating for a movie using Pearson similarity.

  The mean-centered ratings of every *other* user who rated the movie are
  averaged with their similarity to the target user as weight, and the target
  user's own mean rating is added back.

  Args:
    user_id: target user.
    movie_id: movie to predict.
    centered_matrix: user x movie mean-centered ratings, NaN where not rated.
    similarity_matrix: user x user similarity scores (NaN allowed).
    user_means: per-user mean rating, indexed by user id.

  Returns:
    The predicted rating, or NaN when the movie or user is unknown or no
    other rater has a usable, non-zero similarity.
  '''
  if movie_id not in centered_matrix.columns or user_id not in similarity_matrix.index:
    return np.nan

  ratings = centered_matrix[movie_id].dropna()
  # The target user must not vote for themselves: their self-similarity of 1.0
  # would leak their own rating into the prediction.
  ratings = ratings[ratings.index != user_id]
  similarities = similarity_matrix.loc[user_id, ratings.index]

  # Keep only raters whose similarity to the target user is defined.
  has_similarity = similarities.notna()
  similarities = similarities[has_similarity]
  ratings = ratings[has_similarity]

  denominator = np.sum(np.abs(similarities))
  if len(similarities) == 0 or denominator == 0:
    return np.nan

  numerator = np.dot(similarities, ratings)
  return user_means.loc[user_id] + (numerator / denominator)


# adding top-k neighbor filtering
def get_top_k_similar_users(user_id, similarity_matrix, k=10, min_similarity=0.0):
  '''Return the ids of the k users most similar to user_id.

  The user's own entry is dropped, scores below min_similarity are discarded,
  and the remaining ids are returned from most to least similar. An unknown
  user_id yields an empty list.
  '''
  if user_id in similarity_matrix.index:
    candidates = similarity_matrix.loc[user_id].drop(user_id)
    strong_enough = candidates[candidates >= min_similarity]
    ranked = strong_enough.sort_values(ascending=False)
    return ranked.head(k).index
  return []

#predict rating top k cosine
def predict_rating_top_k_cosine(user_id, movie_id, rating_matrix, similarity_matrix, k=10, min_similarity=0.0):
  '''Predict a rating from the top-k most similar users who rated the movie.

  The k nearest users (at or above min_similarity) are selected first; of
  those, only users with a rating > 0 for the movie contribute. The result is
  their similarity-weighted average rating, or NaN when the movie or user is
  unknown, no selected neighbor rated the movie, or the weights sum to zero.
  '''
  if movie_id not in rating_matrix.columns:
    return np.nan
  if user_id not in similarity_matrix.index:
    return np.nan

  column = rating_matrix[movie_id]
  raters = column[column > 0].index

  # Neighbors are chosen before checking who rated the movie, so fewer than k
  # users may end up voting.
  nearest = get_top_k_similar_users(user_id, similarity_matrix, k, min_similarity)
  neighbors = [candidate for candidate in nearest if candidate in raters]
  if len(neighbors) == 0:
    return np.nan

  weights = similarity_matrix.loc[user_id, neighbors]
  neighbor_ratings = rating_matrix.loc[neighbors, movie_id]

  weighted_sum = np.dot(weights, neighbor_ratings)
  weight_total = np.sum(np.abs(weights))

  if weight_total == 0:
    return np.nan
  return weighted_sum / weight_total


#predict rating top k pearson
def predict_rating_top_k_pearson(user_id, movie_id, centered_matrix, similarity_matrix, user_means, k=10, min_similarity=0.0):
  '''Predict a rating with Pearson similarity, restricted to top-k neighbors.

  The k nearest users (at or above min_similarity) are selected first; of
  those, only users with a non-NaN centered rating for the movie contribute.
  Their centered ratings are averaged with similarity weights and the target
  user's mean rating is added back. Returns NaN when the movie or user is
  unknown, no selected neighbor is usable, or the weights sum to zero.
  '''
  if movie_id not in centered_matrix.columns:
    return np.nan
  if user_id not in similarity_matrix.index:
    return np.nan

  column = centered_matrix[movie_id]
  raters = column[column.notna()].index

  # Neighbors are chosen before checking who rated the movie, so fewer than k
  # users may end up voting.
  nearest = get_top_k_similar_users(user_id, similarity_matrix, k, min_similarity)
  neighbors = [candidate for candidate in nearest if candidate in raters]
  if len(neighbors) == 0:
    return np.nan

  weights = similarity_matrix.loc[user_id, neighbors]
  deviations = centered_matrix.loc[neighbors, movie_id]

  # Drop any pair where either the similarity or the centered rating is missing.
  usable = deviations.notna() & weights.notna()
  weights = weights[usable]
  deviations = deviations[usable]

  weight_total = np.sum(np.abs(weights))
  if len(weights) == 0 or weight_total == 0:
    return np.nan

  weighted_sum = np.dot(weights, deviations)
  return user_means.loc[user_id] + (weighted_sum / weight_total)



Writing predictor.py


In [8]:
# update predictor with top k
from predictor import predict_rating_top_k_cosine, predict_rating_top_k_pearson


# Predict user 1's rating for movie 50 under cosine similarity (zero-filled
# ratings), using those of the 30 most similar users who rated the movie.
topk_cosine_pred = predict_rating_top_k_cosine(
    user_id=1,
    movie_id=50,
    rating_matrix= user_item_filled,
    similarity_matrix= user_similarity_cosine,
    k=30
)

# Same prediction under Pearson similarity: the neighbors' mean-centered
# ratings are combined and user 1's mean rating is added back.
topk_pearson_pred = predict_rating_top_k_pearson(
    user_id=1,
    movie_id=50,
    centered_matrix= user_item_centered,
    similarity_matrix= user_similarity_pearson,
    user_means=user_means,
    k=30
)

print(f"top k Cosine Prediction: {topk_cosine_pred: .2f}")
print(f"top kPearson prediction: {topk_pearson_pred:.2f}")


top k Cosine Prediction:  4.74
top kPearson prediction: 4.10


In [9]:
# after implementing top k predictions:
# the results now show a refined memory based collaborative filtering
# by using only the top-k most similar users(instead of using all of them)
# making prediction less noisy and more realistic than the previous one

# get_top_k_similar_users - selects the k users most similar to the target user
# (cosine or Pearson), keeping only those at or above a similarity threshold

# predict_rating_top_k_* - uses only those top-k users when predicting a rating, avoiding weak/noisy similarities.


#full cosine similarity - 4.40
#full pearson similarity - 4.48
# including all users even noisy/unrelated ones

#top k cosine (k=30) - 4.74| recommendation is more confident
#top k pearson (k=30) - 4.10| rec is more conservative

# ranking metrics such as Precision@K and Recall@K
# Precision@K - of the K recommended movies, how many were actually liked?
# Recall@K - of all the movies the user liked, how many did we recommend in the top K?

#other model based CF approaches for ranking instead of rating prediction

# BPR bayesian personalized ranking - pairwise learning to rank
#for a given user u, if they liked item i,
#they should prefer i over some item j they didn’t interact with.
#Maximize probability that i ≻ j.
#is better for clicks, views (lightFM, Implicit)

#ALS Alternating least squares
# matrix factorization method
# supports explicit & implicit feedback (Spark MLlib, implicit, Surprise)

#NN Based neural recommenders
#learn complex non linear interactions between user/items
# NeuMF, NCF, AutoRec, DeepFM
# use of transformers for sequence based rec


In [10]:
#adding top N recommendation generation

%%writefile topn.py

def recommend_top_n(user_id, rating_matrix, similarity_matrix, n=10, k=30, min_similarity=0.0):
  '''Recommend the top N unseen items for a user from their top K similar users.

  Candidate items are those rated (> 0) by at least one selected neighbor and
  not rated by the target user. Each candidate is scored with the
  similarity-weighted average of the neighbors' ratings for it.

  Args:
    user_id: target user.
    rating_matrix: user x item ratings where 0 (or NaN) means "not rated".
    similarity_matrix: user x user similarity scores.
    n: number of items to return.
    k: number of most similar users to draw candidates and votes from.
    min_similarity: neighbors below this similarity are ignored.

  Returns:
    Up to n (item, predicted_rating) tuples, highest prediction first.
    An empty list when the user is missing from either matrix.
  '''
  # Without this check a user known only to the similarity matrix raised KeyError.
  if user_id not in similarity_matrix.index or user_id not in rating_matrix.index:
    return []

  # Top-k neighbors, restricted to users that actually have a row of ratings.
  sim_scores = similarity_matrix.loc[user_id].drop(user_id)
  has_ratings = [neighbor in rating_matrix.index for neighbor in sim_scores.index]
  sim_scores = sim_scores[has_ratings]
  top_k_users = sim_scores[sim_scores >= min_similarity].nlargest(k).index

  # Items the target user has already rated.
  user_row = rating_matrix.loc[user_id]
  user_rated_items = set(user_row[user_row > 0].index)

  # Look up each neighbor's similarity and rated items once, not once per candidate.
  neighbor_votes = []
  candidate_items = set()
  for neighbor in top_k_users:
    neighbor_row = rating_matrix.loc[neighbor]
    rated = neighbor_row[neighbor_row > 0]
    candidate_items.update(rated.index)
    neighbor_votes.append((similarity_matrix.at[user_id, neighbor], rated.to_dict()))

  candidate_items.difference_update(user_rated_items)

  # Similarity-weighted average over the neighbors that rated each candidate.
  predictions = {}
  for item in candidate_items:
    numer, denom = 0.0, 0.0
    for sim, rated_items in neighbor_votes:
      neighbor_rating = rated_items.get(item)
      if neighbor_rating is not None:
        numer += sim * neighbor_rating
        denom += abs(sim)

    if denom > 0:
      predictions[item] = numer / denom

  # Highest predicted ratings first; ties keep the candidate-set iteration order.
  top_n_items = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:n]
  return top_n_items

Writing topn.py


In [11]:
# usage reference for topn

from topn import recommend_top_n

# Top 5 recommended movie_ids for user 1 using cosine similarity:
# neighbors are the 30 most similar users with similarity >= 0.1.
top_5cosine = recommend_top_n(
    user_id=1,
    rating_matrix=user_item_filled,
    similarity_matrix = user_similarity_cosine,
    n=5,
    k=30,
    min_similarity=0.1
)

print("top 5 Cosine recommended")
print(top_5cosine)


# Same call with the Pearson similarity matrix; the ratings passed in are
# still the zero-filled matrix.
top_5pearson = recommend_top_n(
    user_id=1,
    rating_matrix=user_item_filled,
    similarity_matrix=user_similarity_pearson,
    n=5,
    k=30,
    min_similarity=0.1
)

print("top 5 Pearson")
print(top_5pearson)

top 5 Cosine recommended
[(690, np.float64(5.000000000000001)), (522, np.float64(5.0)), (641, np.float64(5.0)), (853, np.float64(5.0)), (1111, np.float64(5.0))]
top 5 Pearson
[(524, np.float64(5.0)), (1048, np.float64(5.0)), (603, np.float64(5.0)), (604, np.float64(5.0)), (650, np.float64(5.0))]


In [12]:
# Look up the titles of the movies recommended via cosine similarity.
# Rows come back in the order of the movies table, not in recommendation rank.
top_movie_ids = [rec_movie_id for rec_movie_id, _score in top_5cosine]
recommended_titles = movies.loc[movies['movie_id'].isin(top_movie_ids), ['movie_id', 'title']]

print("Top. 5 Cosine N recommendation for user 1")
print(recommended_titles)


Top. 5 Cosine N recommendation for user 1
      movie_id                        title
521        522           Down by Law (1986)
640        641        Paths of Glory (1957)
689        690  Seven Years in Tibet (1997)
852        853             Braindead (1992)
1110      1111      Double Happiness (1994)


In [13]:
# Look up the titles of the movies recommended via Pearson similarity.
# Rows come back in the order of the movies table, not in recommendation rank.
top_movie_ids = [rec_movie_id for rec_movie_id, _score in top_5pearson]
recommended_titles = movies.loc[movies['movie_id'].isin(top_movie_ids), ['movie_id', 'title']]

print("Top. 5 Pearson N recommendation for user 1")
print(recommended_titles)

Top. 5 Pearson N recommendation for user 1
      movie_id                                            title
523        524                       Great Dictator, The (1940)
602        603                               Rear Window (1954)
603        604                     It Happened One Night (1934)
649        650  Seventh Seal, The (Sjunde inseglet, Det) (1957)
1047      1048                             She's the One (1996)


In [14]:
# top-k filtering ensures only the k most similar users are considered (k=30 in the calls above; n=5 is the number of items returned)
# top-n recommendations are about ranking items (not just predicting ratings)

#cos sim is angle based. | movies popular among similar-rating users
#pear sim is mean centered / correlation based | movies w similar rating patterns, not just high ratings

#eg cos sim| 690 and 641 may have had similar fan bases to the user
#eg pear sim| 603 and 604 are critically acclaimed classics

In [21]:
# creating evaluator.py

# with extra functions for ranking evaluation

%%writefile evaluator.py

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict

def train_test_split(ratings_df, test_size=0.2, seed=42):
    """
    Randomly split a ratings DataFrame into train and test sets.

    Args:
        ratings_df: DataFrame with one rating per row.
        test_size: fraction of rows (between 0 and 1) to put in the test set.
        seed: seed for the shuffle, so the same seed gives the same split.

    Returns:
        (train_df, test_df), together containing every input row exactly once.

    Raises:
        ValueError: if test_size is outside [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # random_state makes the shuffle reproducible on its own; the global NumPy
    # RNG is deliberately left untouched so callers' random state is not reset.
    shuffled = ratings_df.sample(frac=1, random_state=seed)
    test_count = int(len(shuffled) * test_size)

    test_df = shuffled.iloc[:test_count]
    train_df = shuffled.iloc[test_count:]

    return train_df, test_df

def evaluate(predict_fn, test_df, *predict_args):
  '''Evaluate a rating-prediction function on a test set.

  Args:
    predict_fn: called as predict_fn(user_id, movie_id, *predict_args); it
      returns a predicted rating, or NaN when it cannot predict.
    test_df: DataFrame with user_id, movie_id and rating columns.
    *predict_args: extra positional arguments forwarded to predict_fn.

  Returns:
    (rmse, mae) over the rows that received a non-NaN prediction, or
    (nan, nan) when no row could be predicted.
  '''
  y_true = []
  y_pred = []

  for row in test_df.itertuples():
    pred = predict_fn(row.user_id, row.movie_id, *predict_args)
    # Rows the predictor cannot handle are skipped, not counted as errors.
    if not np.isnan(pred):
      y_true.append(row.rating)
      y_pred.append(pred)

  # The sklearn metrics raise on empty input; report "no result" instead.
  if not y_pred:
    return np.nan, np.nan

  rmse = np.sqrt(mean_squared_error(y_true, y_pred))
  mae = mean_absolute_error(y_true, y_pred)
  return rmse, mae


def get_top_n_recommendations(test_df, rating_matrix, similarity_matrix, recommend_fn, n=5, k=20, min_similarity=0.0):
  '''Collect top-n recommended item ids for every user that appears in test_df.

  recommend_fn is called once per unique user_id with keyword arguments
  (user_id, rating_matrix, similarity_matrix, n, k, min_similarity) and is
  expected to return (item, score) pairs. Users with no recommendations are
  left out. Returns a defaultdict(list) mapping user_id to the list of items.
  '''
  recs_by_user = defaultdict(list)
  shared_kwargs = dict(
      rating_matrix=rating_matrix,
      similarity_matrix=similarity_matrix,
      n=n,
      k=k,
      min_similarity=min_similarity,
  )

  for uid in test_df['user_id'].unique():
    scored = recommend_fn(user_id=uid, **shared_kwargs)
    if not scored:
      continue
    # Keep the item ids only; the scores are not needed for ranking metrics.
    recs_by_user[uid] = [item_id for item_id, _score in scored]
  return recs_by_user

def precision_recall_at_k(user_recs, test_df, k=5, relevance_threshold=4):
  '''Compute mean Precision@k and Recall@k against the ratings in test_df.

  Args:
    user_recs: mapping of user_id to a ranked list of recommended item ids.
    test_df: DataFrame with user_id, movie_id and rating columns.
    k: cutoff; only the first k recommendations per user are scored, and
      precision is divided by k even when fewer than k items were recommended.
    relevance_threshold: a test rating at or above this value marks the movie
      as relevant (liked). Defaults to 4.

  Returns:
    (avg_precision, avg_recall) over the users in user_recs that have at least
    one relevant test item, or (nan, nan) when there is no such user.
  '''
  # Ground truth: the movies each user rated highly in the test set.
  relevant = defaultdict(set)
  for row in test_df.itertuples():
    if row.rating >= relevance_threshold:
      relevant[row.user_id].add(row.movie_id)

  precisions, recalls = [], []

  for user_id, recommended_items in user_recs.items():
    true_items = relevant.get(user_id, set())
    # Recall is undefined for a user with nothing relevant, so skip them.
    if not true_items:
      continue

    recommended_top_k = set(recommended_items[:k])
    n_rel_and_rec = len(recommended_top_k & true_items)

    precisions.append(n_rel_and_rec / k)
    recalls.append(n_rel_and_rec / len(true_items))

  # np.mean([]) would emit a RuntimeWarning; return the NaNs explicitly.
  if not precisions:
    return np.nan, np.nan

  avg_precision = np.mean(precisions)
  avg_recall = np.mean(recalls)
  return avg_precision, avg_recall

Overwriting evaluator.py


In [22]:
# usage reference
from data_loader import build_user_item_matrix, fill_missing_zero, center_ratings
from similarity import compute_cosine_similarity, compute_pearson_similarity
from predictor import predict_rating_cosine, predict_rating_pearson
from evaluator import train_test_split, evaluate

In [23]:
# Split the original ratings into train and test sets (default split settings).
train_df, test_df = train_test_split(ratings)

# Build every matrix from the training rows only, so no test rating feeds
# into the ratings or similarities used for prediction.
train_user_item = build_user_item_matrix(train_df)

# Zero-filled variant for cosine similarity.
train_user_item_filled = fill_missing_zero(train_user_item)

# Mean-centered variant (plus each user's mean) for Pearson similarity.
train_user_item_centered, train_user_means = center_ratings(train_user_item)

# User-user similarities computed from the training data.
user_sim_cosine = compute_cosine_similarity(train_user_item_filled)

user_sim_pearson = compute_pearson_similarity(train_user_item_centered)


In [24]:
# evaluate cosine

# RMSE / MAE of the top-k cosine predictor on the held-out ratings
# (the predictor's default k and min_similarity are used).
rmse_cos, mae_cos = evaluate(
    predict_rating_top_k_cosine,
    test_df,
    train_user_item_filled,
    user_sim_cosine
)

# ".4f" is four decimal places; the previous "4f" was a field width of 4.
print(f"Cosine Top K RMSE: {rmse_cos:.4f}, Cosine Top K MAE: {mae_cos:.4f}")


Cosine Top K RMSE: 1.107655, Cosine Top K MAE: 0.860426


In [25]:
# evaluate pearson

# RMSE / MAE of the top-k Pearson predictor on the held-out ratings
# (the predictor's default k and min_similarity are used).
rmse_pear, mae_pear = evaluate(
    predict_rating_top_k_pearson,
    test_df,
    train_user_item_centered,
    user_sim_pearson,
    train_user_means
)

# ".4f" is four decimal places; the previous "4f" was a field width of 4.
print(f"Pearson Top K RMSE: {rmse_pear:.4f}, Pearson Top K MAE:{mae_pear:.4f}")

Pearson Top K RMSE: 1.277526, Pearson Top K MAE:1.001483


In [None]:
# Top-N ranking evaluation: recommend from the training data, then score the
# recommendations against the highly rated movies in the test set.
from evaluator import get_top_n_recommendations, precision_recall_at_k
from topn import recommend_top_n

# One constant for both the recommendation list length and the @K cutoff.
# The print below previously referenced an undefined name `n` (NameError).
TOP_N = 5

top_n_recs = get_top_n_recommendations(
    test_df = test_df,
    rating_matrix = train_user_item_filled,
    similarity_matrix = user_sim_cosine,
    recommend_fn = recommend_top_n, #from topn.py
    n=TOP_N,
    k=30,
    min_similarity = 0.2
)

prcsn_5, rcll_5 = precision_recall_at_k(top_n_recs,  test_df, k=TOP_N)
print(f"Precision@{TOP_N} : {prcsn_5:.4f},  Recall@{TOP_N} : {rcll_5:.4f}")

#todo review results