<a href="https://colab.research.google.com/github/menicacci/fairness-group-recommendations/blob/main/User_based_Collaborative_Filtering_Recommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import math
import heapq
import numpy as np

Datasets: https://grouplens.org/datasets/movielens/

In [2]:
# Dataset loading
data_dir = 'ml-latest-small'

df_movies = pd.read_csv(f"{data_dir}/movies.csv")
df_ratings = pd.read_csv(f"{data_dir}/ratings.csv")

# df_tags = pd.read_csv(r"tags.csv")
# df_links = pd.read_csv(r"links.csv")

In [3]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
'''
  Create a pandas df with a column for each value of the df[column_name] column (in this case reppresenting the movieId).
  In this example, each row reppresents the ratings given by a user to the specific movie (NaN means rating unknown).
'''
df_user_based_cf = df_ratings.groupby('userId').apply(lambda x: x.set_index('movieId')['rating']).unstack(fill_value=np.nan)

df_user_based_cf.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [6]:
'''
  Takes as input a pandas df and two row indexes.
  For a given column c, if df[row_idx1] and df[row_idx2] are both populated, the pair will be included in the output array.
  If the indexes are the same, it returns an empty array
'''
def find_non_null_column_pairs(df, row_idx1, row_idx2):
  if row_idx1 == row_idx2:
    return []

  row1 = df.iloc[row_idx1].values
  row2 = df.iloc[row_idx2].values

  non_null_mask = ~np.isnan(row1) & ~np.isnan(row2)
  return [(val1, val2) for val1, val2 in zip(row1[non_null_mask], row2[non_null_mask])]


'''
  Calculates the average value of a row (excluding NaN values)
'''
def average_value(df, row_idx):
  return np.nanmean(df.iloc[row_idx].to_numpy())


'''
  Takes as input a pandas df and two row indexes.
  Calculates the peason correlation between two items.
'''
def pearson_correlation(df, row_idx1, row_idx2):
  common_items = find_non_null_column_pairs(df, row_idx1, row_idx2)
  if not common_items:
    return 0

  mean_1 = average_value(df, row_idx1)
  mean_2 = average_value(df, row_idx2)

  n = sum((item[0] - mean_1) * (item[1] - mean_2) for item in common_items)
  d1 = math.sqrt(sum((item[0] - mean_1)**2 for item in common_items))
  d2 = math.sqrt(sum((item[1] - mean_2)**2 for item in common_items))

  return n / (d1 * d2) if (d1 != 0 and d2 != 0) else 0


'''
  Takes as input a pandas df and two row indexes.
  Calculates the cosine similarity between two items.
'''
def cosine_similarity(df, row_idx1, row_idx2):
    common_items = find_non_null_column_pairs(df, row_idx1, row_idx2)
    if not common_items:
        return 0

    dot_product = sum(item[0] * item[1] for item in common_items)
    magnitude1 = math.sqrt(sum(item[0] ** 2 for item in common_items))
    magnitude2 = math.sqrt(sum(item[1] ** 2 for item in common_items))

    return dot_product / (magnitude1 * magnitude2) if (magnitude1 != 0 and magnitude2 != 0) else 0


'''
  Takes as input a pandas df, the index of the target user, the size of the neighbourhood and a score function.
  Returns the neighbourhood that maximizes the score function ordered by the score itself.
  Output type: [(a, b), ...] -> a: item score, b: item index.
'''
def get_neighborhood(df, target_idx, size, score_function=pearson_correlation):
  top_scores_heap = []

  for row_idx in range(len(df)):
    score = score_function(df, target_idx, row_idx)
    heapq.heappush(top_scores_heap, (score, row_idx))

    if len(top_scores_heap) > size:
      heapq.heappop(top_scores_heap)

  return sorted(top_scores_heap, key=lambda x: x[0], reverse=True)

In [7]:
# Find similar users
neighbourhood_size = 20
user_idx = 0

similar_users = get_neighborhood(df_user_based_cf, user_idx, neighbourhood_size)

In [8]:
for user in similar_users:
  similarity_score = "{:.2f}".format(user[0])
  print(f"Similarity Score: [{similarity_score}]\t User ID: [{user[1] + 1}]")

Similarity Score: [1.00]	 User ID: [77]
Similarity Score: [1.00]	 User ID: [12]
Similarity Score: [1.00]	 User ID: [388]
Similarity Score: [1.00]	 User ID: [291]
Similarity Score: [1.00]	 User ID: [253]
Similarity Score: [1.00]	 User ID: [85]
Similarity Score: [1.00]	 User ID: [358]
Similarity Score: [1.00]	 User ID: [2]
Similarity Score: [1.00]	 User ID: [146]
Similarity Score: [0.97]	 User ID: [278]
Similarity Score: [0.95]	 User ID: [550]
Similarity Score: [0.95]	 User ID: [13]
Similarity Score: [0.94]	 User ID: [127]
Similarity Score: [0.94]	 User ID: [333]
Similarity Score: [0.93]	 User ID: [472]
Similarity Score: [0.90]	 User ID: [157]
Similarity Score: [0.89]	 User ID: [139]
Similarity Score: [0.87]	 User ID: [401]
Similarity Score: [0.87]	 User ID: [511]
Similarity Score: [0.84]	 User ID: [473]


In [9]:
'''
  Takes as input a pandas df, a column value, a list of neighbours and the mean of the target user's row values.
  Returns a prediction score for the target user.
  Neighbour's list structure: [(a, b, c), ...] -> a: score, b: index, c: mean
'''
def get_prediction_score(df_values, valid_indexes, column, similar_items, target_mean):
  item_indices = similar_items[:, 1]
  item_similarities = similar_items[:, 0]
  item_means = similar_items[:, 2]

  n = np.sum(item_similarities[valid_indexes] * (df_values - item_means[valid_indexes]))
  d = np.sum(np.abs(item_similarities[valid_indexes]))

  return target_mean + (n / d) if d != 0 else 0


'''
  Takes as input a pandas df, a target user index, a list of neighbours and the desired output list length.
  Returns a list of items with the highest predicted rating, with the rating as well.
  Output type: [(a, b), ...] -> a: predicted rating, b: item index.
'''
def get_recommendations_based_on_similarity(df, target_user, similar_users, size):
  user_row = df.iloc[target_user]
  user_nan_columns = user_row[user_row.isna()].index.tolist()

  top_predictions = []
  predictions = get_items_predictions_based_on_similarity(df, target_user, similar_users, user_nan_columns)
  for prediction in predictions:
    heapq.heappush(top_predictions, prediction)

    if len(top_predictions) > size:
      heapq.heappop(top_predictions)

  return sorted(top_predictions, key=lambda x: x[0], reverse=True)


'''
  Takes as input a pandas df, a target user index, a list of neighbours and a list of items to predict the rating.
  Returns a list of items and their predicted rating.
  Output type: [(a, b), ...] -> a: predicted rating, b: item index.
'''
def get_items_predictions_based_on_similarity(df, target_user, similar_users, columns):
  predictions = []

  similar_users = [(similar_user[0], similar_user[1], average_value(df, similar_user[1])) for similar_user in similar_users]
  similar_users = np.array(similar_users, dtype=float)
  similar_users[:, 1] = similar_users[:, 1].astype(int)

  target_user_mean = average_value(df, target_user)

  df_values = df.iloc[similar_users[:, 1].astype(int)][columns].values
  nan_indexes = np.isnan(df_values)
  for i, column in enumerate(columns):
    valid_indexes = ~nan_indexes[:, i]
    valid_values = df_values[valid_indexes, i]

    prediction_score = get_prediction_score(valid_values, valid_indexes, column, similar_users, target_user_mean)
    predictions.append((prediction_score, column))

  return predictions


'''
  Returns a list of predictions for a user
'''
def get_predictions(df, target_user, neighbourhood_size=20, predictions_size=10, score_function=pearson_correlation):
  similar_users = get_neighborhood(df, target_user, neighbourhood_size, score_function)
  predictions = get_recommendations_based_on_similarity(df, target_user, similar_users, predictions_size)

  return predictions

In [10]:
target_user = 55
prediction_size = 10
neighbourhood_size = 50

predictions = get_predictions(df_user_based_cf, target_user, neighbourhood_size, prediction_size, pearson_correlation)

In [11]:
for prediction in predictions:
  predicted_score = "{:.2f}".format(prediction[0])
  print(f"Predicted Rating: [{predicted_score}]\tMovie ID: {prediction[1]}")

Predicted Rating: [5.64]	Movie ID: 430
Predicted Rating: [5.64]	Movie ID: 2622
Predicted Rating: [5.53]	Movie ID: 368
Predicted Rating: [5.50]	Movie ID: 6732
Predicted Rating: [5.48]	Movie ID: 922
Predicted Rating: [5.48]	Movie ID: 2728
Predicted Rating: [5.48]	Movie ID: 7293
Predicted Rating: [5.48]	Movie ID: 8827
Predicted Rating: [5.48]	Movie ID: 69069
Predicted Rating: [5.47]	Movie ID: 6461
