<a href="https://colab.research.google.com/github/menicacci/fairness-group-recommendations/blob/main/User_based_Collaborative_Filtering_Recommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import math
import heapq
import numpy as np

Datasets: https://grouplens.org/datasets/movielens/

In [2]:
# Dataset loading
df_movies = pd.read_csv(r"movies.csv")
df_ratings = pd.read_csv(r"ratings.csv")

# df_tags = pd.read_csv(r"tags.csv")
# df_links = pd.read_csv(r"links.csv")

In [3]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
'''
  Create a pandas df with a column for each value of the df[column_name] column (in this case reppresenting the movieId).
  In this example, each row reppresents the ratings given by a user to the specific movie (NaN means rating unknown).
'''
df_user_based_cf = df_ratings.groupby('userId').apply(lambda x: x.set_index('movieId')['rating']).unstack(fill_value=np.nan)

df_user_based_cf.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [6]:
'''
  Takes as input a pandas df and two row indexes.
  For a given column c, if df[row_idx1] and df[row_idx2] are both populated, the pair will be included in the output array.
  If the indexes are the same, it returns an empty array
'''
def find_non_null_column_pairs(df, row_idx1, row_idx2):
  if row_idx1 == row_idx2:
    return []

  row1 = df.iloc[row_idx1].values
  row2 = df.iloc[row_idx2].values

  non_null_mask = ~np.isnan(row1) & ~np.isnan(row2)
  return [(val1, val2) for val1, val2 in zip(row1[non_null_mask], row2[non_null_mask])]


'''
  Calculates the average value of a row (excluding NaN values)
'''
def average_value(df, row_idx):
  return np.nanmean(df.iloc[row_idx].to_numpy())


'''
  Takes as input a pandas df and two row indexes.
  Calculates the peason correlation between two items.
'''
def pearson_correlation(df, row_idx1, row_idx2):
  common_items = find_non_null_column_pairs(df, row_idx1, row_idx2)
  if not common_items:
    return 0

  mean_1 = average_value(df, row_idx1)
  mean_2 = average_value(df, row_idx2)

  n = sum((item[0] - mean_1) * (item[1] - mean_2) for item in common_items)
  d1 = math.sqrt(sum((item[0] - mean_1)**2 for item in common_items))
  d2 = math.sqrt(sum((item[1] - mean_2)**2 for item in common_items))

  return n / (d1 * d2) if (d1 != 0 and d2 != 0) else 0


'''
  Takes as input a pandas df and two row indexes.
  Calculates the cosine similarity between two items.
'''
def cosine_similarity(df, row_idx1, row_idx2):
    common_items = find_non_null_column_pairs(df, row_idx1, row_idx2)
    if not common_items:
        return 0

    dot_product = sum(item[0] * item[1] for item in common_items)
    magnitude1 = math.sqrt(sum(item[0] ** 2 for item in common_items))
    magnitude2 = math.sqrt(sum(item[1] ** 2 for item in common_items))

    return dot_product / (magnitude1 * magnitude2) if (magnitude1 != 0 and magnitude2 != 0) else 0


'''
  Takes as input a pandas df, the index of the target item, the size of the neighbourhood and a score function.
  Returns the neighbourhood that maximizes the score function ordered by the score itself.
  Output type: [(a, b), ...] -> a: item score, b: item index.
'''
def get_max_similarity(df, target_idx, size, score_function):
  top_scores_heap = []

  for row_idx in range(len(df)):
    score = score_function(df, target_idx, row_idx)
    heapq.heappush(top_scores_heap, (score, row_idx))

    if len(top_scores_heap) > size:
      heapq.heappop(top_scores_heap)

  return sorted(top_scores_heap, key=lambda x: x[0])

In [7]:
# Find similar users
neighbourhood_size = 10
user_idx = 0

similar_users = get_max_similarity(df_user_based_cf, user_idx, neighbourhood_size, pearson_correlation)

In [8]:
for user in similar_users[:2]:
  print(f"User: {user[1] + 1}, Similarity: -> {user[0]}")

User: 278, Similarity: -> 0.9710607611177227
User: 146, Similarity: -> 0.9990496408681655


In [9]:
'''
  Takes as input a pandas df, a column value, a list of neighbours and the mean of the target item values.
  Returns a prediction score for the target item.
  Neighbour's list structure: [(a, b, c), ...] -> a: score, b: index, c: mean
'''
def get_prediction_score(df, column, similar_items, target_mean):
  similar_items_arr = np.array(similar_items, dtype=float)

  item_indices = similar_items_arr[:, 1].astype(int)
  item_similarities = similar_items_arr[:, 0]
  item_means = similar_items_arr[:, 2]

  df_values = df.iloc[item_indices][column].values
  valid_indices = ~np.isnan(df_values)

  n = np.sum(item_similarities[valid_indices] * (df_values[valid_indices] - item_means[valid_indices]))
  d = np.sum(np.abs(item_similarities[valid_indices]))

  return target_mean + (n / d) if d != 0 else None


'''
  Takes as input a pandas df, a target item index, a list of neighbours and the desired output list length.
  Returns a list of column values with the highest score.
  Output type: [(a, b), ...] -> a: prediction score, b: item index.
'''
def get_predictions_based_on_similarity(df, target_item, similar_items, size):
  prediction_scores = []

  target_item_mean = average_value(df, target_item)
  similar_items = [(similar_item[0], similar_item[1], average_value(df, similar_item[1])) for similar_item in similar_items]

  row_item = df.iloc[target_item]
  for column, index in df.items():
    if np.isnan(row_item[column]):
      prediction = get_prediction_score(df, column, similar_items, target_item_mean)

      if prediction is not None:
        heapq.heappush(prediction_scores, (prediction, column))

        if len(prediction_scores) > size:
          heapq.heappop(prediction_scores)

  return sorted(prediction_scores, key=lambda x: x[0], reverse=True)



'''
  Returns a list of predictions for an item (in this case, a user)
'''
def get_predictions(df, target_user, neighbourhood_size=20, predictions_size=10, score_function=pearson_correlation):
  similar_users = get_max_similarity(df, target_user, neighbourhood_size, score_function)
  predictions = get_predictions_based_on_similarity(df, target_user, similar_users, predictions_size)

  return predictions

In [10]:
prediction_size = 10
neighbourhood_size = 50

predictions = get_predictions(df_user_based_cf, 0, neighbourhood_size, prediction_size, pearson_correlation)

In [11]:
for prediction in predictions:
  print(f"Movie ID: {prediction[1]}\t\tRating: {prediction[0]}")

Movie ID: 319		Rating: 6.769157088122605
Movie ID: 3567		Rating: 6.726379310344827
Movie ID: 555		Rating: 6.641379310344828
Movie ID: 913		Rating: 6.252742946708464
Movie ID: 55276		Rating: 6.252742946708464
Movie ID: 30803		Rating: 6.225754310344827
Movie ID: 27611		Rating: 6.223522167487685
Movie ID: 3972		Rating: 6.223522167487685
Movie ID: 42728		Rating: 6.110281749369218
Movie ID: 55052		Rating: 6.110281749369218
