<a href="https://colab.research.google.com/github/menicacci/fairness-group-recommendations/blob/main/Group_Recommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import math
import heapq
import numpy as np
import random
import statistics

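Datasets: MovieLens (https://grouplens.org/datasets/movielens/). The notebook expects the `movies.csv` and `ratings.csv` files of the chosen release in the working directory.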

In [2]:
# Dataset loading (the MovieLens CSV files are expected in the working directory)
df_movies = pd.read_csv(r"movies.csv")
df_ratings = pd.read_csv(r"ratings.csv")


# Build the user-item rating matrix: one row per userId, one column per movieId,
# each cell holding the rating given by that user to that movie (NaN means rating unknown).
# DataFrame.pivot performs this reshape directly, without running a Python lambda
# once per user through groupby().apply().
df_user_based_cf = df_ratings.pivot(index='userId', columns='movieId', values='rating')

In [3]:
def find_non_null_column_pairs(df, row_idx1, row_idx2):
  '''
    Collects the value pairs of two rows, restricted to the columns where both rows are populated.

    df: pandas DataFrame; both rows are looked up by position (iloc).
    row_idx1, row_idx2: positional indexes of the rows to compare.

    Returns a list of (value_from_row1, value_from_row2) tuples in column order.
    If the two indexes are the same, an empty list is returned.
  '''
  if row_idx1 == row_idx2:
    return []

  first_row = df.iloc[row_idx1].values
  second_row = df.iloc[row_idx2].values

  # A column is kept only when neither of the two rows holds NaN in it.
  both_present = ~np.isnan(first_row) & ~np.isnan(second_row)
  return list(zip(first_row[both_present], second_row[both_present]))


def average_value(df, row_idx):
  '''
    Returns the mean of the row at position row_idx, ignoring NaN entries.
  '''
  row_values = df.iloc[row_idx].to_numpy()
  return np.nanmean(row_values)


def pearson_correlation(df, row_idx1, row_idx2):
  '''
    Calculates the Pearson correlation between two rows of a pandas df.

    Only the columns populated in both rows contribute to the sums, while each
    row's mean is taken over all of its non-NaN values (see average_value).
    Returns 0 when the rows share no populated column (this includes comparing a
    row with itself) or when either deviation norm is zero.
  '''
  common_items = find_non_null_column_pairs(df, row_idx1, row_idx2)
  if not common_items:
    return 0

  mean_1 = average_value(df, row_idx1)
  mean_2 = average_value(df, row_idx2)

  # Deviations from each row's own mean, one pair per shared column.
  deviations = [(value_1 - mean_1, value_2 - mean_2) for value_1, value_2 in common_items]

  numerator = sum(dev_1 * dev_2 for dev_1, dev_2 in deviations)
  norm_1 = math.sqrt(sum(dev_1**2 for dev_1, _ in deviations))
  norm_2 = math.sqrt(sum(dev_2**2 for _, dev_2 in deviations))

  if norm_1 == 0 or norm_2 == 0:
    return 0
  return numerator / (norm_1 * norm_2)


def cosine_similarity(df, row_idx1, row_idx2):
  '''
    Calculates the cosine similarity between two rows of a pandas df.

    Only the columns populated in both rows are used.
    Returns 0 when the rows share no populated column (this includes comparing a
    row with itself) or when either magnitude is zero.
  '''
  common_items = find_non_null_column_pairs(df, row_idx1, row_idx2)
  if not common_items:
    return 0

  dot_product = sum(value_1 * value_2 for value_1, value_2 in common_items)
  magnitude_1 = math.sqrt(sum(value_1 ** 2 for value_1, _ in common_items))
  magnitude_2 = math.sqrt(sum(value_2 ** 2 for _, value_2 in common_items))

  if magnitude_1 == 0 or magnitude_2 == 0:
    return 0
  return dot_product / (magnitude_1 * magnitude_2)


def get_max_similarity(df, target_idx, size, score_function):
  '''
    Finds the neighbourhood of the target row that maximizes a score function.

    df: pandas DataFrame; every row position is scored against the target.
    target_idx: position of the target row.
    size: maximum number of neighbours to keep.
    score_function: callable invoked as score_function(df, target_idx, row_idx).

    Returns the kept entries ordered by ascending score.
    Output type: [(a, b), ...] -> a: item score, b: item index.
  '''
  best_entries = []

  for candidate_idx in range(len(df)):
    entry = (score_function(df, target_idx, candidate_idx), candidate_idx)
    heapq.heappush(best_entries, entry)

    # Min-heap: once more than `size` entries are held, drop the current smallest.
    if len(best_entries) > size:
      heapq.heappop(best_entries)

  ranked = list(best_entries)
  ranked.sort(key=lambda entry: entry[0])
  return ranked

In [4]:
def get_prediction_score(df, column, similar_items, target_mean):
  '''
    Predicts the value of `column` for a target item from its neighbours.

    df: pandas DataFrame; neighbours are looked up by row position.
    column: label of the column to predict.
    similar_items: list of neighbours.
      Neighbour's list structure: [(a, b, c), ...] -> a: score, b: index, c: mean
    target_mean: mean of the target item values.

    The prediction is target_mean plus the similarity-weighted average of the
    neighbours' deviations from their own mean, using only the neighbours that
    have a value in `column`.
    Returns None when the neighbour list is empty, when no neighbour has a value
    in `column`, or when the absolute similarities of those neighbours sum to 0.
  '''
  # An empty list would become a 1-D array and make the 2-D slicing below raise IndexError.
  if len(similar_items) == 0:
    return None

  similar_items_arr = np.array(similar_items, dtype=float)

  item_similarities = similar_items_arr[:, 0]
  item_indices = similar_items_arr[:, 1].astype(int)
  item_means = similar_items_arr[:, 2]

  # Read only the requested column instead of materialising every neighbour's full row.
  neighbour_values = df[column].to_numpy()[item_indices]
  has_value = ~np.isnan(neighbour_values)

  n = np.sum(item_similarities[has_value] * (neighbour_values[has_value] - item_means[has_value]))
  d = np.sum(np.abs(item_similarities[has_value]))

  if d == 0:
    return None
  return target_mean + (n / d)



def refactor_similarities(df, similar_items):
  '''
    Extends every neighbour entry with the mean value of the neighbour's row.

    Only the first two fields of each entry (score, index) are read, so entries
    that already carry a mean are accepted and get their mean recomputed.
    Output type: [(a, b, c), ...] -> a: score, b: index, c: mean
  '''
  with_means = []
  for entry in similar_items:
    score, row_idx = entry[0], entry[1]
    with_means.append((score, row_idx, average_value(df, row_idx)))
  return with_means


def get_predictions_based_on_similarity(df, target_item, similar_items, size, column_indexes=None):
  '''
    Predicts the missing values of a target item and keeps the best ones.

    df: pandas DataFrame; rows are addressed by position.
    target_item: position of the target row.
    similar_items: list of neighbours, [(score, index, ...), ...].
    size: maximum number of predictions to return.
    column_indexes: optional iterable of column labels to consider (default: all columns).

    Only columns where the target row is NaN are predicted; columns for which no
    prediction can be computed are skipped.
    Returns the kept predictions ordered by ascending score.
    Output type: [(a, b), ...] -> a: prediction score, b: column label.
  '''
  target_item_mean = average_value(df, target_item)
  neighbours = refactor_similarities(df, similar_items)
  target_row = df.iloc[target_item]

  if column_indexes is None:
    candidate_columns = df.columns
  else:
    candidate_columns = column_indexes

  best_predictions = []
  for column in candidate_columns:
    # Columns the target already has a value for need no prediction.
    if not np.isnan(target_row[column]):
      continue

    prediction = get_prediction_score(df, column, neighbours, target_item_mean)
    if prediction is None:
      continue

    heapq.heappush(best_predictions, (prediction, column))
    # Min-heap: drop the current lowest prediction once the limit is exceeded.
    if len(best_predictions) > size:
      heapq.heappop(best_predictions)

  ranked = list(best_predictions)
  ranked.sort(key=lambda entry: entry[0])
  return ranked

In [5]:
def find_nan_columns_in_group(df, group):
  '''
    Finds the columns that are NaN for every member of a group.

    df: pandas DataFrame; group members are addressed by row position.
    group: iterable of row positions.

    Returns a set of column labels. An empty group yields an empty set.
  '''
  members = list(group)
  if not members:
    return set()

  # One vectorised pass: a column qualifies when it is missing in all the members' rows.
  missing_for_all = df.iloc[members].isna().all(axis=0)
  return set(df.columns[missing_for_all.to_numpy()].tolist())


def random_distinct_indexes(df, num):
  '''
    Picks distinct random row positions of a pandas df.

    The returned values are positions (0 .. len(df) - 1), suitable for df.iloc,
    and not index labels: the rest of the notebook addresses rows with iloc, and
    labels such as 1-based user ids would be shifted by one or out of bounds.

    num: how many positions to pick; clamped to the range [0, len(df)].
    Returns a list of distinct ints in random order.
  '''
  row_count = len(df)
  amount = max(0, min(num, row_count))
  return random.sample(range(row_count), amount)

In [6]:
def average_aggregation(prediction_scores):
  '''
    Group score strategy: the arithmetic mean of the members' prediction scores.
  '''
  group_score = statistics.mean(prediction_scores)
  return group_score


def least_misery(prediction_scores):
  '''
    Group score strategy: the lowest of the members' prediction scores.
  '''
  group_score = min(prediction_scores)
  return group_score


def get_combined_predictions(df, group, group_predictions, group_neighbours):
  '''
    Collects, for every column predicted for at least one member, a score from each member.

    df: pandas DataFrame; group members are addressed by row position.
    group: list of row positions.
    group_predictions: one list per member, [(prediction score, column), ...].
    group_neighbours: one list per member, [(score, index, mean), ...].

    When a member has no stored prediction for a column, it is computed with
    get_prediction_score; if that yields None, 0 is used instead.
    Returns a dict {column: [score of member 0, score of member 1, ...]}.
  '''
  known_by_member = [
    {prediction[1]: prediction[0] for prediction in member_predictions}
    for member_predictions in group_predictions
  ]

  # Register every column predicted for at least one member (first-seen order).
  combined_predictions = {}
  for member_map in known_by_member:
    for column in member_map:
      combined_predictions[column] = []

  member_means = [average_value(df, member) for member in group]
  for column, member_scores in combined_predictions.items():
    for position in range(len(group)):
      if column in known_by_member[position]:
        member_scores.append(known_by_member[position][column])
        continue

      computed = get_prediction_score(df, column, group_neighbours[position], member_means[position])
      member_scores.append(0 if computed is None else computed)

  return combined_predictions


def get_group_recommendation(df, group, neighbourhood_size=50, item_prediction_size=100, predictions_size=10, group_score_function=average_aggregation, score_function=pearson_correlation):
  '''
    Recommends columns to a group of rows.

    df: pandas DataFrame; group members are addressed by row position.
    group: list of row positions.
    neighbourhood_size: neighbours kept per member.
    item_prediction_size: individual predictions kept per member.
    predictions_size: number of group recommendations returned.
    group_score_function: aggregates the members' scores of a column into one group score.
    score_function: similarity used to build the neighbourhoods.

    Only columns that are NaN for every member are considered.
    Returns the recommendations ordered by descending group score.
    Output type: [(a, [b, c]), ...] -> a: column, b: members' scores, c: group score.
  '''
  group_neighbours = []
  for target in group:
    nearest = get_max_similarity(df, target, neighbourhood_size, score_function)
    group_neighbours.append(refactor_similarities(df, nearest))

  nan_columns = find_nan_columns_in_group(df, group)

  group_predictions = []
  for target, similar_users in zip(group, group_neighbours):
    member_predictions = get_predictions_based_on_similarity(df, target, similar_users, item_prediction_size, column_indexes=nan_columns)
    group_predictions.append(member_predictions)

  combined_predictions = get_combined_predictions(df, group, group_predictions, group_neighbours)

  predictions_scores = {}
  for column, member_scores in combined_predictions.items():
    predictions_scores[column] = [member_scores, group_score_function(member_scores)]

  ranked = sorted(predictions_scores.items(), key=lambda entry: entry[1][1], reverse=True)
  return ranked[:predictions_size]


def get_group_recommendation_with_disagreements(df, group, predictions_size=10, group_score_function=average_aggregation, item_prediction_size=500, candidate_size=200, min_disagreement=1e-6):
  '''
    Recommends columns to a group, rewarding the columns its members agree on.

    A pool of candidates is taken from get_group_recommendation, then every group
    score is multiplied by (1 / stdev of the members' scores) ** (1 / group size).

    df: pandas DataFrame; group members are addressed by row position.
    group: list of row positions.
    predictions_size: number of recommendations returned.
    group_score_function: aggregates the members' scores into one group score.
    item_prediction_size: individual predictions kept per member for the pool.
    candidate_size: size of the candidate pool that gets re-ranked.
    min_disagreement: positive lower bound applied to the standard deviation, so
      that identical member scores do not cause a division by zero.

    Returns the recommendations ordered by descending weighted score.
    Output type: [(a, [b, c]), ...] -> a: column, b: members' scores, c: weighted group score.
  '''
  predictions = get_group_recommendation(df, group, item_prediction_size=item_prediction_size, predictions_size=candidate_size, group_score_function=group_score_function)

  group_size = len(group)
  for prediction in predictions:
    member_scores = prediction[1][0]

    # statistics.stdev needs at least two values; with a single score there is
    # no disagreement to measure, so the group score is left untouched.
    if len(member_scores) < 2:
      continue

    disagreement = max(statistics.stdev(member_scores), min_disagreement)
    prediction[1][1] = ((1 / disagreement)**(1/group_size)) * prediction[1][1]

  sorted_predictions = sorted(predictions, key=lambda x: x[1][1], reverse=True)
  return sorted_predictions[:predictions_size]

In [7]:
# Seed the random generator so that a fresh "Restart & Run All" selects the same demo group.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Number of users in the demo group.
DEMO_GROUP_SIZE = 3
users = random_distinct_indexes(df_user_based_cf, DEMO_GROUP_SIZE)

# Same group, ranked first by plain average aggregation and then with the disagreement weighting.
standard_predictions = get_group_recommendation(df_user_based_cf, users, group_score_function=average_aggregation)
predictions_with_disagrements = get_group_recommendation_with_disagreements(df_user_based_cf, users)

In [10]:
def print_group_predictions(group_predictions):
  '''
    Prints one line per group recommendation.

    group_predictions: [(a, [b, c]), ...] -> a: movie id, b: users' predictions, c: group score.
  '''
  for entry in group_predictions:
    movie_id = entry[0]
    users_predictions = entry[1][0]
    group_score = entry[1][1]
    print(f"Movie ID:\t{movie_id}\t\tScore: [{group_score}]\t Users' Predictions: {users_predictions}")

# Show both rankings of the demo group, one after the other.
for heading, ranking in (
  ("Prediction without considering disagreements", standard_predictions),
  ("\nPrediction considering disagreements", predictions_with_disagrements),
):
  print(heading)
  print_group_predictions(ranking)

Prediction without considering disagreements
Movie ID:	2300		Score: [5.278852475436524]	 Users' Predictions: [4.716783216783217, 5.798452468680914, 5.321321740845441]
Movie ID:	3421		Score: [5.100084459944183]	 Users' Predictions: [3.9761341739253937, 6.491140215716487, 4.832978990190668]
Movie ID:	1673		Score: [5.094998476992824]	 Users' Predictions: [4.8701923076923075, 5.6939374185136895, 4.720865704772475]
Movie ID:	933		Score: [4.973349437764876]	 Users' Predictions: [4.542051282051283, 5.174706649282921, 5.203290381960423]
Movie ID:	4011		Score: [4.89961328939587]	 Users' Predictions: [4.340374820425755, 5.829128661709879, 4.529336386051978]
Movie ID:	1207		Score: [4.796935747217571]	 Users' Predictions: [4.558070104844298, 5.353439680957129, 4.479297455851285]
Movie ID:	6711		Score: [4.785359746750945]	 Users' Predictions: [4.273097131447958, 5.144940575608309, 4.938041533196569]
Movie ID:	1293		Score: [4.767060584714325]	 Users' Predictions: [3.9851116625310175, 5.2422632103688

In [None]:
# Testing: run several random groups through both recommenders, once per aggregation strategy
experiments = 10
group_size = 3
score_functions = [average_aggregation, least_misery]

# One entry per experiment; each entry holds, for every score function, the list of the
# users' predictions of the recommended movies (wo = without, wt = with disagreements).
results_wo_dis = []
results_wt_dis = []
for _ in range(experiments):
  users_exp = random_distinct_indexes(df_user_based_cf, group_size)
  wo_dis, wt_dis = [], []

  for function in score_functions:
    plain_recommendations = get_group_recommendation(df_user_based_cf, users_exp, item_prediction_size=500, group_score_function=function)
    wo_dis.append([pred[1][0] for pred in plain_recommendations])

    weighted_recommendations = get_group_recommendation_with_disagreements(df_user_based_cf, users_exp, group_score_function=function)
    wt_dis.append([pred[1][0] for pred in weighted_recommendations])

  results_wo_dis.append(wo_dis)
  results_wt_dis.append(wt_dis)