In [1]:
import pandas as pd
import numpy as np
import random
import statistics

import sys
sys.path.append('../scripts')
import recommender

Datasets: https://grouplens.org/datasets/movielens/

In [2]:
# Dataset loading
data_dir = '../ml-latest-small'

df_movies = pd.read_csv(f"{data_dir}/movies.csv")
df_ratings = pd.read_csv(f"{data_dir}/ratings.csv")

# Build the user-item rating matrix: one row per userId, one column per movieId,
# each cell holding the rating that user gave to that movie (NaN means the rating is unknown).
# `pivot` performs this reshape directly; it avoids the per-user Python lambda of
# `groupby().apply(...)` (slow, and deprecated in recent pandas when the lambda touches the
# grouping column) and always returns a DataFrame regardless of how the ratings are distributed.
df_user_based_cf = df_ratings.pivot(index='userId', columns='movieId', values='rating')

In [3]:
def find_nan_columns_in_group(df, group):
  """Return the labels of the columns that are NaN in every row of the group.

  Args:
    df: DataFrame whose rows are looked up with ``df.iloc``.
    group: list of row positions identifying the group members.

  Returns:
    List of column labels whose value is NaN for all the selected rows.
  """
  # NOTE(review): rows are selected by *position* (`iloc`), not by index label;
  # confirm that callers pass positions, or that labels and positions coincide.
  members = df.iloc[group]
  missing_per_column = members.isna().sum(axis=0)

  # A column is kept only when its NaN count equals the number of group members.
  missing_for_everyone = missing_per_column == len(group)

  return missing_per_column.index[missing_for_everyone.to_numpy()].tolist()


def random_distinct_indexes(df, num):
  """Pick `num` entries of `df.index` in random order.

  Args:
    df: DataFrame whose index values are sampled.
    num: upper bound of the slice taken from the shuffled index values.

  Returns:
    The first `num` values of a shuffled copy of `df.index` (fewer when the
    index is shorter than `num`). The values are distinct as long as the
    index itself has no duplicates.
  """
  # `tolist()` returns a fresh list, so shuffling does not touch `df.index`.
  shuffled_labels = df.index.tolist()
  random.shuffle(shuffled_labels)

  selection = shuffled_labels[:num]
  return selection

In [4]:
def average_aggregation(prediction_scores):
  """Aggregate the members' scores into a group score by taking their arithmetic mean."""
  group_score = statistics.mean(prediction_scores)
  return group_score


def least_misery(prediction_scores):
  """Aggregate the members' scores into a group score by taking the lowest one."""
  lowest_score = min(prediction_scores)
  return lowest_score


def get_combined_predictions(group_predictions):
  """Merge the per-user prediction lists into one entry per item.

  Args:
    group_predictions: one list per group member; each element is indexable
      with the predicted score at position 0 and the item id at position 1.

  Returns:
    A list of `[item, scores]` lists, in the order the items are first seen,
    where `scores` holds the members' predicted scores for that item in
    member order. Returns an empty list when `group_predictions` is empty.
    An item missing from some member's list ends up with fewer scores than
    there are members.
  """
  # Scores are matched by item id rather than by list position, so members whose
  # predictions come back in a different order (or with a different length) do not
  # get their scores attached to the wrong item. Dicts keep insertion order, which
  # preserves the item order of the first member's list.
  scores_by_item = {}
  for user_predictions in group_predictions:
    for prediction in user_predictions:
      score, item = prediction[0], prediction[1]
      scores_by_item.setdefault(item, []).append(score)

  # Entries are lists (not tuples) because callers insert the group score at position 0.
  return [[item, scores] for item, scores in scores_by_item.items()]


def get_group_recommendation(df, group, neighbourhood_size=50, predictions_size=10, group_score_function=average_aggregation, score_function=recommender.pearson_correlation):
  """Compute the top predictions for a group of users.

  Args:
    df: user-item ratings DataFrame handed to the `recommender` helpers.
    group: list of users forming the group.
    neighbourhood_size: forwarded to `recommender.get_neighborhood`.
    predictions_size: forwarded to `recommender.get_top_k_predictions`.
    group_score_function: callable turning the list of members' predicted
      scores for an item into a single group score.
    score_function: similarity function forwarded to `recommender.get_neighborhood`.

  Returns:
    Whatever `recommender.get_top_k_predictions` returns for entries shaped as
    `[group_score, item, members_scores]`.
  """
  # Neighbourhood of every member, computed first and in group order.
  neighbourhoods = []
  for member in group:
    neighbourhoods.append(recommender.get_neighborhood(df, member, neighbourhood_size, score_function))

  # Only items that are NaN for every group member are candidates.
  candidate_items = find_nan_columns_in_group(df, group)

  per_member_predictions = []
  for member, neighbours in zip(group, neighbourhoods):
    per_member_predictions.append(
      recommender.get_items_predictions_based_on_similarity(df, member, neighbours, candidate_items)
    )

  scored_items = get_combined_predictions(per_member_predictions)
  for entry in scored_items:
    members_scores = entry[1]
    # Prepend the aggregated score: [item, scores] -> [group_score, item, scores].
    entry.insert(0, group_score_function(members_scores))

  return recommender.get_top_k_predictions(scored_items, predictions_size)


def get_group_recommendation_with_disagreements(df, group, neighbourhood_size=50, predictions_size=10, group_predictions_size=300, group_score_function=average_aggregation, score_function=recommender.pearson_correlation):
  """Compute group predictions re-ranked so that items the members agree on are favoured.

  A larger pool of `group_predictions_size` group predictions is computed first;
  each group score is then multiplied by `(1 / st_dev) ** (1 / group_size)`, where
  `st_dev` is the sample standard deviation of the members' predicted scores, and
  the best `predictions_size` entries of the re-scored pool are returned.

  Args:
    df: user-item ratings DataFrame handed to `get_group_recommendation`.
    group: list of users forming the group.
    neighbourhood_size: forwarded to `get_group_recommendation`.
    predictions_size: number of predictions requested from the final ranking.
    group_predictions_size: size of the candidate pool that gets re-scored.
    group_score_function: callable aggregating the members' scores into a group score.
    score_function: similarity function forwarded to `get_group_recommendation`.

  Returns:
    Whatever `recommender.get_top_k_predictions` returns for the re-scored
    `[score, item, members_scores]` entries.
  """
  # Floor for the standard deviation: keeps the weight finite when all the
  # members predict exactly the same score, while still ranking such items
  # above every item with a measurable disagreement.
  min_st_dev = 1e-6

  predictions = get_group_recommendation(df, group, neighbourhood_size, group_predictions_size, group_score_function=group_score_function, score_function=score_function)

  group_size = len(group)
  for prediction in predictions:
    predicted_score = prediction[0]
    users_predictions = prediction[2]

    if len(users_predictions) < 2:
      # The sample standard deviation is undefined for fewer than two scores
      # (statistics.stdev raises): there is no disagreement to account for,
      # so the aggregated score is left untouched.
      continue

    st_dev = max(statistics.stdev(users_predictions), min_st_dev)
    prediction[0] = ((1 / st_dev) ** (1 / group_size)) * predicted_score

  return recommender.get_top_k_predictions(predictions, predictions_size)

In [5]:
# Seed the global RNG so that Restart & Run All always selects the same group
# and the outputs below stay reproducible.
random.seed(42)
users = random_distinct_indexes(df_user_based_cf, 3)

standard_predictions = get_group_recommendation(df_user_based_cf, users, group_score_function=average_aggregation)
predictions_with_disagrements = get_group_recommendation_with_disagreements(df_user_based_cf, users)

In [6]:
def float_approx(num):
  """Format a number as a string with exactly two decimal digits."""
  return f"{num:.2f}"

def float_list_approx(nums):
  """Format each number with `float_approx` and join the results with ', '."""
  return ', '.join(map(float_approx, nums))

def print_group_predictions(group_predictions):
  """Print one line per group prediction: movie id, group score and members' scores.

  Args:
    group_predictions: list of entries indexable as
      `[group_score, movie_id, members_scores]`. An empty list prints nothing.
  """
  # Width of the longest movie id, used to left-align the id column.
  # `default=0` keeps an empty prediction list from raising ValueError.
  max_id_length = max((len(str(prediction[1])) for prediction in group_predictions), default=0)

  for prediction in group_predictions:
    movie_id = prediction[1]
    predicted_score = float_approx(prediction[0])
    user_predictions = float_list_approx(prediction[2])
    print(f"Movie ID: {movie_id:<{max_id_length}}\tScore: [{predicted_score}]\t Users' Predictions: [{user_predictions}]")

# Show both rankings one after the other: first the plain aggregation,
# then the one re-scored by the members' disagreement.
for _heading, _ranked_predictions in (
  ("Prediction without considering disagreements", standard_predictions),
  ("\nPrediction considering disagreements", predictions_with_disagrements),
):
  print(_heading)
  print_group_predictions(_ranked_predictions)

Prediction without considering disagreements
Movie ID: 3972	Score: [5.75]	 Users' Predictions: [5.59, 6.55, 5.11]
Movie ID: 1394	Score: [5.28]	 Users' Predictions: [4.59, 5.71, 5.53]
Movie ID: 1259	Score: [5.18]	 Users' Predictions: [5.29, 5.54, 4.72]
Movie ID: 750 	Score: [5.14]	 Users' Predictions: [4.52, 6.29, 4.61]
Movie ID: 3471	Score: [5.12]	 Users' Predictions: [4.81, 5.61, 4.94]
Movie ID: 4034	Score: [5.11]	 Users' Predictions: [4.39, 5.85, 5.10]
Movie ID: 2289	Score: [5.11]	 Users' Predictions: [5.63, 4.60, 5.09]
Movie ID: 1616	Score: [5.04]	 Users' Predictions: [4.13, 5.73, 5.27]
Movie ID: 3499	Score: [5.04]	 Users' Predictions: [4.55, 5.52, 5.06]
Movie ID: 3727	Score: [5.00]	 Users' Predictions: [4.90, 5.86, 4.25]

Prediction considering disagreements
Movie ID: 1103	Score: [13.52]	 Users' Predictions: [3.73, 3.73, 3.70]
Movie ID: 2431	Score: [10.61]	 Users' Predictions: [4.32, 4.34, 4.22]
Movie ID: 3053	Score: [8.50]	 Users' Predictions: [3.51, 3.63, 3.50]
Movie ID: 1997	Sco