# Lab 5 - Group Recommenders

## Lab Setup

 * Download and extract: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * Read more: https://grouplens.org/datasets/movielens/
 * [optional] Create virtual enviroment
 `python3 -m venv ./recsyslab5`
 * install needed libraries:
 `pip install numpy pandas matplotlib`

## Part 1 - Data preparation and preprocessing

In [1]:
# import needed packages

import math
import numpy as np
import pandas

from random import choice, sample
from statistics import mean, stdev

import pandas as pd

from reco_utils import *

In [2]:
# read user ratings and calculate expected movie ratings with collaborative filtering

raw_ratings = pandas.read_csv("ml-latest-small/ratings.csv").drop(columns=["timestamp"])
movies = list(raw_ratings["movieId"].unique())
users = list(raw_ratings["userId"].unique())
ratings = get_predicted_ratings(raw_ratings)
ratings.head()

Total error: 209368.3765321304
Total error: 203045.75593634343
Total error: 197139.04009666867
Total error: 191609.073579633
Total error: 186421.30643779968
Total error: 181545.15717202215
Total error: 176953.47702384955
Total error: 172622.09723988906
Total error: 168529.44469713667
Total error: 164656.21417695555
Total error: 160985.08783514946
Total error: 157500.494190382
Total error: 154188.40035700426
Total error: 151036.13236624908
Total error: 148032.21931570617
Total error: 145166.2578095068
Total error: 142428.79373771747
Total error: 139811.2189214662
Total error: 137305.68054231742
Total error: 134905.0015974568
Total error: 132602.6108897298
Total error: 130392.48128407296
Total error: 128269.0751477164
Total error: 126227.29604738702
Total error: 124262.44590793445
Total error: 122370.18694763038
Total error: 120546.50779932152
Total error: 118787.69330648301
Total error: 117090.29755131106
Total error: 115451.11973022803
Total error: 113867.18254208622
Total error: 11233

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,6,7,10,6,7,7,5,4,7,6,...,9,7,8,6,8,3,4,7,6,4
2,2,0,0,1,9,7,7,8,3,10,...,7,10,2,10,7,3,10,2,0,10
3,4,10,1,1,8,10,0,10,0,1,...,10,9,0,0,10,8,10,10,0,0
4,7,4,5,8,6,10,4,9,10,4,...,4,2,10,6,6,5,0,2,0,10
5,3,2,10,5,10,2,7,0,0,10,...,7,8,8,5,0,6,10,8,6,3


In [3]:
# read movie categories

movies_metadata = pandas.read_csv("ml-latest-small/movies.csv").set_index("movieId")
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
df = pandas.read_csv("groups.csv")
df.shape

(8, 5)

In [5]:
# read example user groups
groups = pandas.read_csv("groups.csv", header=None).values.tolist()
groups

[[606, 274, 474, 599, 448],
 [111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [6]:
# helper function


def describe_group(group, N=10):
    print(f"\n\nUser ids: {group}")
    group_size = len(group)

    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f"\nMean ratings deviation: {mean_stdev}")
    print(f"Median ratings deviation: {median_stdev}")
    print(f"Standard deviation of ratings deviation: {std_stdev}")

    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [
        (movies_metadata["title"][movie_id], average_scores[movie_id])
        for movie_id in list(average_scores[-N:].index)
    ]
    worst_movies = [
        (movies_metadata["title"][movie_id], average_scores[movie_id])
        for movie_id in list(average_scores[:N].index)
    ]

    print("\nBest movies:")
    for movie, score in best_movies[::-1]:
        print(f"{movie}, {score}*")
    print("\nWorst movies:")
    for movie, score in worst_movies:
        print(f"{movie}, {score}*")


describe_group(groups[7])



User ids: [243, 527, 418, 118, 370]

Mean ratings deviation: 3.1911531189839435
Median ratings deviation: 3.286335345030997
Standard deviation of ratings deviation: 0.9092501305874031

Best movies:
It Can't Be! (1975), 9.4*
Nature Calls (2012), 9.4*
Halloween III: Season of the Witch (1982), 9.4*
Beneath the Planet of the Apes (1970), 9.4*
Jack Whitehall: At Large (2017), 9.2*
Laws of Attraction (2004), 9.2*
Chasers (1994), 9.0*
Zombieland (2009), 9.0*
Happening, The (2008), 9.0*
Taming of the Shrew, The (1967), 9.0*

Worst movies:
Alaska (1996), 1.6*
Gotcha! (1985), 1.6*
Real McCoy, The (1993), 2.0*
Upside Down: The Creation Records Story (2010), 2.2*
Short Term 12 (2013), 2.2*
Sullivan's Travels (1941), 2.2*
Resident Evil: Retribution (2012), 2.4*
Hope Floats (1998), 2.4*
Stray Dog (Nora inu) (1949), 2.4*
Grand Theft Parsons (2003), 2.4*


## Part 2 - simple algorithms

In [7]:
# define an interface for all recommending algorithms


class Recommender:
    def recommend(self, ratings, group, size):
        pass


# random algorithm - baseline and comparison


class RandomRecommender(Recommender):
    def __init__(self):
        self.name = "random"

    def recommend(self, ratings, group, size):
        return pd.Index(sample(ratings.columns.to_list(), size))

In [8]:
# recommend movies with the highest average rating


class AverageRecommender(Recommender):
    def __init__(self):
        self.name = "average"

    def recommend(self, ratings, group, size):
        return ratings.loc[group,].mean().sort_values(ascending=False).iloc[:size].index

In [9]:
# highest average rating but without movies with at least one score below the threshold


class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = "average_without_misery"
        self.score_threshold = score_threshold

    def recommend(self, ratings, group, size):
        rating = ratings.loc[group,]
        idx = rating.columns[(rating > self.score_threshold).all(axis=0)]
        return (
            rating.loc[:, idx]
            .mean()
            .sort_values(ascending=False)[: min(size, len(idx))]
            .index
        )

In [10]:
# preferences of only one user in every iteration


class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = "fairness"

    def recommend(self, ratings, group, size):
        preference = {
            g: ratings.loc[g,].sort_values(ascending=False).index for g in group
        }
        idx = {g: 0 for g in group}
        result = set()
        for i in range(size):
            g = group[i % len(group)]
            while preference[g][idx[g]] in result:
                idx[g] += 1
            result.add(preference[g][idx[g]])
        return pd.Index(result)

In [11]:
# election algorithms (dictatorship, Bord, Copeland, simple vote)


class VotingRecommender(Recommender):
    def __init__(self):
        self.name = "Bord"

    def recommend(self, ratings, group, size):
        rating = ratings.loc[group,].copy()
        full_range = np.arange(ratings.shape[1])
        for g in group:
            rating.loc[g, rating.loc[g,].sort_values().index] = full_range
        return rating.sum(axis=0).sort_values(ascending=False).iloc[:size].index

In [12]:
# greedy algorithm, approximate Proportional Approval Voting
# in every iteration chose the movie that improves the PAV score


class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = "PAV"

    def recommend(self, ratings, group, size):
        ratings = (ratings.loc[group,] > self.threshold).astype("float32").copy()
        result = []
        points = np.ones(len(group)).reshape(-1, 1)
        for _ in range(size):
            scores = ratings / points
            scores = scores.sum(axis=0).sort_values(ascending=False)
            best_idx = scores.iloc[:1].index.values[0]
            points += ratings.loc[:, best_idx].to_numpy().reshape(-1, 1)
            result.append(best_idx)
            ratings.loc[:, best_idx] *= 0
        return pd.Index(result)

In [13]:
# 2 helper functions
# find user's favourite movies
# calculate the sum of ratings given by a user for all movies in a give recommendation


def top_n_movies_for_user(ratings, user_id, n):
    return ratings.loc[user_id,].sort_values(ascending=False)[:n].index


def total_score(recommendation, user_id, ratings):
    return ratings.loc[user_id, recommendation].sum()

In [14]:
# function to calculate the satisfaction score for each user
# fraction - satisfaction from generated recommendation/satisfaction from ideal recommendation
def overall_user_satisfaction(recommendation, user_id, ratings):
    num = total_score(recommendation, user_id, ratings)
    den = total_score(
        top_n_movies_for_user(ratings, user_id, len(recommendation)), user_id, ratings
    )
    return num / den


# objective function - average satisfaction for all users in a group
def overall_group_satisfaction(recommendation, group, ratings):
    return np.average(
        [overall_user_satisfaction(recommendation, id, ratings) for id in group]
    )


# objective function - difference between the highest and lowest satisfaction score
def group_disagreement(recommendation, group, ratings):
    group_satisfaction = [
        overall_user_satisfaction(recommendation, id, ratings) for id in group
    ]
    return np.max(group_satisfaction) - np.min(group_satisfaction)

## Part 4 - Sequential Hybrid Aggregation

In [15]:
# Algorithm to balance between highest scored elements and highest minimal score
# in each iteration calcuate the alpha parameter
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = "sequential_hybrid_aggregation"

    def recommend(self, ratings, group, size):
        rating = ratings.loc[group,].copy()
        alpha = 0
        for _ in range(10):
            score = (1 - alpha) * rating.mean() + alpha * rating.min(axis=0)
            recommendation = score.sort_values(ascending=False).iloc[:size].index
            alpha = group_disagreement(recommendation, group, ratings)
        return recommendation

## Part 5 - Algorithm comparison

In [16]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(),
    ProportionalApprovalVotingRecommender(5),
    SequentialHybridAggregationRecommender(),
]

recommendation_size = 10

# for each algorithm:
#  - generate one recommendation for each group
#  - calculate the value of both objective functions for each recommendation
#  - calculate mean and standard deviation for both objective functions
#  - print the results

size = 10

for recommender in recommenders:
    print(recommender.name)
    recomendations = [recommender.recommend(ratings, g, size) for g in groups]
    satisfactions = [
        overall_group_satisfaction(r, g, ratings)
        for r, g in zip(recomendations, groups)
    ]
    disagreements = [
        group_disagreement(r, g, ratings) for r, g in zip(recomendations, groups)
    ]
    print(
        f"satisfaction\n - mean :               {np.mean(satisfactions)}\n - standard deviation : {np.std(satisfactions)}"
    )
    print(
        f"disagreement\n - mean :               {np.mean(disagreements)}\n - standard deviation : {np.std(disagreements)}"
    )
    print()

random
satisfaction
 - mean :               0.6560108604845447
 - standard deviation : 0.11157338655768212
disagreement
 - mean :               0.1965079365079365
 - standard deviation : 0.1057295379625903

average
satisfaction
 - mean :               0.9669224450013925
 - standard deviation : 0.022207924044160627
disagreement
 - mean :               0.06254385964912282
 - standard deviation : 0.057886430850763504

average_without_misery
satisfaction
 - mean :               0.9668430799220273
 - standard deviation : 0.022330910451695513
disagreement
 - mean :               0.07087719298245615
 - standard deviation : 0.05702115166640759

fairness
satisfaction
 - mean :               0.7260971874129768
 - standard deviation : 0.11439173739353786
disagreement
 - mean :               0.18808201058201057
 - standard deviation : 0.11044192539633478

Bord
satisfaction
 - mean :               0.9469387357282094
 - standard deviation : 0.030242758621731782
disagreement
 - mean :               0