# Lab 5 - Group Recommenders

## Lab Setup

 * Download and extract: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * Read more: https://grouplens.org/datasets/movielens/
 * [optional] Create virtual enviroment
 `python3 -m venv ./recsyslab5`
 * install needed libraries:
 `pip install numpy pandas matplotlib`

## Part 1 - Data preparation and preprocessing

In [1]:
# import needed packages

import math
import numpy as np
import pandas

from random import choice, sample
from statistics import mean, stdev

from reco_utils import *

In [2]:
# read user ratings and calculate expected movie ratings with collaborative filtering

raw_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
movies = list(raw_ratings['movieId'].unique())
users = list(raw_ratings['userId'].unique())
ratings = get_predicted_ratings(raw_ratings)
ratings

Total error: 206753.5768846482
Total error: 200426.5449988558
Total error: 194542.36289956825
Total error: 189055.95861007823
Total error: 183927.98457668154
Total error: 179123.96546457818
Total error: 174613.591590174
Total error: 170370.12967927157
Total error: 166369.92882904835
Total error: 162592.00424313766
Total error: 159017.68489771418
Total error: 155630.31406491317
Total error: 152414.99377303995
Total error: 149358.36597050272
Total error: 146448.42449276592
Total error: 143674.3529908809
Total error: 141026.3848279612
Total error: 138495.6816327906
Total error: 136074.22775299044
Total error: 133754.73830094835
Total error: 131530.57885496898
Total error: 129395.695182089
Total error: 127344.55160045293
Total error: 125372.07680803278
Total error: 123473.61617876054
Total error: 121644.88967308539
Total error: 119881.95463265023
Total error: 118181.17283224627
Total error: 116539.18124973624
Total error: 114952.86608892484
Total error: 113419.33965355462
Total error: 1119

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,6,7,8,4,3,8,5,6,9,5,...,4,6,10,6,6,7,5,5,6,4
2,6,4,1,5,3,0,10,2,0,4,...,10,10,0,7,10,2,5,9,3,9
3,6,4,9,0,0,10,3,4,4,0,...,8,1,9,8,6,10,10,4,5,6
4,5,5,3,6,9,7,6,10,8,8,...,6,9,7,0,7,9,10,8,9,6
5,5,10,10,3,6,9,4,10,0,8,...,10,9,3,6,0,10,10,7,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,6,6,6,6,6,6,6,6,6,6,...,6,6,7,6,6,6,6,6,7,6
607,7,8,2,9,4,7,5,4,3,8,...,3,4,6,10,0,10,2,6,7,10
608,6,6,6,6,6,6,7,5,6,7,...,5,7,6,7,6,5,5,6,6,6
609,3,0,10,0,7,10,10,7,10,4,...,0,10,10,0,10,0,6,2,0,10


In [3]:
# read movie categories

movies_metadata = pandas.read_csv('ml-latest-small/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
df = pandas.read_csv('groups.csv')
df.shape

(8, 5)

In [5]:
# read example user groups
groups = pandas.read_csv('groups.csv',header=None).values.tolist()
groups

[[606, 274, 474, 599, 448],
 [111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [6]:
# helper function

def describe_group(group, N=10):
    print(f'\n\nUser ids: {group}')
    group_size = len(group)
    
    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')
    
    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[:N].index)]
    
    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')

describe_group(groups[7])



User ids: [243, 527, 418, 118, 370]

Mean ratings deviation: 3.139796671117475
Median ratings deviation: 3.1937438845342627
Standard deviation of ratings deviation: 0.890420398263921

Best movies:
Fathers' Day (1997), 9.6*
National Security (2003), 9.6*
Conformist, The (Conformista, Il) (1970), 9.6*
Three O'Clock High (1987), 9.6*
Divided We Fall (Musíme si pomáhat) (2000), 9.4*
Modern Problems (1981), 9.4*
King and I, The (1956), 9.4*
Runaway Bride (1999), 9.4*
Eden Lake (2008), 9.4*
Benny & Joon (1993), 9.4*

Worst movies:
Rocky IV (1985), 0.8*
Hyde Park on Hudson (2012), 1.0*
Sorority Babes in the Slimeball Bowl-O-Rama (1988), 1.2*
Last Stand, The (2013), 1.2*
Big Green, The (1995), 1.2*
Romy and Michele's High School Reunion (1997), 1.2*
Vegas Vacation (National Lampoon's Las Vegas Vacation) (1997), 1.2*
Jesus of Montreal (Jésus de Montréal) (1989), 1.2*
Into the Blue (2005), 1.4*
Freezer (2014), 1.4*


## Part 2 - simple algorithms

In [7]:
# define an interface for all recommending algorithms

class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass

# random algorithm - baseline and comparison
    
class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'
        
    def recommend(self, movies, ratings, group, size):
        return sample(movies, size)

In [8]:
# recommend movies with the highest average rating

class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'
    
    def recommend(self, movies, ratings, group, size):
        return ratings.loc[group,].mean().sort_values(ascending=False)[:size].index   

In [64]:
# highest average rating but without movies with at least one score below the threshold

class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold
        
    def recommend(self, movies, ratings, group, size):
        rating = ratings.loc[group,]
        idx = rating.columns[(rating<self.score_threshold).any(axis=0)]
        return rating.loc[:,idx].mean().sort_values(ascending=False)[:min(size,len(idx))].index

In [None]:
# preferences of only one user in every iteration

class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'
    
    def recommend(self, movies, ratings, group, size):
        preference = {g : ratings.loc[g,].sort_values(ascending=False).index for g in group}
        idx = {g : 0 for g in group}
        result = set()
        for i in range(size):
            g = groups[i%size]
            while preference[g][idx[g]] in result:
                idx[g]+=1
            result += preference[g][idx[g]]
        return list(result)

In [None]:
# election algorithms (dictatorship, Bord, Copeland, simple vote)

class VotingRecommender(Recommender):
    def __init__(self):
        self.name = "Bord"
    
    def recommend(self, movies, ratings, group, size):
        rating = ratings.loc[group,]
        full_range = np.arange(ratings.shape[1])
        for g in group:
            rating.loc[g,rating.loc[g,].sort_values().index] = full_range
        return rating.sum(axis=0).sort_values(ascending=False)[:size].index

In [None]:
# greedy algorithm, approximate Proportional Approval Voting
#in every iteration chose the movie that improves the PAV score

class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'
        
    def recommend(self, movies, ratings, group, size):
        raise NotImplementedError()

## Part 3 - Objective function

In [None]:
# 2 helper functions
# find user's favourite movies
# calculate the sum of ratings given by a user for all movies in a give recommendation

def top_n_movies_for_user(ratings, movies, user_id, n):
    raise NotImplementedError()

def total_score(recommendation, user_id, ratings):
    raise NotImplementedError()

In [None]:
# function to calculate the satisfaction score for each user
# fraction - satisfaction from generated recommendation/satisfaction from ideal recommendation
def overall_user_satisfaction(recommendation, user_id, movies, ratings):
    raise NotImplementedError()

# objective function - average satisfaction for all users in a group
def overall_group_satisfaction(recommendation, group, movies, ratings):
    raise NotImplementedError()

# objective function - difference between the highest and lowest satisfaction score
def group_disagreement(recommendation, group, movies, ratings):
    raise NotImplementedError()

## Part 4 - Sequential Hybrid Aggregation

In [None]:
# Algorithm to balance between highest scored elements and highest minimal score
# in each iteration calcuate the alpha parameter
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
    
    def recommend(self, movies, ratings, group, size):
        raise NotImplementedError()

## Part 5 - Algorithm comparison

In [None]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(),
    ProportionalApprovalVotingRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

# for each algorithm:
#  - generate one recommendation for each group
#  - calculatevalue of both objective functions for each recommendation
#  - calculate mean and standard deviation for both objective functions
#  - print the results


for recommender in recommenders:
    raise NotImplementedError()