# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas scipy matplotlib`

## Część 1. - przygotowanie danych

In [82]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds


from random import choice, sample
from statistics import mean, stdev
from collections import defaultdict

In [83]:
PATH = 'ml-latest-small'

In [84]:
# wczytujemy oceny uzytkownikow i obliczamy (za pomoc dekompozycji macierzy) wszystkie przewidywane oceny filmow

def read_ratings(path, k=600, scale_factor=2.0, print_stats=True):
    # idea: https://www.kaggle.com/code/indralin/movielens-project-1-2-collaborative-filtering
    reviews = pd.read_csv(f'{path}/ratings.csv', names=['userId', 'movieId', 'rating', 'time'], delimiter=',', engine='python', skiprows=1)
    
    reviews.drop(['time'], axis=1, inplace=True)
    reviews_no, _ = reviews.shape
    reviews_matrix = reviews.pivot(index='userId', columns='movieId', values='rating')
    movies = reviews_matrix.columns
    users = reviews_matrix.index
    users_no, movies_no = reviews_matrix.shape
    print(f'Got {reviews_no} reviews for {movies_no} movies and {users_no} users.')

    user_ratings_mean = np.nanmean(reviews_matrix.values, axis=1)
    normalized_reviews_matrix = np.nan_to_num(reviews_matrix.values - user_ratings_mean.reshape(-1, 1), 0.0)

    U, sigma, Vt = svds(normalized_reviews_matrix, k=k)
    sigma = np.diag(sigma)
    predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1).clip(0.5, 5.0)
    mean_square_error = np.nanmean(np.square(predicted_ratings - reviews_matrix.values))
    std_square_error = np.nanstd(np.square(predicted_ratings - reviews_matrix.values))
    print(f'Reviews prediction mean square error = {mean_square_error}')
    print(f'Reviews prediction standatd deviation of square error = {std_square_error}')

    if print_stats:
        stats = [
            ('metric', 'dataset', 'prediction'),
            ('avg', np.nanmean(reviews_matrix), np.mean(predicted_ratings)),
            ('st_dev', np.nanstd(reviews_matrix), np.std(predicted_ratings)),
            ('median', np.nanmedian(reviews_matrix), np.median(predicted_ratings)),
            ('p25', np.nanquantile(reviews_matrix, 0.25), np.quantile(predicted_ratings, 0.25)),
            ('p75', np.nanquantile(reviews_matrix, 0.75), np.quantile(predicted_ratings, 0.75))
        ]
        print('Stats (for raings in original range [0.5, 5.0]):')
        print('\n'.join([str(s) for s in stats]))

    rounded_predictions = np.rint(scale_factor * predicted_ratings) # cast values to {1, 2, ..., 10}
    return pd.DataFrame(data=rounded_predictions, index=list(users), columns=list(movies))
    
ratings = read_ratings(PATH)
# dostep do danych:
# ratings[movieId][userId] pobiera 1 wartosc
# ratings.loc[:, movieId] pobiera wektor dla danego filmu
# ratings.loc[userId, :] pobiera wektor dla danego uzytkownika
ratings

Got 100836 reviews for 9724 movies and 610 users.
Reviews prediction mean square error = 1.6577787842924512e-05
Reviews prediction standatd deviation of square error = 0.0007928950536518275
Stats (for raings in original range [0.5, 5.0]):
('metric', 'dataset', 'prediction')
('avg', 3.501556983616962, 3.657222337747399)
('st_dev', 1.042524069618056, 0.49546237560971024)
('median', 3.5, 3.705224008811769)
('p25', 3.0, 3.3574517183164025)
('p75', 4.0, 3.999981626883001)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,8.0,9.0,8.0,9.0,9.0,8.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
2,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
3,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
4,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
5,8.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,5.0,7.0,7.0,7.0,7.0,7.0,5.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
607,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
608,5.0,4.0,4.0,6.0,6.0,6.0,6.0,6.0,6.0,8.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
609,6.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,8.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


In [85]:
# wczytujemy nazwy filmow i kategorie

movies_metadata = pd.read_csv('ml-latest-small/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [86]:
# wczytujemy przykladowe grupy uzytkownikow
groups = pd.read_csv('groups.csv').values.tolist()
groups

[[111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [87]:
# przygotowujemy funkcje pomocnicza

def describe_group(group, N=10):
    print(f'\n\nUser ids: {group}')
    group_size = len(group)
    
    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')
    
    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[:N].index)]
    
    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')

describe_group(groups[5])



User ids: [269, 360, 469, 287, 308]

Mean ratings deviation: 1.1259149574579788
Median ratings deviation: 1.0954451150103321
Standard deviation of ratings deviation: 0.17836724055768716

Best movies:
Toy Story (1995), 8.2*
Forrest Gump (1994), 8.2*
Willy Wonka & the Chocolate Factory (1971), 8.0*
Braveheart (1995), 8.0*
Terminator 2: Judgment Day (1991), 7.8*
Schindler's List (1993), 7.8*
Dances with Wolves (1990), 7.6*
James and the Giant Peach (1996), 7.6*
Dead Man Walking (1995), 7.6*
Nixon (1995), 7.6*

Worst movies:
Broken Arrow (1996), 5.2*
Sleepy Hollow (1999), 5.4*
The Devil's Advocate (1997), 5.4*
Cable Guy, The (1996), 5.4*
Mission: Impossible (1996), 5.6*
Nutty Professor, The (1996), 5.6*
Down Periscope (1996), 5.8*
Cheech and Chong's Up in Smoke (1978), 5.8*
Wrong Man, The (1956), 5.8*
Fog, The (2005), 5.8*


## Część 2. - algorytmy proste

In [88]:
# zdefiniujmy interfejs dla wszystkich algorytmow rekomendacyjnych

class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass


# jako pierwszy zaimplementujemy algorytm losowy - dla porownania
    
class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'
        
    def recommend(self, movies, ratings, group, size):
        return movies.sample(n = size)

In [89]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen

class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'
    
    def recommend(self, movies, ratings, group, size):
        best_ratings_ids = (
            ratings.iloc[group]
                .mean(axis = 0)
                .nlargest(size)
                .index
        )
        return movies.loc[best_ratings_ids]
    #
#

In [90]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen,
# ale rownoczesnie wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej thresholdu

class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold
        
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        best_ratings_ids = (
            group_ratings
                .loc[:, (group_ratings >= self.score_threshold).all()]
                .mean()
                .nlargest(size)
                .index
        )
        return movies.loc[best_ratings_ids]
    #
#

In [91]:
# algorytm uwzgledniajacy preferencje tylko jednego uzytkownika w kazdej iteracji

class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'
        
    # def recommend(self, movies, ratings, group, size):
    #     group_ratings = ratings.loc[group]
        
    #     best_ratings_ids = []
    #     used_columns = set()

    #     for _, row in group_ratings.drop(columns=used_columns).iterrows():
    #         sorted_columns = row.sort_values(ascending = False).index
    #         for column in sorted_columns:
    #             if column not in used_columns:
    #                 best_ratings_ids.append(column)
    #                 used_columns.add(column)
    #                 break            
    #     #

    #     return movies.loc[best_ratings_ids]
    
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        
        best_ratings_ids = []
        used_columns = set()

        for _, row in group_ratings.iterrows():
            available_movies = row.drop(labels=used_columns)
            
            if len(available_movies) > 0:
                movie_id = available_movies.idxmax()
                best_ratings_ids.append(movie_id)
                used_columns.add(movie_id)
        
        return movies.loc[best_ratings_ids]
#

In [92]:
# wybrany algorytm wyborczy (dyktatura, Borda, Copeland)

class VotingRecommender(Recommender):
    def __init__(self):
        self.name = 'borda'
    
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        points_per_movie_id_dict = defaultdict(lambda: 0)

        for _, row in group_ratings.iterrows():
            sorted_columns = row.sort_values().index
            point = 0

            for point, column in enumerate(sorted_columns):
                points_per_movie_id_dict[column] += point
            #
        #

        best_ratings_ids = (
            sorted(points_per_movie_id_dict.items(), 
                   key=lambda x: x[1], 
                   reverse=True)[:size]
        )
        best_ratings_ids = [movie_id for movie_id, _ in best_ratings_ids]
        
        return movies.loc[best_ratings_ids]
#

In [93]:
# algorytm zachlanny, aproksymujacy metode Proportional Approval Voting
# w kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV

class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'
    
    def calculate_movies_satisfaction(self, movie, user_satisfaction_dict):
        result = 0.0

        for user_id, rating in movie.items():
            result += (1.0 / user_satisfaction_dict[user_id]) if rating >= self.threshold else 0
        
        return result
    #

    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        
        best_ratings_ids = []
        used_movie_ids = set()
        user_satisfaction_dict = defaultdict(lambda: 1)

        while len(best_ratings_ids) < size:
            current_best_movie_id = (
                group_ratings
                    .drop(columns=list(used_movie_ids))
                    .apply(self.calculate_movies_satisfaction, args=(user_satisfaction_dict,))
                    .idxmax()
            )

            used_movie_ids.add(current_best_movie_id)
            best_ratings_ids.append(current_best_movie_id)
            
            for user_id in group:
                if ratings.loc[user_id, current_best_movie_id] >= self.threshold:
                    user_satisfaction_dict[user_id] += 1
        #
        
        return movies.loc[best_ratings_ids]
    #
#

## Część 3. - funkcje celu

In [94]:
# dwie funkcje pomocnicze:
#  - znajdujaca ulubione filmy danego uzytkownika
#  - obliczajaca sume ocen wystawionych przez uzytkownika wszystkim filmom w rekomendacji

def top_n_movies_for_user(ratings, movies, user_id, n):
    best_ratings_ids = ratings.loc[user_id].sort_values(ascending = False).head(n).index
    return movies.loc[best_ratings_ids]

def total_score(recommendation, user_id, ratings):
    # this function is also used inside SHA and the recommendation is just an list
    movie_ids = recommendation.index.tolist() if isinstance(recommendation, pd.DataFrame) else recommendation
    # return sum([ratings[movie_id][user_id] for movie_id in movie_ids])
    return ratings.loc[user_id, movie_ids].sum()

In [95]:
# funkcja obliczajaca zadowolenie pojedynczego uzytkownika
#  - iloraz zadowolenia z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(recommendation, user_id, movies, ratings):
    return (
        total_score(recommendation, user_id, ratings) 
        / 
        total_score(top_n_movies_for_user(ratings, movies, user_id, len(recommendation)), user_id, ratings)
    )

# funkcja celu - srednia z zadowolenia wszystkich uzytkownikow w grupie
def overall_group_satisfaction(recommendation, group, movies, ratings):
    return mean([overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group])

# funkcja celu - roznica miedzy maksymalnym i minimalnym zadowolenie w grupie
def group_disagreement(recommendation, group, movies, ratings):
    group_satisfaction = [overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group]
    return max(group_satisfaction) - min(group_satisfaction)

## Część 4. - Sequential Hybrid Aggregation

In [96]:
# algorytm balansujacy pomiedzy wyborem elementow o najwyzszej sredniej ocen
#   i o najwyzszej minimalnej ocenie
#   wyliczajacy w kazdej iteracji parametr alfa - jak na wykladzie
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
    
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        avg_score = group_ratings.mean(axis=0)
        least_score = group_ratings.min(axis=0)

        alpha = 1
        best_rating_ids = []
        used_movie_ids = set()

        for _ in range(size):
            available_movies = [m for m in ratings.columns if m not in used_movie_ids]
            
            scores = (
                (1 - alpha) * avg_score[available_movies] + 
                alpha * least_score[available_movies]
            )
            
            max_score_movie_id = scores.idxmax()
            best_rating_ids.append(max_score_movie_id)
            used_movie_ids.add(max_score_movie_id)

            alpha = group_disagreement(best_rating_ids, group, movies, ratings)
        
        return movies.loc[best_rating_ids]
    #
#

## Część 5. - porównanie algorytmów

In [97]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(),
    ProportionalApprovalVotingRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

In [98]:
# dla kazdego algorytmu:
#  - wygenerujmy jedna rekomendacje dla kazdej grupy
#  - obliczmy wartosci obu funkcji celu dla kazdej rekomendacji
#  - wypiszmy wyniki na konsole

for recommender in recommenders:
    print(recommender.name)
    satisfaction = 0
    dissagrement = 0

    for group in groups:
        recommendation = recommender.recommend(movies_metadata, ratings, group, recommendation_size)

        satisfaction += overall_group_satisfaction(recommendation, group, movies_metadata, ratings)
        dissagrement += group_disagreement(recommendation, group, movies_metadata, ratings)
    #
        
    print(f"Average overall group satisfaction: {satisfaction / len(groups)}")
    print(f"Average group dissagrement: {dissagrement / len(groups)}")
    print("-----------------------------------------------------------")
#

random
Average overall group satisfaction: 0.7428138337280458
Average group dissagrement: 0.21221044204935957
-----------------------------------------------------------
average
Average overall group satisfaction: 0.7753352910487484
Average group dissagrement: 0.25481854628761846
-----------------------------------------------------------
average_without_misery
Average overall group satisfaction: 0.870190664494236
Average group dissagrement: 0.20250899398220823
-----------------------------------------------------------
fairness
Average overall group satisfaction: 0.8028009344314848
Average group dissagrement: 0.1684607428623386
-----------------------------------------------------------
borda
Average overall group satisfaction: 0.7915638337280457
Average group dissagrement: 0.2059604420493596
-----------------------------------------------------------
PAV
Average overall group satisfaction: 0.7569786579910843
Average group dissagrement: 0.21848800505050503
----------------------------