# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas scipy matplotlib`

## Część 1. - przygotowanie danych

In [349]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds


from random import choice, sample
from statistics import mean, stdev
from collections import defaultdict

In [350]:
PATH = 'ml-latest-small'

In [351]:
# wczytujemy oceny uzytkownikow i obliczamy (za pomoc dekompozycji macierzy) wszystkie przewidywane oceny filmow

def read_ratings(path, k=600, scale_factor=2.0, print_stats=True):
    # idea: https://www.kaggle.com/code/indralin/movielens-project-1-2-collaborative-filtering
    reviews = pd.read_csv(f'{path}/ratings.csv', names=['userId', 'movieId', 'rating', 'time'], delimiter=',', engine='python', skiprows=1)
    
    reviews.drop(['time'], axis=1, inplace=True)
    reviews_no, _ = reviews.shape
    reviews_matrix = reviews.pivot(index='userId', columns='movieId', values='rating')
    movies = reviews_matrix.columns
    users = reviews_matrix.index
    users_no, movies_no = reviews_matrix.shape
    print(f'Got {reviews_no} reviews for {movies_no} movies and {users_no} users.')

    user_ratings_mean = np.nanmean(reviews_matrix.values, axis=1)
    normalized_reviews_matrix = np.nan_to_num(reviews_matrix.values - user_ratings_mean.reshape(-1, 1), 0.0)

    U, sigma, Vt = svds(normalized_reviews_matrix, k=k)
    sigma = np.diag(sigma)
    predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1).clip(0.5, 5.0)
    mean_square_error = np.nanmean(np.square(predicted_ratings - reviews_matrix.values))
    std_square_error = np.nanstd(np.square(predicted_ratings - reviews_matrix.values))
    print(f'Reviews prediction mean square error = {mean_square_error}')
    print(f'Reviews prediction standatd deviation of square error = {std_square_error}')

    if print_stats:
        stats = [
            ('metric', 'dataset', 'prediction'),
            ('avg', np.nanmean(reviews_matrix), np.mean(predicted_ratings)),
            ('st_dev', np.nanstd(reviews_matrix), np.std(predicted_ratings)),
            ('median', np.nanmedian(reviews_matrix), np.median(predicted_ratings)),
            ('p25', np.nanquantile(reviews_matrix, 0.25), np.quantile(predicted_ratings, 0.25)),
            ('p75', np.nanquantile(reviews_matrix, 0.75), np.quantile(predicted_ratings, 0.75))
        ]
        print('Stats (for raings in original range [0.5, 5.0]):')
        print('\n'.join([str(s) for s in stats]))

    rounded_predictions = np.rint(scale_factor * predicted_ratings) # cast values to {1, 2, ..., 10}
    return pd.DataFrame(data=rounded_predictions, index=list(users), columns=list(movies))
    
ratings = read_ratings(PATH)
# dostep do danych:
# ratings[movieId][userId] pobiera 1 wartosc
# ratings.loc[:, movieId] pobiera wektor dla danego filmu
# ratings.loc[userId, :] pobiera wektor dla danego uzytkownika
ratings

Got 100836 reviews for 9724 movies and 610 users.
Reviews prediction mean square error = 1.6577787842924655e-05
Reviews prediction standatd deviation of square error = 0.0007928950536518336
Stats (for raings in original range [0.5, 5.0]):
('metric', 'dataset', 'prediction')
('avg', np.float64(3.501556983616962), np.float64(3.657222337747399))
('st_dev', np.float64(1.042524069618056), np.float64(0.49546237560971024))
('median', np.float64(3.5), np.float64(3.705224008811769))
('p25', np.float64(3.0), np.float64(3.357451718316403))
('p75', np.float64(4.0), np.float64(3.9999816268830015))


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,8.0,9.0,8.0,9.0,9.0,8.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
2,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
3,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
4,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
5,8.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,5.0,7.0,7.0,7.0,7.0,7.0,5.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
607,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
608,5.0,4.0,4.0,6.0,6.0,6.0,6.0,6.0,6.0,8.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
609,6.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,8.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


In [352]:
# wczytujemy nazwy filmow i kategorie

movies_metadata = pd.read_csv('ml-latest-small/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [353]:
# wczytujemy przykladowe grupy uzytkownikow
groups = pd.read_csv('groups.csv').values.tolist()
groups

[[111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [354]:
# przygotowujemy funkcje pomocnicza

def describe_group(group, N=10):
    print(f'\n\nUser ids: {group}')
    group_size = len(group)
    
    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')
    
    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[:N].index)]
    
    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')

describe_group(groups[5])



User ids: [269, 360, 469, 287, 308]

Mean ratings deviation: 1.1259149574579788
Median ratings deviation: 1.0954451150103321
Standard deviation of ratings deviation: 0.17836724055768716

Best movies:
Forrest Gump (1994), 8.2*
Toy Story (1995), 8.2*
Braveheart (1995), 8.0*
Willy Wonka & the Chocolate Factory (1971), 8.0*
Terminator 2: Judgment Day (1991), 7.8*
Schindler's List (1993), 7.8*
Shawshank Redemption, The (1994), 7.6*
Twelve Monkeys (a.k.a. 12 Monkeys) (1995), 7.6*
Nixon (1995), 7.6*
Dances with Wolves (1990), 7.6*

Worst movies:
Broken Arrow (1996), 5.2*
Cable Guy, The (1996), 5.4*
Sleepy Hollow (1999), 5.4*
The Devil's Advocate (1997), 5.4*
Nutty Professor, The (1996), 5.6*
Mission: Impossible (1996), 5.6*
Matrix Revolutions, The (2003), 5.8*
Cheech & Chong's The Corsican Brothers (1984), 5.8*
Hellbound: Hellraiser II (1988), 5.8*
Masquerade (1988), 5.8*


## Część 2. - algorytmy proste

In [355]:
# zdefiniujmy interfejs dla wszystkich algorytmow rekomendacyjnych

class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass


# jako pierwszy zaimplementujemy algorytm losowy - dla porownania
    
class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'
        
    def recommend(self, movies, ratings, group, size):
        return movies.sample(n = size)

In [356]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen

class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'
    
    def recommend(self, movies, ratings, group, size):
        best_ratings_ids = (
            ratings.loc[group]
                .mean()
                .sort_values(ascending = False)
                .head(size)
                .index
        )
        return movies.loc[best_ratings_ids]
    #
#

In [357]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen,
# ale rownoczesnie wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej thresholdu

class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold
        
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        best_ratings_ids = (
            group_ratings
                .loc[:, (group_ratings >= self.score_threshold).all()]
                .mean()
                .sort_values(ascending = False)
                .head(size)
                .index
        )
        return movies.loc[best_ratings_ids]
    #
#

In [358]:
# algorytm uwzgledniajacy preferencje tylko jednego uzytkownika w kazdej iteracji

class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'
        
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        
        best_ratings_ids = []
        used_columns = set()

        for _, row in group_ratings.iterrows():
            sorted_columns = row.sort_values(ascending = False).index
            for column in sorted_columns:
                if column not in used_columns:
                    best_ratings_ids.append(column)
                    used_columns.add(column)
                    break
            #
        #

        return movies.loc[best_ratings_ids]
    #
#

In [359]:
# wybrany algorytm wyborczy (dyktatura, Borda, Copeland)

class VotingRecommender(Recommender):
    def __init__(self):
        self.name = 'bordy'
    
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        points_per_movie_id_dict = defaultdict(lambda: 0)

        for _, row in group_ratings.iterrows():
            sorted_columns = row.sort_values(ascending = False).index
            point = 0

            for column in sorted_columns:
                points_per_movie_id_dict[column] += point
                point += 1
            #
        #

        movie_ids_with_points = list(points_per_movie_id_dict.items())
        movie_ids_with_points = sorted(movie_ids_with_points, key = lambda movie: movie[1], reverse = True)[:size]
        
        best_ratings_ids = [movie[0] for movie in movie_ids_with_points]
        return movies.loc[best_ratings_ids]
    #           
#

In [366]:
# algorytm zachlanny, aproksymujacy metode Proportional Approval Voting
# w kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV

class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'
    
    def calculate_movies_satisfaction(self, movie, user_satisfaction_dict):
        result = 0.0

        for idx, rating in movie.items():
            result += ( 1 / user_satisfaction_dict[idx] ) if rating >= self.threshold else 0
            user_satisfaction_dict[idx] += 1
        #
        return result
    #

    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        
        best_ratings_ids = []
        used_movie_ids = set()
        user_satisfaction_dict = defaultdict(lambda: 1)

        while len(used_movie_ids) < size:
            current_best_movies = group_ratings.apply(self.calculate_movies_satisfaction, args = (user_satisfaction_dict,)).sort_values(ascending = False).index.tolist()
            for movie_id in current_best_movies:
                if not movie_id in used_movie_ids:
                    used_movie_ids.add(movie_id)
                    best_ratings_ids.append(movie_id)
        #

        return movies.loc[best_ratings_ids]
    #
#

## Część 3. - funkcje celu

In [361]:
# dwie funkcje pomocnicze:
#  - znajdujaca ulubione filmy danego uzytkownika
#  - obliczajaca sume ocen wystawionych przez uzytkownika wszystkim filmom w rekomendacji

def top_n_movies_for_user(ratings, movies, user_id, n):
    best_ratings_ids = ratings.loc[user_id].sort_values(ascending = False).head(n).index
    return movies.loc[best_ratings_ids]

def total_score(recommendation, user_id, ratings):
    movie_ids = recommendation.index.tolist() if isinstance(recommendation, pd.DataFrame) else recommendation
    return sum([ratings[movie_id][user_id] for movie_id in movie_ids])

In [362]:
# funkcja obliczajaca zadowolenie pojedynczego uzytkownika
#  - iloraz zadowolenia z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(recommendation, user_id, movies, ratings):
    return (
        total_score(recommendation, user_id, ratings) 
        / 
        total_score(top_n_movies_for_user(ratings, movies, user_id, len(recommendation)), user_id, ratings)
    )

# funkcja celu - srednia z zadowolenia wszystkich uzytkownikow w grupie
def overall_group_satisfaction(recommendation, group, movies, ratings):
    return mean([overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group])

# funkcja celu - roznica miedzy maksymalnym i minimalnym zadowolenie w grupie
def group_disagreement(recommendation, group, movies, ratings):
    group_satisfaction = [overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group]
    return max(group_satisfaction) - min(group_satisfaction)

## Część 4. - Sequential Hybrid Aggregation

In [369]:
# algorytm balansujacy pomiedzy wyborem elementow o najwyzszej sredniej ocen
#   i o najwyzszej minimalnej ocenie
#   wyliczajacy w kazdej iteracji parametr alfa - jak na wykladzie
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
    
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group]
        avg_score = group_ratings.mean(axis=0)
        least_score = group_ratings.min()

        alpha = 1

        score = {movie: 0 for movie in ratings.columns}

        recommendation = []

        for _ in range(size):
            score.update(
                {movie: (1 - alpha) * avg_score.loc[movie] + alpha * least_score.loc[movie] for movie in movies})

            score.update({movie: -1 for movie in recommendation})

            max_score_movie = max(score.items(), key=lambda k: k[1])[0]
            recommendation.append(max_score_movie)

            alpha = group_disagreement(recommendation, group, movies, ratings)
        #
        return movies.loc[recommendation]
    #
#

## Część 5. - porównanie algorytmów

In [370]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(),
    ProportionalApprovalVotingRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

In [371]:
# dla kazdego algorytmu:
#  - wygenerujmy jedna rekomendacje dla kazdej grupy
#  - obliczmy wartosci obu funkcji celu dla kazdej rekomendacji
#  - wypiszmy wyniki na konsole

for recommender in recommenders:
    print(f'\n\n{recommender.name}')
    overall_group_satisfactions = []
    group_disagreements = []
    for group in groups:
        recommendation = recommender.recommend(movies_metadata, ratings, group, recommendation_size)
        overall_group_satisfaction_value = overall_group_satisfaction(recommendation, group, movies_metadata, ratings)
        group_disagreement_value = group_disagreement(recommendation, group, movies_metadata, ratings)
        overall_group_satisfactions.append(overall_group_satisfaction_value)
        group_disagreements.append(group_disagreement_value)
        # print(f'\nRecommendation: {recommendation}')
        print(f'Overall group satisfaction: {overall_group_satisfaction_value}')
        print(f'Group disagreement: {group_disagreement_value}')
    print(f'\nMean overall group satisfaction: {mean(overall_group_satisfactions)}')
    print(f'Standard deviation of overall group satisfaction: {stdev(overall_group_satisfactions)}')
    print(f'Mean group disagreement: {mean(group_disagreements)}')
    print(f'Standard deviation of group disagreement: {stdev(group_disagreements)}')



random
Overall group satisfaction: 0.632
Group disagreement: 0.21999999999999997
Overall group satisfaction: 0.6639999999999999
Group disagreement: 0.09999999999999998
Overall group satisfaction: 0.7589003436426117
Group disagreement: 0.26116838487972516
Overall group satisfaction: 0.8697804576376005
Group disagreement: 0.24242424242424243
Overall group satisfaction: 0.8287528344671202
Group disagreement: 0.12222222222222223
Overall group satisfaction: 0.6384126984126984
Group disagreement: 0.2777777777777778
Overall group satisfaction: 0.7956643356643357
Group disagreement: 0.2090909090909091
Overall group satisfaction: 0.765
Group disagreement: 0.275

Mean overall group satisfaction: 0.7440638337280459
Standard deviation of overall group satisfaction: 0.08978437340378292
Mean group disagreement: 0.2134604420493596
Standard deviation of group disagreement: 0.06792677750232198


average
Overall group satisfaction: 0.886
Group disagreement: 0.22999999999999998
Overall group satisfacti

KeyError: 'title'