# Group Recommender System

First import librabries that are needed and read dataset.

In [130]:
import pandas as pd
import numpy as np

In [131]:
books_df = pd.read_csv('datasets/valid_only/1/2/books.csv', sep=',', error_bad_lines=False, encoding="cp1252")
users = pd.read_csv('datasets/valid_only/1/2/users.csv', sep=',', error_bad_lines=False, encoding="cp1252")
ratings = pd.read_csv('datasets/valid_only/1/2/ratings.csv', sep=',', error_bad_lines=False, encoding="cp1252")
books_df = books_df.astype({'ISBN': 'string', 'Book-Title': 'string', 'Book-Author': 'string', 'Publisher': 'string', 'Year-Of-Publication': np.uint32})
ratings.columns = ['index', 'ISBN', 'userID', 'bookRating']
books_df.columns = ['index', 'ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher']

In [132]:
merged_rows = pd.merge(books_df, ratings, on='ISBN', how='inner')
merged_rows

Unnamed: 0,index_x,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,index_y,userID,bookRating
0,0,034545104X,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,0,2313,5
1,0,034545104X,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,1,77480,8
2,0,034545104X,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,2,94362,5
3,0,034545104X,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,3,98391,9
4,0,034545104X,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,4,112199,6
...,...,...,...,...,...,...,...,...,...
220813,220813,0312273150,All About All About Eve: The Complete Behind-T...,Sam Staggs,2001,St. Martin's Press,220813,272715,10
220814,220813,0312273150,All About All About Eve: The Complete Behind-T...,Sam Staggs,2001,St. Martin's Press,220814,276263,5
220815,220815,0679449132,Virtuous Reality: How America Surrendered Disc...,Jon Katz,1997,Random House,220815,275383,8
220816,220815,0679449132,Virtuous Reality: How America Surrendered Disc...,Jon Katz,1997,Random House,220816,275737,5


Sort the books based on the number of ratings they received

In [133]:
ratings_most_to_least = ratings.groupby(['ISBN'])['bookRating'].count().reset_index(
  name='Count').sort_values(['Count'], ascending=False)

In [134]:
ratings_most_to_least

Unnamed: 0,ISBN,Count
61792,0971880107,471
11205,0316666343,468
22027,0385504209,372
8939,0312195516,227
39895,059035342X,220
...,...,...
33167,0451524195,1
33169,0451524217,1
33174,0451524330,1
33176,0451524365,1


Take the top N books and remove the rest.

In [135]:
number_of_books = 15
ratings_most_to_least = ratings_most_to_least.iloc[0:number_of_books , :]

In [136]:
ratings_most_to_least

Unnamed: 0,ISBN,Count
61792,0971880107,471
11205,0316666343,468
22027,0385504209,372
8939,0312195516,227
39895,059035342X,220
42383,0671027360,219
6652,0142001740,217
52814,0786868716,202
2124,0060928336,201
29913,0446672211,194


Now remove the ratings that are not needed anymore.

In [137]:
ratings_only_top = ratings[ratings['ISBN'].isin(ratings_most_to_least['ISBN'])]

In [138]:
ratings_only_top

Unnamed: 0,index,ISBN,userID,bookRating
507,507,043935806X,278356,10
508,508,043935806X,2033,9
509,509,043935806X,6251,9
510,510,043935806X,6431,10
511,511,043935806X,6563,10
...,...,...,...,...
30905,30905,0345337662,271367,9
30906,30906,0345337662,271538,7
30907,30907,0345337662,271558,9
30908,30908,0345337662,272573,7


Select users that have rated at least 5 of these selected books. This needs to be done to be able to fill in the missing values, because the table is very sparse.

In [139]:
count = ratings_only_top['userID'].value_counts()
users_rated_all = ratings_only_top[ratings_only_top['userID'].isin(count[count >= 5].index)]

In [140]:
users_rated_all

Unnamed: 0,index,ISBN,userID,bookRating
513,513,043935806X,10560,10
518,518,043935806X,22625,10
566,566,043935806X,95359,10
594,594,043935806X,136491,9
603,603,043935806X,147141,10
...,...,...,...,...
30822,30822,0345337662,136491,9
30828,30828,0345337662,147141,10
30875,30875,0345337662,229313,10
30877,30877,0345337662,229741,7


Create dataframe including all the books and users, filling it with all the known and unknown ratings.

In [141]:
ratings_pivot = users_rated_all.pivot(index='ISBN', columns='userID').bookRating
userID = ratings_pivot.columns
ISBN = ratings_pivot.index
print(ratings_pivot.shape)
ratings_pivot = ratings_pivot.iloc[: , 0:10]
ratings_pivot

(15, 15)


userID,10560,16795,22625,62862,88693,95359,136491,147141,204864,220688
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0060928336,,,,6.0,,,,7.0,9.0,
0142001740,,10.0,9.0,,,,10.0,,,
0156027321,,,,,,,,,10.0,
0312195516,9.0,9.0,,10.0,7.0,7.0,,,10.0,
0316666343,8.0,10.0,10.0,6.0,,9.0,,6.0,9.0,
0316769487,,8.0,10.0,,,8.0,10.0,,10.0,
0345337662,,,,,,,9.0,10.0,,
0385504209,10.0,10.0,,,8.0,,10.0,,,10.0
043935806X,10.0,,10.0,,,10.0,9.0,10.0,,10.0
0446672211,,8.0,,10.0,,,,,,


Next find the correleation between two users based on the books they have rated using Pearson correlation. If two users give the exact same ratings, the correlation is 1. If the users give opposite ratings, the correlation is -1.

In [142]:
def find_correlation_between_two_users(ratings_df: pd.DataFrame, user1: str, user2: str):
    rated_books_by_both_users = ratings_df[[user1, user2]].dropna(axis=0).values
    user_1_ratings = rated_books_by_both_users[:, 0]
    user_2_ratings = rated_books_by_both_users[:, 1]
    return np.corrcoef(user_1_ratings, user_2_ratings)[0, 1]

Using these correlations, build the correlations matrix.

In [143]:
users = list(ratings_pivot.columns)
books = list(ratings_pivot.index)
similarity_matrix = np.array([[find_correlation_between_two_users(ratings_pivot, user1, user2) for user1 in users] for user2 in users])
similarity_df = pd.DataFrame(similarity_matrix, columns=users, index=users)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  avg = a.mean(axis)
  ret = um.true_divide(
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  avg = a.mean(axis)
  ret = um.true_divide(
 

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  avg = a.mean(axis)
  ret = um.true_divide(
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[:, None]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  avg = a.mean(axis)
  ret = um.true_divide(
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
 

In [144]:
similarity_df

Unnamed: 0,10560,16795,22625,62862,88693,95359,136491,147141,204864,220688
10560,1.0,0.0,,0.866025,0.866025,0.207514,,0.5,1.0,
16795,0.0,1.0,0.0,-0.866025,1.0,0.5,,,-0.866025,
22625,,0.0,1.0,,,,-0.5,,,
62862,0.866025,-0.866025,,1.0,,-0.866025,,,1.0,
88693,0.866025,1.0,,,1.0,,,,,1.0
95359,0.207514,0.5,,-0.866025,,1.0,-1.0,1.0,-0.866025,
136491,,,-0.5,,,-1.0,1.0,,,
147141,0.5,,,,,1.0,,1.0,,
204864,1.0,-0.866025,,1.0,,-0.866025,,,1.0,
220688,,,,,1.0,,,,,1.0


Next, define some methods that help with predicting the rating a user gives a certain book.

In [145]:
def get_rated_user_for_a_book(ratings_df: pd.DataFrame, book: str):
    return ratings_df.loc[book, :].dropna().index.values


def get_top_neighbors(
    similarity_df: pd.DataFrame, user: str, rated_users: str, n_neighbors: int
):
    return similarity_df[user][rated_users].nlargest(n_neighbors).to_dict()


def subtract_bias(rating: float, mean_rating: float):
    return rating - mean_rating

# To eleminate bias, subtract rating of the user from the mean rating of the user
def get_neighbor_rating_without_bias_per_book(
    ratings_df: pd.DataFrame, user: str, book: str
):
    mean_rating = ratings_df[user].mean()
    rating = ratings_df.loc[book, user]
    return subtract_bias(rating, mean_rating)


def get_ratings_of_neighbors(ratings_df: pd.DataFrame, neighbors: list, book: str):
    return [
        get_neighbor_rating_without_bias_per_book(ratings_df, neighbor, book)
        for neighbor in neighbors
    ]

def get_weighted_average_rating_of_neighbors(ratings: list, neighbor_distance: list):
    weighted_sum = np.array(ratings).dot(np.array(neighbor_distance))
    abs_neigbor_distance = np.abs(neighbor_distance)
    return weighted_sum / np.sum(abs_neigbor_distance)


def ger_user_rating(ratings_df: pd.DataFrame, user: str, avg_neighbor_rating: float):
    user_avg_rating = ratings_df[user].mean()
    return round(user_avg_rating + avg_neighbor_rating, 2)

Now, predict the rating of the user for a certain book based on the rating of the neighbors, using the methods defined above.

In [146]:
def predict_rating(
    df: pd.DataFrame,
    similarity_df: pd.DataFrame,
    user: str,
    book: str,
    n_neighbors: int = 2,
):

    ratings_df = df.copy()

    rated_users = get_rated_user_for_a_book(ratings_df, book)

    top_neighbors_distance = get_top_neighbors(
        similarity_df, user, rated_users, n_neighbors
    )
    neighbors, distance = top_neighbors_distance.keys(), top_neighbors_distance.values()

    ratings = get_ratings_of_neighbors(ratings_df, neighbors, book)
    avg_neighbor_rating = get_weighted_average_rating_of_neighbors(
        ratings, list(distance)
        
    )

    return ger_user_rating(ratings_df, user, avg_neighbor_rating)

In [147]:
full_ratings = ratings_pivot.copy()

for user, books in full_ratings.iteritems():
    for book in books.keys():
        if np.isnan(full_ratings.loc[book, user]):
            final_rating = predict_rating(
                ratings_pivot, similarity_df, user, book
            )
            if final_rating > 10:
                final_rating = 10
            full_ratings.loc[book, user] = final_rating

  return weighted_sum / np.sum(abs_neigbor_distance)


In [148]:
full_ratings

userID,10560,16795,22625,62862,88693,95359,136491,147141,204864,220688
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0060928336,7.92,10.0,,6.0,,9.1,,7.0,9.0,
0142001740,,10.0,9.0,7.0,7.8,8.71,10.0,,8.6,
0156027321,9.57,8.6,,8.4,,8.2,,,10.0,
0312195516,9.0,9.0,,10.0,7.0,7.0,10.0,6.68,10.0,8.8
0316666343,8.0,10.0,10.0,6.0,6.79,9.0,9.29,6.0,9.0,
0316769487,9.39,8.0,10.0,8.49,5.8,8.0,10.0,7.2,10.0,
0345337662,10.0,,10.0,,,10.0,9.0,10.0,,
0385504209,10.0,10.0,9.33,7.92,8.0,9.55,10.0,8.63,9.58,10.0
043935806X,10.0,10.0,10.0,7.72,7.94,10.0,9.0,10.0,9.4,10.0
0446672211,10.0,8.0,,10.0,5.8,6.97,,,10.0,


Almost the complete table is filled out. If there are missing values, this is because the data is too sparse. Now we can calculate the mean, min and max. To give a recommendation we can order them based on the average rating.

In [149]:
full_ratings['mean'] = full_ratings.mean(axis=1)
full_ratings['max'] = full_ratings.max(axis=1)
full_ratings['min'] = full_ratings.min(axis=1)
full_ratings['ISBN'] = full_ratings.index
full_ratings['bookTitle'] = full_ratings['ISBN'].map(books_df.set_index('ISBN')['bookTitle'])
full_ratings.sort_values('mean', axis=0, ascending=False)

userID,10560,16795,22625,62862,88693,95359,136491,147141,204864,220688,mean,max,min,ISBN,bookTitle
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0345337662,10.0,,10.0,,,10.0,9.0,10.0,,,9.8,10.0,9.0,0345337662,Interview with the Vampire
043935806X,10.0,10.0,10.0,7.72,7.94,10.0,9.0,10.0,9.4,10.0,9.406,10.0,7.72,043935806X,Harry Potter and the Order of the Phoenix (Boo...
0452282152,10.0,10.0,9.33,,8.0,8.27,10.0,,,10.0,9.371429,10.0,8.0,0452282152,Girl with a Pearl Earring
0385504209,10.0,10.0,9.33,7.92,8.0,9.55,10.0,8.63,9.58,10.0,9.301,10.0,7.92,0385504209,The Da Vinci Code
0156027321,9.57,8.6,,8.4,,8.2,,,10.0,,8.954,10.0,8.2,0156027321,Life of Pi
059035342X,10.0,,10.0,8.83,7.94,7.25,9.33,6.0,10.0,10.0,8.816667,10.0,6.0,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...
0142001740,,10.0,9.0,7.0,7.8,8.71,10.0,,8.6,,8.73,10.0,7.0,0142001740,The Secret Life of Bees
0312195516,9.0,9.0,,10.0,7.0,7.0,10.0,6.68,10.0,8.8,8.608889,10.0,6.68,0312195516,The Red Tent (Bestselling Backlist)
0316769487,9.39,8.0,10.0,8.49,5.8,8.0,10.0,7.2,10.0,,8.542222,10.0,5.8,0316769487,The Catcher in the Rye
0786868716,,8.0,9.0,9.0,5.8,7.6,10.0,,10.0,,8.485714,10.0,5.8,0786868716,The Five People You Meet in Heaven


For the least misery method, the books are ordered based on the minimum rating.

In [150]:
full_ratings.sort_values('min', axis=0, ascending=False)

userID,10560,16795,22625,62862,88693,95359,136491,147141,204864,220688,mean,max,min,ISBN,bookTitle
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0345337662,10.0,,10.0,,,10.0,9.0,10.0,,,9.8,10.0,9.0,0345337662,Interview with the Vampire
0156027321,9.57,8.6,,8.4,,8.2,,,10.0,,8.954,10.0,8.2,0156027321,Life of Pi
0452282152,10.0,10.0,9.33,,8.0,8.27,10.0,,,10.0,9.371429,10.0,8.0,0452282152,Girl with a Pearl Earring
0385504209,10.0,10.0,9.33,7.92,8.0,9.55,10.0,8.63,9.58,10.0,9.301,10.0,7.92,0385504209,The Da Vinci Code
043935806X,10.0,10.0,10.0,7.72,7.94,10.0,9.0,10.0,9.4,10.0,9.406,10.0,7.72,043935806X,Harry Potter and the Order of the Phoenix (Boo...
0142001740,,10.0,9.0,7.0,7.8,8.71,10.0,,8.6,,8.73,10.0,7.0,0142001740,The Secret Life of Bees
0671027360,8.0,9.27,,8.0,7.0,9.0,9.27,7.68,9.02,8.8,8.448889,9.27,7.0,0671027360,Angels &amp; Demons
0312195516,9.0,9.0,,10.0,7.0,7.0,10.0,6.68,10.0,8.8,8.608889,10.0,6.68,0312195516,The Red Tent (Bestselling Backlist)
0060928336,7.92,10.0,,6.0,,9.1,,7.0,9.0,,8.17,10.0,6.0,0060928336,Divine Secrets of the Ya-Ya Sisterhood: A Novel
0316666343,8.0,10.0,10.0,6.0,6.79,9.0,9.29,6.0,9.0,,8.231111,10.0,6.0,0316666343,The Lovely Bones: A Novel


The explanations can be derived from the complete table. The users are anonymous so everyone can see how a certain user rated all the books. By being able to see the mean, maximum and minimum rating of all the books from all users it is clear why a certain recommendation was given.