# Group Recommender System

First import librabries that are needed and read dataset.

In [427]:
import pandas as pd
import numpy as np

In [428]:
books_df = pd.read_csv('datasets/valid_only/75/100/books.csv', sep=',', error_bad_lines=False, encoding="latin-1")
users = pd.read_csv('datasets/valid_only/75/100/users.csv', sep=',', error_bad_lines=False, encoding="latin-1")
ratings = pd.read_csv('datasets/valid_only/75/100/ratings.csv', sep=',', error_bad_lines=False, encoding="latin-1")
ratings.columns = ['index', 'userID', 'ISBN', 'bookRating']
books_df.columns = ['index', 'ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher']

Sort the books based on the number of ratings they received

In [429]:
ratings_most_to_least = ratings.groupby(['ISBN'])['bookRating'].count().reset_index(
  name='Count').sort_values(['Count'], ascending=False)

In [430]:
ratings_most_to_least

Unnamed: 0,ISBN,Count
82,16795,127
514,95359,117
319,60244,81
1480,258534,69
878,153662,65
...,...,...
166,31987,1
543,99312,1
1134,198711,1
545,99441,1


Take the top N books and remove the rest.

In [431]:
number_of_books = 15
ratings_most_to_least = ratings_most_to_least.iloc[0:number_of_books , :]

In [432]:
ratings_most_to_least

Unnamed: 0,ISBN,Count
82,16795,127
514,95359,117
319,60244,81
1480,258534,69
878,153662,65
1174,204864,59
31,7346,54
1371,240567,53
422,78973,53
164,31826,51


Now remove the ratings that are not needed anymore.

In [433]:
ratings_only_top = ratings[ratings['ISBN'].isin(ratings_most_to_least['ISBN'])]

In [434]:
ratings_only_top

Unnamed: 0,index,userID,ISBN,bookRating
12,12,0446520802,200226,9
39,39,0425115801,16795,8
40,40,0425115801,31826,10
50,50,0449006522,16795,7
67,67,0553561618,78973,6
...,...,...,...,...
15675,15675,0345444884,126492,10
15703,15703,0451197747,16795,9
15722,15722,051511992X,16795,10
15727,15727,051511992X,153662,9


Select users that have rated at least 5 of these selected books. This needs to be done to be able to fill in the missing values, because the table is very sparse.

In [435]:
count = ratings_only_top['userID'].value_counts()
users_rated_all = ratings_only_top[ratings_only_top['userID'].isin(count[count >= 5].index)]

In [436]:
users_rated_all

Unnamed: 0,index,userID,ISBN,bookRating
727,727,0804106304,7346,9
737,737,0804106304,60244,8
758,758,0804106304,153662,9
766,766,0804106304,200226,5
774,774,0804106304,240567,8
...,...,...,...,...
10218,10218,0312990456,60244,9
10223,10223,0312990456,110934,9
10235,10235,0312990456,240567,9
10239,10239,0312990456,258534,10


Create dataframe including all the books and users, filling it with all the known and unknown ratings.

In [437]:
ratings_pivot = users_rated_all.pivot(index='ISBN', columns='userID').bookRating
userID = ratings_pivot.columns
ISBN = ratings_pivot.index
print(ratings_pivot.shape)
ratings_pivot = ratings_pivot.iloc[: , 0:10]
ratings_pivot

(15, 16)


userID,0312195516,0312966970,0312976275,0312980140,0312990456,0316769487,0375727345,0440213525,0440214041,0446605484
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7346,,8.0,7.0,8.0,,,,,,
16795,9.0,8.0,,6.0,,8.0,9.0,5.0,6.0,
31826,,,,,,,,,,
43246,,10.0,,5.0,,,8.0,,,
60244,7.0,9.0,,,9.0,7.0,7.0,,,
78973,,,,,,,,,,8.0
95359,7.0,,,,,8.0,7.0,,,7.0
110934,,9.0,9.0,9.0,9.0,,,,,
126492,,,,,,,,6.0,8.0,
153662,,,,,,,5.0,10.0,9.0,


Next find the correleation between two users based on the books they have rated using Pearson correlation. If two users give the exact same ratings, the correlation is 1. If the users give opposite ratings, the correlation is -1.

In [438]:
def find_correlation_between_two_users(ratings_df: pd.DataFrame, user1: str, user2: str):
    rated_books_by_both_users = ratings_df[[user1, user2]].dropna(axis=0).values
    user_1_ratings = rated_books_by_both_users[:, 0]
    user_2_ratings = rated_books_by_both_users[:, 1]
    return np.corrcoef(user_1_ratings, user_2_ratings)[0, 1]

Using these correlations, build the correlations matrix.

In [439]:
users = list(ratings_pivot.columns)
books = list(ratings_pivot.index)
similarity_matrix = np.array([[find_correlation_between_two_users(ratings_pivot, user1, user2) for user1 in users] for user2 in users])
similarity_df = pd.DataFrame(similarity_matrix, columns=users, index=users)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= s

In [440]:
similarity_df

Unnamed: 0,0312195516,0312966970,0312976275,0312980140,0312990456,0316769487,0375727345,0440213525,0440214041,0446605484
312195516,1.0,-0.866025,,,-0.866025,0.773021,0.300123,-1.0,1.0,0.316228
312966970,-0.866025,1.0,0.894427,-0.201737,,-1.0,-0.5,1.0,1.0,
312976275,,0.894427,1.0,0.981981,1.0,1.0,,1.0,1.0,-1.0
312980140,,-0.201737,0.981981,1.0,1.0,,1.0,1.0,1.0,
312990456,-0.866025,,1.0,1.0,1.0,,,1.0,1.0,0.188982
316769487,0.773021,-1.0,1.0,,,1.0,-0.049029,1.0,-1.0,0.5
375727345,0.300123,-0.5,,1.0,,-0.049029,1.0,-0.922613,-0.884615,0.866025
440213525,-1.0,1.0,1.0,1.0,1.0,1.0,-0.922613,1.0,0.469574,-1.0
440214041,1.0,1.0,1.0,1.0,1.0,-1.0,-0.884615,0.469574,1.0,-1.0
446605484,0.316228,,-1.0,,0.188982,0.5,0.866025,-1.0,-1.0,1.0


Next, define some methods that help with predicting the rating a user gives a certain book.

In [441]:
def get_rated_user_for_a_book(ratings_df: pd.DataFrame, book: str):
    return ratings_df.loc[book, :].dropna().index.values


def get_top_neighbors(
    similarity_df: pd.DataFrame, user: str, rated_users: str, n_neighbors: int
):
    return similarity_df[user][rated_users].nlargest(n_neighbors).to_dict()


def subtract_bias(rating: float, mean_rating: float):
    return rating - mean_rating

# To eleminate bias, subtract rating of the user from the mean rating of the user
def get_neighbor_rating_without_bias_per_book(
    ratings_df: pd.DataFrame, user: str, book: str
):
    mean_rating = ratings_df[user].mean()
    rating = ratings_df.loc[book, user]
    return subtract_bias(rating, mean_rating)


def get_ratings_of_neighbors(ratings_df: pd.DataFrame, neighbors: list, book: str):
    return [
        get_neighbor_rating_without_bias_per_book(ratings_df, neighbor, book)
        for neighbor in neighbors
    ]

def get_weighted_average_rating_of_neighbors(ratings: list, neighbor_distance: list):
    weighted_sum = np.array(ratings).dot(np.array(neighbor_distance))
    abs_neigbor_distance = np.abs(neighbor_distance)
    return weighted_sum / np.sum(abs_neigbor_distance)


def ger_user_rating(ratings_df: pd.DataFrame, user: str, avg_neighbor_rating: float):
    user_avg_rating = ratings_df[user].mean()
    return round(user_avg_rating + avg_neighbor_rating, 2)

Now, predict the rating of the user for a certain book based on the rating of the neighbors, using the methods defined above.

In [442]:
def predict_rating(
    df: pd.DataFrame,
    similarity_df: pd.DataFrame,
    user: str,
    book: str,
    n_neighbors: int = 2,
):

    ratings_df = df.copy()

    rated_users = get_rated_user_for_a_book(ratings_df, book)

    top_neighbors_distance = get_top_neighbors(
        similarity_df, user, rated_users, n_neighbors
    )
    neighbors, distance = top_neighbors_distance.keys(), top_neighbors_distance.values()

    ratings = get_ratings_of_neighbors(ratings_df, neighbors, book)
    avg_neighbor_rating = get_weighted_average_rating_of_neighbors(
        ratings, list(distance)
        
    )

    return ger_user_rating(ratings_df, user, avg_neighbor_rating)

In [443]:
full_ratings = ratings_pivot.copy()

for user, books in full_ratings.iteritems():
    for book in books.keys():
        if np.isnan(full_ratings.loc[book, user]):
            final_rating = predict_rating(
                ratings_pivot, similarity_df, user, book
            )
            if final_rating > 10:
                final_rating = 10
            full_ratings.loc[book, user] = final_rating

  return weighted_sum / np.sum(abs_neigbor_distance)


In [444]:
full_ratings

userID,0312195516,0312966970,0312976275,0312980140,0312990456,0316769487,0375727345,0440213525,0440214041,0446605484
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7346,9.0,8.0,7.0,8.0,8.2,7.89,7.79,6.44,6.04,9.6
16795,9.0,8.0,7.83,6.0,7.4,8.0,9.0,5.0,6.0,8.96
31826,,,,,,,,,,
43246,7.51,10.0,7.85,5.0,6.2,7.07,8.0,6.94,6.54,8.71
60244,7.0,9.0,8.03,7.56,9.0,7.0,7.0,7.08,6.66,7.33
78973,8.29,,8.6,,8.8,8.33,7.29,7.6,7.2,8.0
95359,7.0,9.63,8.93,7.31,9.68,8.0,7.0,7.56,6.65,7.0
110934,8.04,9.0,9.0,9.0,9.0,8.39,8.12,7.94,7.5,7.7
126492,9.49,8.31,8.2,7.2,8.4,7.13,7.71,6.0,8.0,8.4
153662,9.14,10.0,10.0,7.66,10.0,10.0,5.0,10.0,9.0,5.65


Almost the complete table is filled out. If there are missing values, this is because the data is too sparse. Now we can calculate the mean, min and max. To give a recommendation we can order them based on the average rating.

In [445]:
full_ratings['mean'] = full_ratings.mean(axis=1)
full_ratings['max'] = full_ratings.max(axis=1)
full_ratings['min'] = full_ratings.min(axis=1)
full_ratings.sort_values('mean', axis=0, ascending=False)

userID,0312195516,0312966970,0312976275,0312980140,0312990456,0316769487,0375727345,0440213525,0440214041,0446605484,mean,max,min
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
258534,8.89,9.0,10.0,10.0,10.0,9.73,8.57,9.0,8.0,8.0,9.119,10.0,8.0
153662,9.14,10.0,10.0,7.66,10.0,10.0,5.0,10.0,9.0,5.65,8.645,10.0,5.0
204864,10.0,7.73,9.43,7.31,7.39,10.0,7.0,8.6,8.24,8.0,8.37,10.0,7.0
110934,8.04,9.0,9.0,9.0,9.0,8.39,8.12,7.94,7.5,7.7,8.369,9.0,7.5
240567,8.0,7.81,9.0,8.06,9.0,9.0,8.0,8.0,5.0,9.0,8.087,9.0,5.0
78973,8.29,,8.6,,8.8,8.33,7.29,7.6,7.2,8.0,8.01375,8.8,7.2
126492,9.49,8.31,8.2,7.2,8.4,7.13,7.71,6.0,8.0,8.4,7.884,9.49,6.0
95359,7.0,9.63,8.93,7.31,9.68,8.0,7.0,7.56,6.65,7.0,7.876,9.68,6.65
7346,9.0,8.0,7.0,8.0,8.2,7.89,7.79,6.44,6.04,9.6,7.796,9.6,6.04
200226,8.0,8.0,8.0,7.22,8.61,8.0,7.09,7.08,6.76,7.69,7.645,8.61,6.76


For the least misery method, the books are ordered based on the minimum rating.

In [446]:
full_ratings.sort_values('min', axis=0, ascending=False)

userID,0312195516,0312966970,0312976275,0312980140,0312990456,0316769487,0375727345,0440213525,0440214041,0446605484,mean,max,min
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
258534,8.89,9.0,10.0,10.0,10.0,9.73,8.57,9.0,8.0,8.0,9.119,10.0,8.0
110934,8.04,9.0,9.0,9.0,9.0,8.39,8.12,7.94,7.5,7.7,8.369,9.0,7.5
78973,8.29,,8.6,,8.8,8.33,7.29,7.6,7.2,8.0,8.01375,8.8,7.2
204864,10.0,7.73,9.43,7.31,7.39,10.0,7.0,8.6,8.24,8.0,8.37,10.0,7.0
200226,8.0,8.0,8.0,7.22,8.61,8.0,7.09,7.08,6.76,7.69,7.645,8.61,6.76
60244,7.0,9.0,8.03,7.56,9.0,7.0,7.0,7.08,6.66,7.33,7.566,9.0,6.66
95359,7.0,9.63,8.93,7.31,9.68,8.0,7.0,7.56,6.65,7.0,7.876,9.68,6.65
7346,9.0,8.0,7.0,8.0,8.2,7.89,7.79,6.44,6.04,9.6,7.796,9.6,6.04
126492,9.49,8.31,8.2,7.2,8.4,7.13,7.71,6.0,8.0,8.4,7.884,9.49,6.0
261829,9.0,8.0,7.7,5.8,7.0,8.77,7.47,6.7,6.66,8.0,7.51,9.0,5.8


The explanations can be derived from the complete table. The users are anonymous so everyone can see how a certain user rated all the books. By being able to see the mean, maximum and minimum rating of all the books from all users it is clear why a certain recommendation was given.

## These are the things I tried to get the title of the book

In [400]:
#full_ratings['ISBN'] = full_ratings.index
#full_ratings

In [409]:
#books_df['ISBN']=books_df['ISBN'].astype(int64)
#combine_book_rating = pd.merge(full_ratings, books_df, on='ISBN')

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [412]:
#final = pd.merge(full_ratings, books_df[['ISBN','bookTitle']], left_on='ISBN', right_on='ISBN', how='left')

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [358]:
#full_ratings['bookTitle'] = full_ratings['ISBN'].map(books_df.set_index('ISBN')['bookTitle'])