## Importing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lenskit import batch, topn, util
from lenskit.algorithms.user_knn import UserUser
from lenskit.algorithms import Recommender
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, mean_squared_error, mean_absolute_error
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor

## Functions

In [2]:
def data_wrangling_fix(dataset = pd.DataFrame([]), key=None, column=None, key_value = None, new_column_value = None):
    """Return a copy of ``dataset`` with one column overwritten on matching rows.

    Rows where ``dataset[key] == key_value`` get ``new_column_value`` written
    into ``column``. The input frame itself is never modified.

    Returns ``None`` (not the frame) when any of ``key``, ``column``,
    ``key_value`` or ``new_column_value`` is ``None``.
    """
    fixed = dataset.copy()

    required_arguments = (key, column, key_value, new_column_value)
    if any(argument is None for argument in required_arguments):
        return None

    matching_rows = fixed[key] == key_value
    fixed.loc[matching_rows, column] = new_column_value

    return fixed

In [3]:
def book_ratings_grouped_by(dataset = pd.DataFrame([]), key_to_group = None, rating_column = 'Book-Rating'):
    """Count non-null ratings per value of ``key_to_group``, largest count first.

    Parameters:
        dataset: frame containing ``key_to_group`` and ``rating_column``.
        key_to_group: column to group by; when ``None`` the function returns ``None``.
        rating_column: column whose non-null entries are counted per group.

    Returns:
        A frame with the grouping column(s) followed by a 'Ratings-Count'
        column, sorted by 'Ratings-Count' in descending order.
    """
    if key_to_group is None:
        return None

    rating_counts = dataset.groupby(key_to_group)[rating_column].count().sort_values(ascending=False)

    # Name the counts directly instead of renaming a hard-coded 'Book-Rating'
    # column, so a custom ``rating_column`` also ends up as 'Ratings-Count'.
    return rating_counts.rename('Ratings-Count').reset_index()

In [4]:
def group_and_merge_ratings(ratings = pd.DataFrame([]), items = pd.DataFrame([]), key_to_group = None):
    """Join ``ratings`` to the ``items`` keys that have ratings.

    The key column of ``items`` is restricted to values that appear in the
    per-key rating counts of ``ratings`` and is then inner-merged with
    ``ratings`` on ``key_to_group``.

    Returns ``None`` when ``key_to_group`` is ``None``.
    """
    if key_to_group is None:
        return None

    rated_keys = book_ratings_grouped_by(ratings, key_to_group)[key_to_group]
    has_ratings = items[key_to_group].isin(rated_keys)
    item_keys = items.loc[has_ratings, key_to_group]

    return pd.merge(item_keys, ratings, on=key_to_group, how='inner')

In [5]:
def filter_items_of_book_ratings_by_threshold(dataset = pd.DataFrame([]), key=None, threshold=20, custom_merge=None):
    """Keep only rows whose ``key`` value has strictly more than ``threshold`` ratings.

    Rating counts are always computed from ``dataset``. The surviving keys are
    merged with ``custom_merge`` when it is given, otherwise with ``dataset``.

    Returns:
        A tuple ``(filtered_rows, kept_counts)`` where ``kept_counts`` is the
        per-key count frame restricted to keys above the threshold.
    """
    counts_per_key = book_ratings_grouped_by(dataset, key_to_group=key)
    above_threshold = counts_per_key['Ratings-Count'] > threshold
    kept_counts = counts_per_key[above_threshold]

    merge_target = dataset if custom_merge is None else custom_merge
    filtered_rows = pd.merge(kept_counts[key], merge_target, on=key)

    return filtered_rows, kept_counts

## Reading

In [6]:
# All three files are read with the same settings: ';' separator, cp1252
# encoding, and malformed lines skipped rather than raising.
users = pd.read_csv(
    'BX-Users.csv',
    sep=';',
    encoding='cp1252',
    on_bad_lines='skip',
    low_memory=False,
)
books = pd.read_csv(
    'BX-Books.csv',
    sep=';',
    encoding='cp1252',
    on_bad_lines='skip',
    low_memory=False,
)
book_ratings = pd.read_csv(
    'BX-Book-Ratings.csv',
    sep=';',
    encoding='cp1252',
    on_bad_lines='skip',
    low_memory=False,
)

### Solve Data Wrangling

In [7]:
# Hand-entered replacement values for three books, keyed by ISBN.
# Each inner mapping is column name -> value to write for that ISBN.
isbn_corrections = {
    '078946697X': {
        'Year-Of-Publication': 2000,
        'Book-Author': 'Michael Teitelbaum',
        'Publisher': 'DK Publishing Inc',
        'Book-Title': 'K Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)',
    },
    '0789466953': {
        'Year-Of-Publication': 2000,
        'Book-Author': 'James Buckley',
        'Publisher': 'DK Publishing Inc',
        'Book-Title': 'K Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)',
    },
    '2070426769': {
        'Year-Of-Publication': 2003,
        'Book-Author': 'jean-marie gustave le clézio',
        'Publisher': 'Gallimard',
        'Book-Title': 'Peuple du ciel, suivi de "Les Bergers"',
    },
}

# Apply the corrections one column at a time, in the order listed above.
for isbn, corrected_fields in isbn_corrections.items():
    for field, value in corrected_fields.items():
        books = data_wrangling_fix(books, 'ISBN', field, isbn, value)

In [8]:
# Drop the three image URL columns, then give the remaining columns explicit dtypes.
books = (
    books
    .drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'])
    .astype({
        'ISBN': 'string',
        'Book-Title': 'string',
        'Book-Author': 'string',
        'Publisher': 'string',
        'Year-Of-Publication': np.uint32,
    })
)

In [9]:
# 'Int64' (capital I) is the nullable pandas integer dtype, so missing values are allowed.
users = users.astype({
    'User-ID': 'Int64',
    'Location': 'string',
    'Age': 'Int64',
})

## Filter Data

In [10]:
# Ratings joined to the ISBNs present in `books`.
book_ratings_books_merged = group_and_merge_ratings(
    ratings=book_ratings,
    items=books,
    key_to_group='ISBN',
)

In [11]:
# Ratings joined to the user ids present in `users`.
book_ratings_users_merged = group_and_merge_ratings(
    ratings=book_ratings,
    items=users,
    key_to_group='User-ID',
)

In [12]:
# Keep only the rows whose 'Book-Rating' is greater than zero.
has_positive_rating = book_ratings_books_merged['Book-Rating'] > 0
book_ratings_nonzero = book_ratings_books_merged[has_positive_rating]

In [13]:
# Keep ratings of books (ISBNs) that have more than 20 non-zero ratings.
book_ratings_filtered, books_grouped_book_ratings_min_rating_count = (
    filter_items_of_book_ratings_by_threshold(
        dataset=book_ratings_nonzero,
        key='ISBN',
        threshold=20,
    )
)

In [14]:
# Of the remaining ratings, keep those by users with more than 20 ratings.
# The merge target defaults to the dataset itself, which is what was passed
# explicitly as custom_merge before.
book_ratings_filtered, users_grouped_book_ratings_min_rating_count = (
    filter_items_of_book_ratings_by_threshold(
        dataset=book_ratings_filtered,
        key='User-ID',
        threshold=20,
    )
)

## User-Item Matrix

In [15]:
# Renamed copy of the filtered ratings using the user / item / rating column names.
book_rating_user_item_matrix = book_ratings_filtered.rename(
    columns={'User-ID': 'user', 'ISBN': 'item', 'Book-Rating': 'rating'}
)

In [16]:
def build_user_based_cf(dataset, max_neighbors=30, min_neighbors=1):
    """Fit a user-user collaborative filtering model on ``dataset`` and return it.

    ``max_neighbors`` and ``min_neighbors`` are passed to ``UserUser`` as
    ``nnbrs`` and ``min_nbrs``. The model is wrapped with
    ``Recommender.adapt`` before fitting, and the fitted wrapper is returned.

    Callers in this notebook pass frames with 'user', 'item' and 'rating'
    columns.
    """
    knn_model = UserUser(nnbrs=max_neighbors, min_nbrs=min_neighbors)

    fitted_recommender = Recommender.adapt(knn_model)
    fitted_recommender.fit(dataset)

    return fitted_recommender

In [17]:
def evaluate(matrix, max_neighbors=30, min_neighbors=1, test_size=0.2, seed = None, relevance_threshold=3):
    """Train a user-user model on a random split of ``matrix`` and score it.

    Parameters:
        matrix: ratings frame with 'user', 'item' and 'rating' columns.
        max_neighbors, min_neighbors: forwarded to ``build_user_based_cf``.
        test_size: fraction of rows held out for testing.
        seed: ``random_state`` for the split; pass a value for a repeatable split.
        relevance_threshold: a rating (true or predicted) strictly above this
            value counts as "relevant" for precision / recall / F-score.

    Returns:
        ``(precision, recall, fscore, rmse, mae)``. Precision, recall and
        F-score are computed on the binary relevance labels. RMSE and MAE are
        computed on the actual versus predicted ratings, using only the test
        rows for which a prediction is available; both are NaN when there is
        no such row.
    """
    training_set, test_set = train_test_split(matrix, test_size=test_size, random_state=seed)
    # Explicit copy so that adding a column never writes into a slice of `matrix`.
    test_set = test_set.copy()

    recsys = build_user_based_cf(training_set, min_neighbors=min_neighbors, max_neighbors=max_neighbors)

    test_set['predicted_rating'] = recsys.predict(test_set)

    # Comparisons with NaN are False, so pairs without a prediction are
    # labelled "not relevant", exactly as the element-wise check did before.
    y_test = (test_set['rating'] > relevance_threshold).astype(int)
    y_pred = (test_set['predicted_rating'] > relevance_threshold).astype(int)

    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")

    # Error metrics belong on the rating scale, not on the 0/1 relevance labels
    # (on binary labels MAE is just the misclassification rate).
    scored = test_set.dropna(subset=['rating', 'predicted_rating'])
    if scored.empty:
        rmse = mae = float('nan')
    else:
        # Take the square root manually: the `squared=False` argument is
        # deprecated / removed in recent scikit-learn releases.
        rmse = mean_squared_error(scored['rating'], scored['predicted_rating']) ** 0.5
        mae = mean_absolute_error(scored['rating'], scored['predicted_rating'])

    return precision, recall, fscore, rmse, mae

In [18]:
enable_experiments = False

if enable_experiments:
    # Ensure the split is always the same for reproducibility purposes
    random_seed = 42

    exp_book_ratings_threshold = [5, 10, 15, 20, 25, 30, 50, 75]
    exp_user_ratings_threshold = [5, 10, 15, 20, 25, 30, 50, 75]
    exp_min_neighbors_values = [1]
    exp_max_neighbors_values = [20]
    exp_test_size = [0.2]

    # The 'MRSE' column holds the RMSE value returned by evaluate().
    exp_result_columns = ['Book Ratings Threshold', 'User Ratings Threshold', 'Min Neighbors', 'Max Neighbors', 'Train-Test Split', 'Book Count', 'User Count', 'Rating Count', 'Precision', 'Recall', 'FScore', 'MRSE', 'MAE']

    # Collect one dict per run and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all earlier rows each time.
    exp_records = []

    for brt in exp_book_ratings_threshold:
        # The book-level filter depends only on brt, so compute it once per value.
        exp_books_dataset, exp_books_min_rating = filter_items_of_book_ratings_by_threshold(book_ratings_nonzero, 'ISBN', brt)

        for urt in exp_user_ratings_threshold:
            # Likewise the user-level filter depends only on (brt, urt).
            exp_dataset, exp_users_min_rating = filter_items_of_book_ratings_by_threshold(exp_books_dataset, 'User-ID', urt)
            exp_dataset = exp_dataset.rename(columns={'User-ID': 'user', 'ISBN': 'item', 'Book-Rating': 'rating'})

            for minnb in exp_min_neighbors_values:
                for maxnb in exp_max_neighbors_values:
                    if minnb > maxnb:
                        continue

                    for ts in exp_test_size:
                        exp_precision, exp_recall, exp_fscore, exp_rmse, exp_mae = evaluate(exp_dataset, max_neighbors=maxnb, min_neighbors=minnb, test_size=ts, seed=random_seed)
                        exp_records.append({
                            'Book Ratings Threshold': brt,
                            'User Ratings Threshold': urt,
                            'Min Neighbors': minnb,
                            'Max Neighbors': maxnb,
                            'Train-Test Split': ts,
                            'Book Count': exp_dataset['item'].nunique(),
                            'User Count': exp_dataset['user'].nunique(),
                            # One row per rating; `.size` would be rows * columns.
                            'Rating Count': len(exp_dataset),
                            'Precision': exp_precision,
                            'Recall': exp_recall,
                            'FScore': exp_fscore,
                            'MRSE': exp_rmse,
                            'MAE': exp_mae
                        })

                        print(f"Finished {brt}-{urt}-{minnb}-{maxnb}-{ts}")

    exp_results = pd.DataFrame(exp_records, columns=exp_result_columns)

In [19]:
#exp_results

## Content-Boosting

In [20]:
def predict(user):
    """Predict ratings for the books ``user`` has not rated, from book titles.

    Reads the module-level ``book_ratings_CF`` (columns 'user', 'item',
    'rating') and ``books`` (columns 'ISBN', 'Book-Title', ...). A TF-IDF
    vectorizer is fitted on the titles of the books the user rated, and a
    k-nearest-neighbours regressor (k = 2, or fewer when there are fewer
    training rows) is fitted on those vectors against the user's ratings. The
    regressor is then applied to the titles of every other book that occurs in
    ``book_ratings_CF``.

    Returns:
        A frame with columns 'item' and 'predicted', one row per unrated book.
    """
    selected_user_ratings = book_ratings_CF.loc[book_ratings_CF['user'] == user]
    selected_user_rated_books = selected_user_ratings['item']

    # rename() without inplace returns a new frame, so the slices taken from
    # `books` are never modified through a view.
    rated_books = books.loc[books['ISBN'].isin(selected_user_rated_books)].rename(columns={'ISBN': 'item'})
    rated_books_df = pd.merge(rated_books, selected_user_ratings, on='item', how='inner')

    # Items that appear in the ratings table but were not rated by this user.
    selected_user_unrated_books = book_ratings_CF.loc[~book_ratings_CF['item'].isin(selected_user_rated_books), 'item']
    unrated_books = books.loc[books['ISBN'].isin(selected_user_unrated_books)].rename(columns={'ISBN': 'item'})

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(rated_books_df['Book-Title'])
    y = rated_books_df['rating']

    # Clamp k to the number of rows actually used for fitting. The merged
    # training frame can have a different length than the user's raw ratings
    # (e.g. a rated item missing from `books`), and k must not exceed it.
    neighbors = min(2, len(rated_books_df))
    neigh = KNeighborsRegressor(n_neighbors=neighbors)
    neigh.fit(X, y)

    X_unrated = vectorizer.transform(unrated_books['Book-Title'])

    predictions = unrated_books[['item']].copy()
    predictions['predicted'] = neigh.predict(X_unrated)
    return predictions

In [21]:
def fill_missing_ratings(user, matrix, predictions):
    """Fill the null cells of ``user``'s row in ``matrix`` from ``predictions``.

    Parameters:
        user: row label in ``matrix``.
        matrix: user x item frame; it is modified in place and also returned.
        predictions: frame with 'item' and 'predicted' columns. Item values are
            converted with ``str`` before being used as column labels. If an
            item occurs more than once, its first prediction is used.

    Existing (non-null) ratings are never overwritten. An item that is not a
    column of ``matrix`` raises ``KeyError``.
    """
    if len(predictions) == 0:
        return matrix

    # One lookup Series (item label -> predicted value) instead of scanning
    # `predictions` once per item.
    predicted = pd.Series(
        predictions['predicted'].to_numpy(),
        index=predictions['item'].astype(str).to_numpy(),
    )
    predicted = predicted[~predicted.index.duplicated(keep='first')]

    current = matrix.loc[user, predicted.index]
    to_fill = current.isna().to_numpy()

    if to_fill.any():
        matrix.loc[user, predicted.index[to_fill]] = predicted.to_numpy()[to_fill]

    return matrix

In [22]:
def recompute_matrix(matrix):
    """Fill every user's missing ratings in ``matrix`` with predictions.

    For each row label in ``matrix.index``, ``predict`` is called for that
    user and its result is passed to ``fill_missing_ratings``. The frame
    returned by the last fill is returned.
    """
    for user_id in matrix.index:
        matrix = fill_missing_ratings(user_id, matrix, predict(user_id))

    return matrix

In [23]:
# Build the renamed frame with a non-inplace rename. Wrapping the source in
# pd.DataFrame(...) does not copy it, so an in-place rename on the wrapper can
# (depending on the pandas version) also rename the columns of
# book_ratings_filtered. rename() returns a new frame and leaves the source as is.
book_ratings_CF = book_ratings_filtered.rename(
    columns={'User-ID': 'user', 'ISBN': 'item', 'Book-Rating': 'rating'}
)

In [24]:
# One row per user, one column per item; cells without a rating are NaN.
matrix = book_ratings_CF.pivot(
    index='user',
    columns='item',
    values='rating',
)

In [25]:
# Fill a copy so the original pivoted `matrix` keeps its missing cells.
content_matrix = recompute_matrix(matrix.copy())

In [26]:
# Back to long format: one (user, item, rating) row per matrix cell.
content_matrix_with_user_column = content_matrix.reset_index()
book_ratings_new = content_matrix_with_user_column.melt(
    id_vars='user',
    var_name='item',
    value_name='rating',
)

In [27]:
enable_experiments = True

if enable_experiments:
    # Ensure the split is always the same for reproducibility purposes
    random_seed = 42

    exp_book_ratings_threshold = [5, 10, 15, 20, 25, 30, 50, 75]
    exp_user_ratings_threshold = [5, 10, 15, 20, 25, 30, 50, 75]
    exp_min_neighbors_values = [1]
    exp_max_neighbors_values = [20]
    exp_test_size = [0.2]

    # The 'MRSE' column holds the RMSE value returned by evaluate().
    boosted_result_columns = ['Book Ratings Threshold', 'User Ratings Threshold', 'Min Neighbors', 'Max Neighbors', 'Train-Test Split', 'Book Count', 'User Count', 'Rating Count', 'Precision', 'Recall', 'FScore', 'MRSE', 'MAE']

    # Collect one dict per row and build the frame once at the end instead of
    # growing it with pd.concat inside the loop.
    boosted_records = []

    # NOTE(review): brt / urt are only recorded as labels below. The evaluated
    # data is always the full `book_ratings_new`, which is never re-filtered by
    # these thresholds, so every (brt, urt) row reports the same metrics.
    # Confirm whether the thresholds should be applied before content boosting.
    boosted_book_count = book_ratings_new['item'].nunique()
    boosted_user_count = book_ratings_new['user'].nunique()
    # One row per rating; `.size` would be rows * columns.
    boosted_rating_count = len(book_ratings_new)

    # evaluate() gets the same data and seed for every threshold pair, so run
    # it once per distinct (min neighbours, max neighbours, test size) and
    # reuse the result. Presumably the fit is deterministic for a fixed split.
    boosted_metrics = {}

    for brt in exp_book_ratings_threshold:
        for urt in exp_user_ratings_threshold:
            for minnb in exp_min_neighbors_values:
                for maxnb in exp_max_neighbors_values:
                    if minnb > maxnb:
                        continue

                    for ts in exp_test_size:
                        exp_config = (minnb, maxnb, ts)
                        if exp_config not in boosted_metrics:
                            boosted_metrics[exp_config] = evaluate(book_ratings_new, max_neighbors=maxnb, min_neighbors=minnb, test_size=ts, seed=random_seed)
                        exp_precision, exp_recall, exp_fscore, exp_rmse, exp_mae = boosted_metrics[exp_config]

                        boosted_records.append({
                            'Book Ratings Threshold': brt,
                            'User Ratings Threshold': urt,
                            'Min Neighbors': minnb,
                            'Max Neighbors': maxnb,
                            'Train-Test Split': ts,
                            'Book Count': boosted_book_count,
                            'User Count': boosted_user_count,
                            'Rating Count': boosted_rating_count,
                            'Precision': exp_precision,
                            'Recall': exp_recall,
                            'FScore': exp_fscore,
                            'MRSE': exp_rmse,
                            'MAE': exp_mae
                        })

                        print(f"Finished {brt}-{urt}-{minnb}-{maxnb}-{ts}")

    boosted_exp_results = pd.DataFrame(boosted_records, columns=boosted_result_columns)

In [28]:
# Display the results table built by the experiment cell above
# (only defined when enable_experiments was True in that cell).
boosted_exp_results

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c748b8c6-a06d-415c-803a-0d3a975a7798' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>