In [1]:
import pandas as  pd
import matplotlib.pyplot as plt
from lenskit.algorithms import Recommender
from lenskit.algorithms.item_knn import ItemItem
from scipy.sparse import csr_matrix
from lenskit.algorithms.user_knn import UserUser
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, roc_curve, auc, mean_squared_error, mean_absolute_error

# Helper methods

In [2]:
def plot_roc_curve(roc_auc, fpr, tpr):
    plt.figure()
    lw = 2
    plt.plot(
        fpr,
        tpr,
        color="darkorange",
        lw=lw,
        label="ROC curve (area = %0.2f)" % roc_auc,
    )
    plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic example")
    plt.legend(loc="lower right")
    plt.show()

In [3]:
def item_explanation(recommended_books):
    print(list(recommended_books['book_title']))
    print("Based on your previous ratings we found books that similar to books which you have already rated.")
    if input("Do you want detailed description of your recommendation? y/n") == 'y':
        print("The algorithm computed predicted scores to demonstrate how the book fits with your preferences.")
        print("Above illustrated books and their predicted scores")
        print(recommended_books.head(10))

# Read & Analyze Data

In [4]:
#Read datasets 
users = pd.read_csv('BX-Users.csv', sep = ';', encoding='cp1252')
books = pd.read_csv('BX-Books.csv', sep=';', encoding='cp1252', on_bad_lines='skip', low_memory=False)
book_ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding='cp1252')

In [5]:
# Shapes of our datasets
print("Books Shape: ", books.shape )
print("Book ratings Shape: ", book_ratings.shape )
print("Users Shape: ", users.shape )

Books Shape:  (271360, 8)
Book ratings Shape:  (1149780, 3)
Users Shape:  (278858, 3)


In [6]:
# Calculating nulls in our datasets
print("Books null information: \n", books.isnull().sum())
print("Book ratings null information: \n", book_ratings.isnull().sum())
print("Users null information: \n", users.isnull().sum())

Books null information: 
 ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64
Book ratings null information: 
 User-ID        0
ISBN           0
Book-Rating    0
dtype: int64
Users null information: 
 User-ID          0
Location         0
Age         110762
dtype: int64


# Prepocessing

In [7]:
# Convert all names of columns to lower case
users.columns = users.columns.str.strip().str.lower().str.replace('-', '_')
book_ratings.columns = book_ratings.columns.str.strip().str.lower().str.replace('-', '_')
books.columns = books.columns.str.strip().str.lower().str.replace('-', '_')

In [8]:
# Merge books and book ratings datasets together by ISBN
books_dataset = books.merge(book_ratings, on="isbn")
# Drop nulls
books_dataset = books_dataset.dropna(subset = ['book_author', 'publisher'])
# Drop small and large images of books
books_dataset = books_dataset.drop(columns=['image_url_s', 'image_url_l', 'image_url_m'], axis=1)
# Reset indices
books_dataset = books_dataset.reset_index(drop=True)
print(books_dataset.isnull().sum())
print("Books dataset shape: ", books_dataset.shape)
print("Unique books in datasets: ", len(books_dataset.isbn.unique()))

isbn                   0
book_title             0
book_author            0
year_of_publication    0
publisher              0
user_id                0
book_rating            0
dtype: int64
Books dataset shape:  (1031133, 7)
Unique books in datasets:  270148


In [9]:
def preprocess_date(data, min_rating=10):
    data_copy = data.copy()
    # Calculate how many books with 0 ratings we have
    zero_ratings = data_copy[data_copy['book_rating'] == 0]
    # print("Total number of books with zero ratings: ", zero_ratings.shape)
    # Remove books with zero ratings
    data_copy = data_copy.drop(index=zero_ratings.index)
    data_copy = data_copy.reset_index(drop=True)
    # print("Unique books in datasets: ", len(books_dataset.isbn.unique()))
    # print("Books dataset shape after dropping 0 ratings: ", books_dataset.shape)

    # Calculate how many times users rated books
    # users_ratings_count = pd.DataFrame(books_dataset['user_id'].value_counts())
    #
    # urc_indices = users_ratings_count[users_ratings_count['user_id'] >= 10].index
    # print("Users which rated at least 10 books = ", len(urc_indices.values))
    # books_dataset = books_dataset[books_dataset['user_id'].isin(urc_indices)]
    # books_dataset = books_dataset.reset_index(drop=True)


    # Calculate how many times books were rated
    books_ratings_count = pd.DataFrame(data_copy['isbn'].value_counts())

    brc_indices = books_ratings_count[books_ratings_count['isbn'] >= min_rating].index
    # print("Books which were rated at least 50 times = ",len(brc_indices.values))
    data_copy = data_copy[data_copy['isbn'].isin(brc_indices)]
    data_copy = data_copy.reset_index(drop=True)
    data_copy = data_copy.rename(columns={'isbn':'item', 'user_id':'user', 'book_rating':'rating'})
    return data_copy

In [10]:
books = books.rename(columns={'isbn':'item'})

# print("Books dataset shape after dropping books with less than 50 ratings: ", books_dataset.shape)

# books_dataset.nunique()

In [11]:
books_dataset.head(3)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,book_rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0


In [12]:
# Converting our dataframe to scipy sparse matrix
# books_features = books_dataset.pivot(
#     index='item',
#     columns='user',
#     values='rating'
# ).fillna(0)
#
# mat_books_features = csr_matrix(books_features.values)

In [13]:
# books_features.head(10)

# Item-Based Collaborative Filtering

## Model Evaluation

In [14]:
# Testing information dataset
test_data = pd.DataFrame(columns=["unique_books", "unique_users", "min_books_rating", "min_neighbours", "max_neighbours", "test_size", "precision", "recall", "fscore", "accuracy", "confussion_matrix", "rmse", "mae", "fpr", "tpr", "roc_auc"])

In [15]:
def evaluate_model_item_based_individual(data, max_nbrs=30, min_nbrs=1, test_size=0.2):
    train_df, test_df = train_test_split(data, test_size=test_size)

    item_item = ItemItem(max_nbrs, min_nbrs=min_nbrs)
    recsys = Recommender.adapt(item_item)
    recsys.fit(train_df)

    test_df['predicted_rating'] = recsys.predict(test_df)

    test_df['relevant'] = test_df['rating'].apply(lambda x: 1 if x>3 else 0)
    test_df['predicted_relevant'] = test_df['predicted_rating'].apply(lambda x: 1 if x>3 else 0)

    y_test = list(test_df['relevant'])
    y_pred = list(test_df['predicted_relevant'])
    # precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")
    # fpr, tpr, _ = roc_curve(y_test, y_pred)
    # roc_auc = auc(fpr, tpr)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    return rmse, mae


In [16]:
testing = False
if testing:
    min_rating = [10, 15, 20, 25, 30, 35]
    min_nbrs = [1]
    max_nbrs = [12]
    test_size = [0.2]
    for mr in min_rating:
        for min_n in min_nbrs:
            for max_n in max_nbrs:
                if min_n > max_n:
                        continue
                for ts in test_size:
                    data_tmp = preprocess_date(books_dataset, min_rating=mr)
                    rmse, mae = evaluate_model_item_based_individual(data_tmp, max_nbrs=max_n, min_nbrs=min_n, test_size=ts)

                    print(str(mr) + " " + str(min_n) + " " + str(max_n) + " " +  str(ts))
                    new_row = {"unique_books": len(data_tmp.item.unique()), "unique_users": len(data_tmp.user.unique()), "min_books_rating": mr, "min_neighbours": min_n, "max_neighbours": max_n, "test_size": ts, "rmse": rmse, "mae": mae}

                    test_data = test_data.append(new_row , ignore_index=True)

In [17]:
# test_data = test_data.sort_values(by='rmse', ascending=True).reset_index(drop=True)


In [18]:
# compression_opts = dict(method='zip',
#                         archive_name='out.csv')
# test_data.to_csv('out.zip', index=False,
#           compression=compression_opts)

## Recommendation

In [19]:
def item_based(data, selected_user, num_recs=10, max_nbrs=12, min_nbrs=1):
    # We use the collaborative item algorithm ItemItem, that use the nearest neighbors
    item_item = ItemItem(max_nbrs, min_nbrs=min_nbrs)  # Minimum (3) and maximum (15) number of neighbors to consider
    recsys = Recommender.adapt(item_item)
    recsys.fit(data)
    recommended_books = recsys.recommend(selected_user, num_recs) # generate 10 recommendations for the selected user
    selected_books_item_item = pd.merge(data, recommended_books, on='item', how = 'inner')
    selected_books_item_item = selected_books_item_item.drop_duplicates(subset='item')
    selected_books_item_item = selected_books_item_item.drop(columns=['rating'], axis=1)
    selected_books_item_item = selected_books_item_item.reset_index(drop=True)
    item_explanation(selected_books_item_item)
    return selected_books_item_item

In [20]:
data_individual = preprocess_date(books_dataset, min_rating=25)
recommendations = item_based(data_individual, int(data_individual.sample(n=1)['user']))

could not load LIBBLAS: Could not find module 'libblas' (or one of its dependencies). Try using the full path with constructor syntax.
Numba is using threading layer omp - consider TBB
found 1 potential runtime problems - see https://boi.st/lkpy-perf


['The Giver (Readers Circle)', 'Love You Forever', 'The Godfather', 'The Lord of the Rings (Movie Art Cover)', 'The Two Towers (The Lord of the Rings, Part 2)', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', 'Illusions', "Sabine's Notebook: In Which the Extraordinary Correspondence of Griffin &amp; Sabine Continues", 'Patriot Games (Jack Ryan Novels)', 'Vector']
Based on your previous ratings we found books that similar to books which you have already rated.
The algorithm computed predicted scores to demonstrate how the book fits with your preferences.
Above illustrated books and their predicted scores
         item                                         book_title  \
0  0440237688                         The Giver (Readers Circle)   
1  0920668372                                   Love You Forever   
2  0451167716                                      The Godfather   
3  0618129022            The Lord of the Rings (Movie Art Cover)   
4  0618002235     The Two Towers (T

# Aggregation Strategies

In [42]:
# Approval voting strategy (Majority Based)
def approval_voting_stg(recommended_books, threshold):
    # Drop all books below threshold
    recommended_books = recommended_books[recommended_books['predicted_rating'] > threshold]
    # Calculate votes to each book
    recommended_books = pd.DataFrame(recommended_books['item'].value_counts())
    # Sort in descending order (max to min)
    recommended_books = recommended_books.sort_values(by='item', ascending=False).reset_index(drop=True)
    return recommended_books

# Average strategy (Consensus based)
def average_stg(recommended_books, group_size):
    # Calculate average scores for each book
    recommended_books = recommended_books.groupby(['item'])['predicted_rating'].sum() / group_size

    # Sort in descending order (max to min)
    # recommended_books = recommended_books.sort_values(by='predicted_rating', ascending=False).reset_index(drop=True)
    return recommended_books

# Most pleasure strategy (Borderline)
def most_pleasure_stg(recommended_books):
    # Find the maximum score for each book
    recommended_books = recommended_books.groupby(['item']).max().reset_index(drop=True)
    # Sort in descending order (max to min)
    recommended_books = recommended_books.sort_values(by='predicted_rating', ascending=False).reset_index(drop=True)
    return recommended_books

# Group Recommendation

In [43]:
def item_based_group(data, group_size=10, agg_strategy=2):
    # Select random sample
    random_selected = list(data.sample(n=group_size).reset_index(drop=True)['user'])

    # Select remained users
    remained_users = list(data.user.unique())
    for i in random_selected:
        remained_users.remove(i)

    # Create unseen books with random user for prediction
    unseen_books = data[data['user'].isin(remained_users)]
    unseen_books = unseen_books.drop(columns=['user', 'rating'], axis=1)
    unseen_books = unseen_books.drop_duplicates(subset='item')
    unseen_books_group = pd.DataFrame()
    for i in random_selected:
        unseen_books['user'] = i
        unseen_books_group = unseen_books_group.append(unseen_books)
    unseen_books_group = unseen_books_group.reset_index(drop=True)

    # Item Item CF
    item_item = ItemItem(10, min_nbrs=1)
    recsys = Recommender.adapt(item_item)
    recsys.fit(data)

    unseen_books_group['predicted_rating'] = recsys.predict(unseen_books_group)

    # Drop nulls
    group_unseen_books = unseen_books_group.dropna(subset = ['predicted_rating'])

    # Drop books with null in matrix
    group_unseen_books_count = pd.DataFrame(group_unseen_books['item'].value_counts())

    gun_indices = group_unseen_books_count[group_unseen_books_count['item'] == group_size].index

    group_unseen_books = group_unseen_books[group_unseen_books['item'].isin(gun_indices)]

    # # Converting our dataframe to scipy sparse matrix
    # books_features = group_unseen_books.pivot(
    #     index='item',
    #     columns='user',
    #     values='predicted_rating'
    # ).fillna(0)

    if agg_strategy == 1:
        recommendation = approval_voting_stg(group_unseen_books, 6)
    elif agg_strategy == 2:
        recommendation = average_stg(group_unseen_books, group_size)
    else:
        recommendation = most_pleasure_stg(group_unseen_books)

    return recommendation

In [44]:
data_group = preprocess_date(books_dataset, min_rating=25)

group_recommendation = item_based_group(data_group)
group_recommendation.head(10)

  unseen_books_group = unseen_books_group.append(unseen_books)
  unseen_books_group = unseen_books_group.append(unseen_books)
  unseen_books_group = unseen_books_group.append(unseen_books)
  unseen_books_group = unseen_books_group.append(unseen_books)
  unseen_books_group = unseen_books_group.append(unseen_books)
  unseen_books_group = unseen_books_group.append(unseen_books)
  unseen_books_group = unseen_books_group.append(unseen_books)
  unseen_books_group = unseen_books_group.append(unseen_books)
  unseen_books_group = unseen_books_group.append(unseen_books)
  unseen_books_group = unseen_books_group.append(unseen_books)


item
006000438X    6.979271
0060929871    7.185851
0060930535    7.637784
0060934417    7.795019
0060959037    7.934099
0060977493    7.156123
014023313X    6.891096
0312201656    7.254784
0312278586    6.868768
0312924585    8.063210
Name: predicted_rating, dtype: float64