# The goal is to find books similar to The Lord of the Rings
* Method: collaborative filtering using cosine similarity

In [1]:
#Importing libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from scipy import spatial
from numpy.linalg import norm

In [2]:
# Load the three Book-Crossing tables. The files are read as ';'-separated,
# latin-1 encoded text; malformed rows are skipped with a warning.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0 (this form requires pandas >= 1.3).
csv_options = {'sep': ';', 'on_bad_lines': 'warn', 'encoding': 'latin-1'}
book_df = pd.read_csv('../input/booksratings/BX-Books.csv', **csv_options)
ratings_df = pd.read_csv('../input/booksratings/BX-Book-Ratings.csv', **csv_options)
user_df = pd.read_csv('../input/booksratings/BX-Users.csv', **csv_options)

# A glimpse at dataframes
* Size
* Variables (columns)

In [3]:
# Report the (rows, columns) size of the books table, then preview two rows.
print(book_df.shape)
book_df.iloc[:2]

In [4]:
# Report the (rows, columns) size of the ratings table.
print(ratings_df.shape)
# A bare expression in the middle of a cell is never displayed, so the
# missing-value check is printed explicitly.
print(ratings_df['Book-Rating'].isnull().values.any())
ratings_df.head(2)

In [5]:
# Report the (rows, columns) size of the users table, then preview two rows.
print(user_df.shape)
user_df.iloc[:2]

# First thoughts
   * **Cluster analysis** - divide books into clusters based on the books' attributes; books in the same cluster as LotR would be the ones recommended
       * The available data doesn't provide much information about a book's content -> web-scraping more data? -> too complicated -> looking for a different approach
   * **Collaborative filtering**
       * Seemed suitable for this task, especially item-to-item collaborative filtering
    

# Calculating mean rating and ratings count for each book
* explicit ratings (1-10) vs. implicit ratings (0)
* Average-Rating - mean explicit rating; equal to NaN if a book has only implicit ratings
* num_of_ratings - count of explicit ratings
* num_of_all_ratings - count of both types of ratings

In [6]:
# Per-book rating statistics, indexed by ISBN:
#   num_of_all_ratings - count of explicit (1-10) and implicit (0) ratings
#   num_of_ratings     - count of explicit ratings only
#   Average-Rating     - mean explicit rating (NaN when a book has none)
# Zeros are masked in the 'Book-Rating' column only (a frame-wide replace would
# also touch any other column holding 0), and `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0.
explicit_ratings = ratings_df['Book-Rating'].replace(0, np.nan)
explicit_by_book = explicit_ratings.groupby(ratings_df['ISBN'])
ratings = pd.DataFrame({
    'num_of_all_ratings': ratings_df.groupby('ISBN')['Book-Rating'].count(),
    'num_of_ratings': explicit_by_book.count(),
    'Average-Rating': explicit_by_book.mean(),
})
print(ratings.shape)
# Number of books that have no explicit rating at all.
print((ratings['num_of_ratings'] == 0).sum())
ratings.head(10)

In [7]:
# Frequency table of rating counts per book: the five smallest and the five
# largest counts, first for all ratings and then for explicit ratings only.
for count_column in ('num_of_all_ratings', 'num_of_ratings'):
    frequency = ratings[count_column].value_counts().sort_index()
    print(frequency.head(5))
    print(frequency.tail(5))

In [8]:
# Histograms of the number of ratings per book, using as many bins as the
# largest observed count; the x-axis is clipped to the low end of the range.
fig, ax = plt.subplots(2, 1, figsize=(15, 10))
panels = (
    (ax[0], 'num_of_all_ratings', 50),
    (ax[1], 'num_of_ratings', 25),
)
for panel, column, x_max in panels:
    panel.hist(ratings[column], bins=ratings[column].max())
    panel.set_title(column)
    panel.set_xlabel('ratings count')
    panel.set_ylabel('books count')
    panel.set_xlim(0, x_max)
plt.show()

In [9]:
# Bar chart of how often each rating value occurs in the ratings table.
rating_counts = ratings_df['Book-Rating'].value_counts().sort_index()
fig, ax = plt.subplots()
ax.bar(rating_counts.index, rating_counts.values)
ax.set_title('Rating Distribution\n')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')


# Findings
   * Most of the ratings are implicit ratings (=0), approx. 700,000 out of 1,149,780 ratings
       * Excluding implicit ratings might lead to a huge loss of data. I should find a way to deal with implicit ratings
   * The majority of books have fewer than 10 implicit ratings and fewer than 5 explicit ratings
       * This will lead to sparsity of the user-item matrix ('book_mat') later on

# LotR books
* Average ratings and ratings counts for LotR books
* There are 119 LotR books
* Simplification: I will look for books similar to The Fellowship of the Ring (The Lord of the Rings), ISBN 0345339703
    * There is more than one edition of The Fellowship of the Ring. I should have combined them in order to get more ratings for this book. Unfortunately, I noticed this issue too late
    * Or combine all LotR books for the same reason

In [10]:
# Attach titles to the per-book statistics, most-rated books first, then list
# every book whose title mentions "Lord of the Rings" (case-insensitive).
titles = book_df[['ISBN', 'Book-Title']]
books_merged = ratings.merge(titles, on='ISBN').sort_values('num_of_ratings', ascending=False)
is_lotr = books_merged['Book-Title'].str.contains("Lord of the Rings", case=False, regex=False)
books_merged[is_lotr]

# Filtering ratings of users who rated LotR part 1
* Similarity will be calculated based on ratings of users who rated target book. Ratings of other users can be filtered out.
* 75,927 out of 1,149,780 ratings

In [11]:
# Preview the first five rows of the ratings table.
ratings_df.iloc[:5]

In [12]:
# Keep only the ratings made by users who rated the target book, since only
# their ratings take part in the similarity calculation.
target = '0345339703'
users_filter = ratings_df[ratings_df['ISBN'] == target]
rated_target = ratings_df['User-ID'].isin(users_filter['User-ID'])
ratings_df = ratings_df[rated_target]
ratings_df.shape

# Number of ratings recalculation
* To get the count of ratings that will be used for the similarity calculation

In [13]:
# Recompute the per-book statistics (indexed by ISBN) on the current
# ratings_df:
#   num_of_all_ratings - count of explicit (1-10) and implicit (0) ratings
#   num_of_ratings     - count of explicit ratings only
#   Average-Rating     - mean explicit rating (NaN when a book has none)
# Zeros are masked in the 'Book-Rating' column only (a frame-wide replace would
# also touch any other column holding 0), and `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0.
explicit_ratings = ratings_df['Book-Rating'].replace(0, np.nan)
explicit_by_book = explicit_ratings.groupby(ratings_df['ISBN'])
ratings = pd.DataFrame({
    'num_of_all_ratings': ratings_df.groupby('ISBN')['Book-Rating'].count(),
    'num_of_ratings': explicit_by_book.count(),
    'Average-Rating': explicit_by_book.mean(),
})
print(ratings.shape)
# Number of books that have no explicit rating at all.
print((ratings['num_of_ratings'] == 0).sum())
ratings.head(10)

# User - Book matrix
* rows - users; columns - books; values -ratings
* 257 users, 52,182 books

In [14]:
# User x book matrix: one row per user, one column per ISBN, ratings as values
# (pivot_table averages duplicate user/book pairs by default).
book_mat = pd.pivot_table(ratings_df, index='User-ID', columns='ISBN', values='Book-Rating')
print(book_mat.shape)
book_mat.sample(5)

# Data preparation for similarity calculation:
Implicit ratings will be treated as neutral (mean) ratings, not as bad ratings (=0)
* Calculation of the mean rating for each user.
    * Missing values (NaNs) and implicit ratings (=0) are not included in the calculation of the means
* Replacement of implicit ratings by the user's mean rating (zeros in each row are replaced by the row mean).
* **Centering:** for each row, the row mean is subtracted from the row's elements (the transformed implicit ratings are included in the calculation of the row means)
    * conservative vs liberal raters, see paragraph "Adjusted cosine similarity" in http://files.grouplens.org/papers/www10_sarwar.pdf


In [15]:
# Mean explicit rating of every user: one entry per row of book_mat, in row
# order. Zeros (implicit ratings) are filtered out and NaNs are skipped, so a
# user without any explicit rating gets NaN.
users_means = [
    user_row[user_row != 0].mean(skipna=True)
    for _, user_row in book_mat.iterrows()
]

In [16]:
# Treat implicit ratings as neutral: every 0 in a user's row is replaced by
# that user's mean explicit rating.
for row_pos, user_mean in enumerate(users_means):
    user_row = book_mat.iloc[row_pos, :]
    book_mat.iloc[row_pos, :] = user_row.replace(0, user_mean)

In [17]:
# Centering: subtract each row's mean (NaNs skipped) from that row's values.
row_means = book_mat.mean(axis=1, skipna=True)
book_mat = book_mat.sub(row_means, axis=0)

In [18]:
# Display the per-row means of book_mat (NaNs skipped).
book_mat.mean(axis='columns', skipna=True)

# Most similar books to The Fellowship of the Ring (ISBN 0345339703)
* **Cosine similarity** will be used as the similarity metric
* cosine_sim function
    * a function that calculates the cosine similarity between 2 pandas Series (the ratings of 2 books)
    * it isolates only the users who rated both books

In [19]:
# Column of book_mat holding the ratings for the target book (ISBN 0345339703).
book_target_user_ratings = book_mat.loc[:, '0345339703']

In [20]:
def cosine_sim(df1, df2):
    """Return the cosine similarity of two rating vectors over their common raters.

    Only positions where both inputs hold a value (non-NaN) contribute.

    Args:
        df1: pandas Series of ratings for the first book.
        df2: pandas Series of ratings for the second book, sharing df1's index.

    Returns:
        The cosine of the angle between the two co-rated vectors, or NaN when
        it is undefined: no common raters, or one of the vectors has zero norm.
    """
    rated_both = df1.notna() & df2.notna()
    left = df1[rated_both].to_numpy(dtype=float)
    right = df2[rated_both].to_numpy(dtype=float)

    # Guard the division: 0/0 would also yield NaN, but it emits a
    # RuntimeWarning for every such pair of vectors.
    denominator = norm(left) * norm(right)
    if denominator == 0:
        return np.nan

    return np.dot(left, right) / denominator

In [21]:
# Similarity between the target book and every column (book) of book_mat, in
# column order.
books_sim = [
    cosine_sim(book_target_user_ratings, book_ratings)
    for _, book_ratings in book_mat.items()
]

In [24]:
# Build the result table: similarity per ISBN (rows with a NaN similarity are
# dropped), enriched with each book's title and rating statistics.
coss_book_target = pd.DataFrame({'ISBN': book_mat.columns, 'cos_sim': books_sim}).dropna()
coss_book_target = (
    coss_book_target
    .merge(book_df[['ISBN', 'Book-Title']], on='ISBN')
    .merge(ratings[['num_of_ratings', 'num_of_all_ratings', 'Average-Rating']], on='ISBN')
)
# Keep only books with num_of_ratings above 15, then take the 10 most similar.
enough_ratings = coss_book_target['num_of_ratings'] > 15
top_10_sim = coss_book_target[enough_ratings].sort_values('cos_sim', ascending=False).head(10)
top_10_sim

In [23]:
# Scatter plots of user ratings from book_mat: the target book (x) against up
# to nine of the most similar books (y), one panel per book.
fig, axs = plt.subplots(3, 3, figsize=(20, 20))

target_isbn = '0345339703'
# Use the title string itself as the axis label (passing a list rendered its
# repr, e.g. "['Title']"); fall back to the ISBN if no title row is found.
target_titles = coss_book_target.loc[coss_book_target['ISBN'] == target_isbn, 'Book-Title']
target_label = target_titles.iloc[0] if not target_titles.empty else target_isbn

# Row 0 of top_10_sim is skipped, as before: presumably it is the target book
# itself (cos_sim = 1). Slicing instead of indexing positions 1..9 avoids an
# IndexError when fewer than ten books pass the filter.
neighbours = top_10_sim.iloc[1:10]
for panel, (_, neighbour) in zip(axs.flat, neighbours.iterrows()):
    panel.scatter(book_mat[target_isbn], book_mat[neighbour['ISBN']], s=100, facecolor='C0', edgecolor='k')
    panel.set_ylabel(neighbour['Book-Title'])
    panel.set_title('User ratings')
    panel.set_xlabel(target_label)

# If there was more time
* Calculate a metric that enables comparison of different approaches
    * e.g. rating prediction (see http://files.grouplens.org/papers/www10_sarwar.pdf) and calculation of MAE
* Try other similarity metrics, e.g. Pearson correlation
* Find a better way to deal with implicit ratings
* Combine all LotR part 1 books or all LotR books in order to get more data
* Back the threshold coss_book_target['num_of_ratings']>15 with some analysis, to make sure its value yields statistically significant results
* Ratings prediction
* Rewrite the code in a cleaner way

# Model for any book
* Create a similarity matrix: rows and columns: books; values: cosine similarity between the corresponding books
    * It will probably be very time-consuming. It might make sense to compute the similarity matrix offline rather than in real time.