In [2]:
#importing the required libraries
#Data wrangling
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth",1000)#setting maximum column width
#data visualization
import seaborn as sns
import json
sns.set_style('white')
import matplotlib.pyplot as plt
#To create wordcloud
plt.rcParams["figure.figsize"] = (8,8)

In [3]:
#loading the required datasets
books_df = pd.read_json('./data/clean_book_data.jl', lines=True)

# Open the JSON file and read each line
records = []
with open('./data/clean_rate_data.jl', 'r') as file:
    for line in file:
        # Load the JSON data from each line
        record = json.loads(line)

        # Append the record to the list
        records.append(record)

rating_df = pd.DataFrame(records)

## **Merging Datasets**

In [4]:
books_df.drop(['description'], axis=1, inplace = True)

In [5]:
books_df.head(5)

Unnamed: 0,url,title,genres,author,publishYear,ratingHistogram,avgRating,ratingsCount,reviewsCount,numPages,language
0,https://www.goodreads.com/book/show/2165.The_Old_Man_and_the_Sea,The Old Man and the Sea,"['American', 'Classic Literature', 'Novels', '20th Century', 'Fiction', 'Literature', 'School', 'Literary Fiction', 'Classics', 'Adventure']\nCategories (10, object): ['20th Century', 'Adventure', 'American', 'Classic Literature', ..., 'Literary Fiction', 'Literature', 'Novels', 'School']",Ernest Hemingway,1996,"[47629, 91400, 253242, 362587, 355869]",3.8,1110727,37875,96,English
1,https://www.goodreads.com/book/show/10507293-the-selection,The Selection,"['Fantasy', 'Teen', 'Fiction', 'Science Fiction', 'Young Adult', 'Young Adult Fantasy', 'Chick Lit', 'Romance', 'Dystopia', 'Audiobook']\nCategories (10, object): ['Audiobook', 'Chick Lit', 'Dystopia', 'Fantasy', ..., 'Science Fiction', 'Teen', 'Young Adult', 'Young Adult Fantasy']",Kiera Cass,2012,"[41180, 78962, 255902, 450022, 662382]",4.08,1488448,76015,336,English
2,https://www.goodreads.com/book/show/5148.A_Separate_Peace,A Separate Peace,"['Historical Fiction', 'Novels', 'Fiction', 'High School', 'Young Adult', 'School', 'Literature', 'Read For School', 'Classics', 'Coming Of Age']\nCategories (10, object): ['Classics', 'Coming Of Age', 'Fiction', 'High School', ..., 'Novels', 'Read For School', 'School', 'Young Adult']",John Knowles,2003,"[10462, 23486, 60130, 71889, 50231]",3.59,216198,9328,208,English
3,https://www.goodreads.com/book/show/7747374-i-am-number-four,I Am Number Four,"['Fantasy', 'Teen', 'Fiction', 'Science Fiction', 'Young Adult', 'Paranormal', 'Aliens', 'Romance', 'Dystopia', 'Adventure']\nCategories (10, object): ['Adventure', 'Aliens', 'Dystopia', 'Fantasy', ..., 'Romance', 'Science Fiction', 'Teen', 'Young Adult']",Pittacus Lore,2010,"[8478, 22247, 73239, 113790, 121216]",3.94,338970,16820,440,English
4,https://www.goodreads.com/book/show/162898.A_Connecticut_Yankee_in_King_Arthur_s_Court,A Connecticut Yankee in King Arthur's Court,"['Historical Fiction', 'Fantasy', 'Fiction', 'Science Fiction', 'Humor', 'Literature', 'Time Travel', 'Classics', 'Arthurian', 'Adventure']\nCategories (10, object): ['Adventure', 'Arthurian', 'Classics', 'Fantasy', ..., 'Humor', 'Literature', 'Science Fiction', 'Time Travel']",Mark Twain,2007,"[2641, 7501, 27761, 37493, 27231]",3.77,102627,3829,480,English


In [6]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443077 entries, 0 to 443076
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   book_url     443077 non-null  object
 1   title        443077 non-null  object
 2   user         443077 non-null  object
 3   user_rate    443077 non-null  int64 
 4   user_review  443077 non-null  object
dtypes: int64(1), object(4)
memory usage: 16.9+ MB


In [7]:
rating_df.drop(['title'], axis=1, inplace = True)
rating_df.rename(columns={'book_url': 'url'}, inplace=True)

In [8]:
# Assuming books_df and rating_df are defined and loaded with data
df = pd.merge(books_df, rating_df, on='url', how='inner')

In [9]:
# factorize book_id
df['book_id'], _ = pd.factorize(df['url'])

# Drop the original columns
df = df.drop(['url'], axis=1)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415679 entries, 0 to 415678
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   title            415679 non-null  object 
 1   genres           415679 non-null  object 
 2   author           415679 non-null  object 
 3   publishYear      415679 non-null  int64  
 4   ratingHistogram  415679 non-null  object 
 5   avgRating        415679 non-null  float64
 6   ratingsCount     415679 non-null  int64  
 7   reviewsCount     415679 non-null  int64  
 8   numPages         415679 non-null  int64  
 9   language         415679 non-null  object 
 10  user             415679 non-null  object 
 11  user_rate        415679 non-null  int64  
 12  user_review      415679 non-null  object 
 13  book_id          415679 non-null  int64  
dtypes: float64(1), int64(6), object(7)
memory usage: 44.4+ MB


## **Memory Based Approach**

### **KNN Based Algorithm**

In [11]:
# we kept running into memory issues, so decided to reduce some data by filtering things out
# focussing on users with more than 3 ratings and top 10% most frequently rated books
required_ratings = 3

user = df['user'].value_counts()
user_list = user[user >required_ratings].index.to_list()
filter_df = df[df['user'].isin(user_list)]

print('Number of users with ratings more than 3 are: {}'.format(filter_df.shape[0]))

Number of users with ratings more than 3 are: 250473


In [12]:
filter_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250473 entries, 1 to 415678
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   title            250473 non-null  object 
 1   genres           250473 non-null  object 
 2   author           250473 non-null  object 
 3   publishYear      250473 non-null  int64  
 4   ratingHistogram  250473 non-null  object 
 5   avgRating        250473 non-null  float64
 6   ratingsCount     250473 non-null  int64  
 7   reviewsCount     250473 non-null  int64  
 8   numPages         250473 non-null  int64  
 9   language         250473 non-null  object 
 10  user             250473 non-null  object 
 11  user_rate        250473 non-null  int64  
 12  user_review      250473 non-null  object 
 13  book_id          250473 non-null  int64  
dtypes: float64(1), int64(6), object(7)
memory usage: 28.7+ MB


In [13]:
# creating a pivot table
table = filter_df.pivot_table(columns='user', index='title', values='user_rate')
table

user,1-otis-chandler,1000163-laura,10001905-lynn,100019622-vonda,10007258,100078172-nad-gandia,1000903-james,10010139-andy-kornylo,100118640-dwayne,10014356-virginie-roy,...,99845669-s-van-sardar,99853253-william,9986049-sully-sully-reads,9986918-eric,9989803-patricia,999171-anita,99920177-janet-roger,999233-laura,9992977-allison,9998105-stephanie
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Look Here, Sir, What a Curious Bird"": Searching for Ali, Alfred Russel Wallace's Faithful Companion",,,,,,,,,,,...,,,,,,,,,,
"""Surely You're Joking, Mr. Feynman!"": Adventures of a Curious Character",5.0,,,,,,,,,,...,,,,,,,,,,
#Girlboss,,,,,,,,,,,...,,,,,,,,,,
'Til the Well Runs Dry,,,,,,,,,,,...,,,,,,,,,,
'Tis,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
コンビニ人間 [Konbini ningen],,,,,,,,,,,...,,,,,,,,,,
"ヒストリエ 1 [Historie, Vol. 1]",,,,,,,,,,,...,,,,,,,,,,
天官赐福 [Tiān Guān Cì Fú],,,,,,,,,,,...,,,,,,,,,,
美少女戦士セーラームーン新装版 1,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# filling null values
table.fillna(0, inplace=True)
table

user,1-otis-chandler,1000163-laura,10001905-lynn,100019622-vonda,10007258,100078172-nad-gandia,1000903-james,10010139-andy-kornylo,100118640-dwayne,10014356-virginie-roy,...,99845669-s-van-sardar,99853253-william,9986049-sully-sully-reads,9986918-eric,9989803-patricia,999171-anita,99920177-janet-roger,999233-laura,9992977-allison,9998105-stephanie
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Look Here, Sir, What a Curious Bird"": Searching for Ali, Alfred Russel Wallace's Faithful Companion",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""Surely You're Joking, Mr. Feynman!"": Adventures of a Curious Character",5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#Girlboss,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til the Well Runs Dry,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Tis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
コンビニ人間 [Konbini ningen],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"ヒストリエ 1 [Historie, Vol. 1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
天官赐福 [Tiān Guān Cì Fú],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
美少女戦士セーラームーン新装版 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# converting to sparse matrix
from scipy.sparse import csr_matrix

sparse = csr_matrix(table)
sparse

<15520x16876 sparse matrix of type '<class 'numpy.float64'>'
	with 247979 stored elements in Compressed Sparse Row format>

In [16]:
#Creating an instance of KNN
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(algorithm='brute')
model.fit(sparse)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# function to get recommendations

def get_recommendations(name, n):

  # getting book id
  book_id= np.where(table.index==name)[0][0]

  # distances and suggestions based on similarity
  distances, suggestions = model.kneighbors(table.iloc[book_id, :].values.reshape(1,-1),n_neighbors=n+1)
  suggestions = suggestions.flatten().tolist()

  for i in range(len(suggestions)):
    # if book is same as input
    if i==0:
      print('The top {} Recommended books for {} are:\n'.format(n, name))
    else:
      print(table.index[suggestions[i]])
  return

In [None]:
get_recommendations('The Selection', 10)

The top 10 Recommended books for The Selection are:

Songs of My Selfie: An Anthology of Millennial Stories
Loudmouth
Devil's Prize
Dreams from Nepal: The Emotional Story of a Twelve-Year-Old Nepali Boy
Ghost in the Net
Chanters Chase
Orchard of Dust
I'm in Love with a Stripper
The Monarch
Daisy Buchanan's Daughter


### **KNN with cosine metric**

In [None]:
#Creating an instance of KNN with cosine metric

model_cosine = NearestNeighbors(metric='cosine', algorithm='brute')
model_cosine.fit(sparse)

In [None]:
# function to get recommendations

def get_cosine_recommendations(name, n):

  print('Cosine Similarity based recommendations.\n')

  # distances and indices based on similarity
  distances, indices = model_cosine.kneighbors(table.loc[name].values.reshape(1, -1), n_neighbors = n+1)

  for i in range(len(distances.flatten())):
    if i==0:
      print('The top {} Recommended books for {} are:\n'.format(n, name))
    else:
      print(table.index[indices.flatten()[i]])
  return

In [None]:
get_cosine_recommendations('The Selection', 10)

Cosine Similarity based recommendations.

The top 10 Recommended books for The Selection are:

The One
The American Roommate Experiment
It Starts with Us
Icebreaker
Eliza and Her Monsters
The Spanish Love Deception
Daisy Jones & The Six
Siege and Storm
Songs of My Selfie: An Anthology of Millennial Stories
Queen of Shadows


## **Model Based Approach**


---


##  **Singular Value Decomposition**

In [None]:
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
import random
import math

###**Filtering the number of books and users**

In [None]:
#Filtering books with more than 5 reviews

print('The number of books that are explicitely rated are',filter_df['book_id'].nunique())
ratings_count_df=filter_df.groupby("book_id")['user'].count().to_frame('No-of-rated-users').reset_index()
selected_books =list(ratings_count_df.loc[ratings_count_df['No-of-rated-users']>5,'book_id'].unique())
print('Number of  books rated by atleast 5 users:',len(selected_books))
filter_df=filter_df.loc[filter_df['book_id'].isin(selected_books)]

The number of books that are explicitely rated are 15520
Number of  books rated by atleast 5 users: 12862


In [None]:
#keeping books with selected users
print('The number of users who have explicitely rated books are',filter_df['user'].nunique())

#keeps Users who have rated more than five books
books_count_df = filter_df.groupby("user")['book_id'].count().to_frame('No-of-books-rated').reset_index()
selected_users = list(books_count_df.loc[books_count_df['No-of-books-rated']>5,'user'].unique())
print('Number of  users who have rated atleast 5 books are :',len(selected_users))

#dataframe with filtered number of interactions
filter_df = filter_df.loc[filter_df['user'].isin(selected_users)]
print('The shape of data fame with filtered number of interactions : ', filter_df.shape)

The number of users who have explicitely rated books are 16837
Number of  users who have rated atleast 5 books are : 9759
The shape of data fame with filtered number of interactions :  (214137, 14)


In [None]:
complete_df = filter_df[['book_id', 'user', 'user_rate']].copy()


complete_df['user_rate'].describe()

count    214137.000000
mean          3.916301
std           1.042282
min           1.000000
25%           3.000000
50%           4.000000
75%           5.000000
max           5.000000
Name: user_rate, dtype: float64

In [None]:
complete_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214137 entries, 1 to 415678
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   book_id    214137 non-null  int64 
 1   user       214137 non-null  object
 2   user_rate  214137 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 6.5+ MB


In [None]:
def smooth_user_preference(x):
    '''Function to smooth column'''
    return math.log(1+x, 2)
#applying function
# complete_df['user_rate']= complete_df['user_rate'].apply(smooth_user_preference)
complete_df.head()

Unnamed: 0,book_id,user,user_rate
1,0,4074594-kiki,3
2,0,2745288-wendy-darling,1
3,0,32879029-emma,1
4,0,4622890-emily-may,1
5,0,71848701-miranda-reads,3


In [None]:
train_df, test_df = train_test_split(complete_df,
                                   stratify=complete_df['user'],
                                   test_size=0.20,
                                   random_state=0)


print('# interactions on Train set: %d' % len(train_df))
print('# interactions on Test set: %d' % len(test_df))

# interactions on Train set: 171309
# interactions on Test set: 42828


In [None]:
#displaying the first 5 rows of test set
test_df.head()

Unnamed: 0,book_id,user,user_rate
289445,11046,16690007-rosa-dracos99,4
93155,3210,11971939-jamie,5
284332,10840,1646501-bookaddict,4
333441,12878,4828849-michael-burnam-fink,3
414921,16337,14100063-cheryl-james,5


In [None]:
#Creating a sparse pivot table with users in rows and ISBN number of books in columns
users_books_pivot_matrix_df = train_df.pivot_table(index='user',
                                                   columns='book_id',
                                                   values='user_rate',
                                                   aggfunc='mean'
                                                   ).fillna(0)

users_books_pivot_matrix_df.head()

book_id,0,1,2,3,4,5,6,7,8,9,...,16355,16357,16358,16359,16360,16361,16362,16363,16365,16366
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-otis-chandler,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100019622-vonda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100078172-nad-gandia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100118640-dwayne,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10014356-virginie-roy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Creating a matrix with the values of users_books_pivot_matrix_df
original_ratings_matrix = users_books_pivot_matrix_df.values
original_ratings_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
#Storing the User-IDs in a list
user_ids = list(users_books_pivot_matrix_df.index)
user_ids[:10]

['1-otis-chandler',
 '100019622-vonda',
 '100078172-nad-gandia',
 '100118640-dwayne',
 '10014356-virginie-roy',
 '10017183-em',
 '100210843-marisa-mu-oz',
 '1002421-danae',
 '10025094-linda-smith',
 '10031062-krista']

In [None]:
# The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 20

#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(original_ratings_matrix, k = NUMBER_OF_FACTORS_MF)

In [None]:
#converting sigma to a diagonal matrix
sigma = np.diag(sigma)

 After the factorization, we try to to reconstruct the original matrix by multiplying its factors. The resulting matrix is not sparse any more. It has generated rating predictions for books with which users have not yet interacted (and therefore not rated), which we will use to recommend relevant books to the user.

In [None]:
#Rating matric reconstructed using the matrices obtained after factorizing
predicted_ratings_matrix = np.dot(np.dot(U, sigma), Vt)
predicted_ratings_matrix

array([[ 6.39203545e-04,  2.75616754e-02,  2.51610960e-04, ...,
        -7.55478264e-04,  5.73179035e-04, -4.32106726e-03],
       [ 4.35518803e-03, -8.77660686e-03, -5.02172131e-03, ...,
        -1.31894278e-03,  2.32367345e-04, -1.48103647e-02],
       [-1.98849445e-03,  6.28825977e-03,  3.50001900e-03, ...,
         4.32531233e-05, -3.01017209e-06,  9.07207324e-03],
       ...,
       [-1.67166362e-04,  1.54103390e-02,  1.73378242e-02, ...,
         8.42377344e-04,  2.92987649e-04, -1.82182816e-03],
       [ 9.05032000e-06, -1.72374335e-04,  5.66147596e-05, ...,
         7.02469561e-05,  9.37619802e-06, -2.29016753e-05],
       [-9.94783190e-04, -3.20409427e-03,  1.85346532e-03, ...,
         6.41996529e-04,  1.11066780e-04, -1.23193488e-03]])

In [None]:
#Converting the reconstructed matrix back to a Pandas dataframe
predicted_ratings_df = pd.DataFrame(predicted_ratings_matrix , columns = users_books_pivot_matrix_df.columns, index=user_ids).transpose()
predicted_ratings_df.head()

Unnamed: 0_level_0,1-otis-chandler,100019622-vonda,100078172-nad-gandia,100118640-dwayne,10014356-virginie-roy,10017183-em,100210843-marisa-mu-oz,1002421-danae,10025094-linda-smith,10031062-krista,...,9972516-angela-risner,99747-chris,99765491-jimz,99766625-l,9978483-matthew,9984116-neda,99853253-william,9986049-sully-sully-reads,9989803-patricia,999233-laura
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000639,0.004355,-0.001988,-0.000443,-0.000926,-0.002165,-0.000334,-0.001499,0.000276,0.000202,...,0.001093,9e-05,-0.003405,3.2e-05,-6.5e-05,-0.003459,0.003351,-0.000167,9e-06,-0.000995
1,0.027562,-0.008777,0.006288,0.009549,0.010584,0.006142,0.002051,-0.004017,-0.006504,-0.001222,...,-0.001918,-0.004359,0.09075,-0.005861,0.000531,0.000593,-0.014845,0.01541,-0.000172,-0.003204
2,0.000252,-0.005022,0.0035,-0.004377,0.003532,0.000823,0.000171,0.000381,-0.000592,-0.001537,...,0.000448,-0.001474,-0.009437,-0.000621,0.002477,0.008959,-0.007434,0.017338,5.7e-05,0.001853
3,0.029927,0.000503,0.007066,0.002237,-0.010891,-0.008032,0.001476,0.001254,0.008103,0.001069,...,0.002313,0.00601,-0.009389,0.009225,0.008564,0.004883,0.044652,-0.005922,0.00013,0.001991
4,0.076736,0.012195,0.001279,0.034689,0.008785,-0.002277,-0.002357,0.015254,0.008659,0.009657,...,0.002937,0.006593,0.051705,0.000637,0.00839,0.016754,0.016307,-0.010095,-2.6e-05,0.003812


In [None]:
filter_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214137 entries, 1 to 415678
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   title            214137 non-null  object 
 1   genres           214137 non-null  object 
 2   author           214137 non-null  object 
 3   publishYear      214137 non-null  int64  
 4   ratingHistogram  214137 non-null  object 
 5   avgRating        214137 non-null  float64
 6   ratingsCount     214137 non-null  int64  
 7   reviewsCount     214137 non-null  int64  
 8   numPages         214137 non-null  int64  
 9   language         214137 non-null  object 
 10  user             214137 non-null  object 
 11  user_rate        214137 non-null  int64  
 12  user_review      214137 non-null  object 
 13  book_id          214137 non-null  int64  
dtypes: float64(1), int64(6), object(7)
memory usage: 32.6+ MB


In [None]:
df_no_duplicates = filter_df.drop_duplicates(subset=['book_id'])

In [None]:
class CFRecommender:
    #Storing model name
    MODEL_NAME = 'Collaborative Filtering'

    def __init__(self, cf_predictions_df, items_df=None):
        #Creating attributes
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def get_model_name(self):
        '''This will return model name'''
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False).reset_index().rename(columns={user_id: 'user_rate'})

        # Recommend the highest predicted rating content that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['book_id'].isin(items_to_ignore)].sort_values('user_rate', ascending = False).head(topn)

        if verbose:
            #runs only if verbose=True
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')
            #Merging
            recommendations_df = recommendations_df.merge(self.items_df, how = 'left',
                                                          left_on = 'book_id',
                                                          right_on = 'book_id')[['book_id','title',	'author','publishYear']]

        return recommendations_df

#Creating object of the class
cf_recommender_model = CFRecommender(predicted_ratings_df, df_no_duplicates)

In [None]:
def get_items_interacted(person_id, interactions_df):
    '''
    This function will take user id as input and return interacted items
    '''
    interacted_items = interactions_df.loc[person_id]['book_id']
    #Repetation is avoided by taking set
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [None]:
#Indexing by personId to speed up the searches during evaluation
full_indexed_df =complete_df.set_index('user')
train_indexed_df = train_df.set_index('user')
test_indexed_df = test_df.set_index('user')

Let's predict the relevant books for an user.Before that,let's see the list of books that were already rated/purchased by this user.

In [None]:
#The books that were already rated by this user
print(f"These are  books that the user {complete_df['user'][3]} has already rated \n")
df_no_duplicates.loc[df_no_duplicates['book_id'].isin(list(get_items_interacted(complete_df['user'][3],train_indexed_df)))]['title']

These are  books that the user 32879029-emma has already rated 



149             Mansfield Park
177            The Hate U Give
206           A Man Called Ove
383             Rule of Wolves
849                  Renegades
                  ...         
362426          Saga, Volume 3
364150    The Bungalow Mystery
379878    The Box in the Woods
396451             You Love Me
412024        Good Rich People
Name: title, Length: 367, dtype: object

In [None]:
#Recommendation for a single user
print(f"Recommending books for User ID: {complete_df['user'][9]} ")
cf_recommender_model.recommend_items(complete_df['user'][9],items_to_ignore= get_items_interacted(complete_df['user'][9],train_indexed_df),verbose=True)

Recommending books for User ID: 31027440-lala-booksandlala 


Unnamed: 0,book_id,title,author,publishYear
0,13513,Little Secrets,Jennifer Hillier,2020
1,6,The Hate U Give,Angie Thomas,2017
2,1517,The Kiss Quotient,Helen Hoang,2018
3,1607,Sadie,Courtney Summers,2018
4,14110,Jar of Hearts,Jennifer Hillier,2018
5,13528,Rust & Stardust,T. Greenwood,2018
6,2158,The Dry,Jane Harper,2016
7,1778,History Is All You Left Me,Adam Silvera,2017
8,2641,Recursion,Blake Crouch,2019
9,555,The Cruel Prince,Holly Black,2018
