In [1]:
#importing pandas
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading dataset into a Pandas dataframe

df = pd.read_csv("books.csv", on_bad_lines = "skip")

In [3]:
#describing the dataset
df.describe()

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11123.0,11123.0,11123.0,11123.0,11123.0,11123.0
mean,21310.856963,3.934075,9759880000000.0,336.405556,17942.85,542.048099
std,13094.727252,0.350485,442975800000.0,241.152626,112499.2,2576.619589
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10277.5,3.77,9780345000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780582000000.0,299.0,745.0,47.0
75%,32104.5,4.14,9780872000000.0,416.0,5000.5,238.0
max,45641.0,5.0,9790008000000.0,6576.0,4597666.0,94265.0


In [4]:
df.head(2).transpose()

Unnamed: 0,0,1
bookID,1,2
title,Harry Potter and the Half-Blood Prince (Harry ...,Harry Potter and the Order of the Phoenix (Har...
authors,J.K. Rowling/Mary GrandPré,J.K. Rowling/Mary GrandPré
average_rating,4.57,4.49
isbn,0439785960,0439358078
isbn13,9780439785969,9780439358071
language_code,eng,eng
num_pages,652,870
ratings_count,2095690,2153167
text_reviews_count,27591,29221


In [5]:
#finding highest rated books
top5Recommendations = df.sort_values(by = 'average_rating',
                                     ascending = False).head(5)
top5Recommendations

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
624,2034,Comoediae 1: Acharenses/Equites/Nubes/Vespae/P...,Aristophanes/F.W. Hall/W.M. Geldart,5.0,198145047,9780198145042,grc,364,0,0,2/22/1922,Oxford University Press USA
9893,39829,His Princess Devotional: A Royal Encounter Wit...,Sheri Rose Shepherd,5.0,1590529626,9781590529621,eng,240,2,0,10/16/2007,Multnomah
4788,17224,The Diamond Color Meditation: Color Pathway to...,John Diamond,5.0,1890995525,9781890995522,eng,74,5,3,2/1/2006,Square One Publishers
9324,36853,Tyrannosaurus Wrecks (Stanley #1),Laura Driscoll/Alisa Klayman-Grodsky/Eric ...,5.0,786845031,9780786845033,eng,24,2,1,2/1/2003,Disney Press
9720,38804,The Irish Anatomist: A Study of Flann O'Brien,Keith Donohue,5.0,1930901356,9781930901353,eng,222,1,0,7/25/2003,Academica Press


In [6]:
#finding highest weighted ranked (by ratings count)books in dataset
top5Votes = df.sort_values(by = 'ratings_count',ascending = False).head(5)
top5Votes

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
10336,41865,Twilight (Twilight #1),Stephenie Meyer,3.59,0316015849,9780316015844,eng,501,4597666,94265,9/6/2006,Little Brown and Company
1697,5907,The Hobbit or There and Back Again,J.R.R. Tolkien,4.27,0618260307,9780618260300,eng,366,2530894,32871,8/15/2002,Houghton Mifflin
1462,5107,The Catcher in the Rye,J.D. Salinger,3.8,0316769177,9780316769174,eng,277,2457092,43499,1/30/2001,Back Bay Books
307,960,Angels & Demons (Robert Langdon #1),Dan Brown,3.89,1416524797,9781416524793,eng,736,2418736,21303,4/1/2006,Pocket Books
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.


In [7]:
#finding top 5 books based on text reviews
top5textreviews = df.sort_values(by = 'text_reviews_count',ascending = False).head(5)
top5textreviews

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
10336,41865,Twilight (Twilight #1),Stephenie Meyer,3.59,316015849,9780316015844,eng,501,4597666,94265,9/6/2006,Little Brown and Company
5270,19063,The Book Thief,Markus Zusak/Cao Xuân Việt Khương,4.37,375831002,9780375831003,eng,552,1516367,86881,3/14/2006,Alfred A. Knopf
1069,3636,The Giver (The Giver #1),Lois Lowry,4.13,385732554,9780385732550,eng,208,1585589,56604,1/24/2006,Ember
284,865,The Alchemist,Paulo Coelho/Alan R. Clarke/Özdemir İnce,3.86,61122416,9780061122415,eng,197,1631221,55843,5/1/1993,HarperCollins
10700,43641,Water for Elephants,Sara Gruen,4.09,1565125606,9781565125605,eng,335,1260027,52759,5/1/2007,Algonquin Books


In [8]:
#listing columns
df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [9]:
#creating a function named Popularity Recommender
def popularityRecommender(df):
    
    #Define the minimum ratings count
    minimum_ratings_count = 0.85* df['ratings_count'].max()
    
    #Define the minimum text reviews count
    minimum_text_count = 0.85* df['text_reviews_count'].max()
    
    
    #the mean rating
    mean_rating = df['average_rating'].mean()

    #both 'ratings_count' and 'text_reviews_count' has been used towards calculating 'weighted_rating'
    df['weighted_rating'] = (((df['ratings_count']/(df['ratings_count']+minimum_ratings_count)) * df['average_rating']) +
                             ((minimum_ratings_count/(df['ratings_count']+minimum_ratings_count))*mean_rating) + (df['text_reviews_count']/(df['text_reviews_count']+minimum_text_count)) * df['average_rating']) + ((minimum_ratings_count/(df['text_reviews_count']+minimum_text_count))*mean_rating)

    recommendations = df.sort_values(by = 'weighted_rating',ascending = False).head(5)
    
    return(recommendations) 

In [10]:
#getting top 5 recommended books
top5 = popularityRecommender(df)
top5[["title",'ratings_count','text_reviews_count','average_rating','weighted_rating',]].head(5)

Unnamed: 0,title,ratings_count,text_reviews_count,average_rating,weighted_rating
5392,A Guide to the Words of My Perfect Teacher,79,0,4.61,195.814049
1720,混血王子的背叛 (哈利波特 #6),75,0,4.57,195.814048
9135,The Beatles Complete - Updated Edition,64,0,4.47,195.814044
6257,The J.R.R. Tolkien Companion and Guide,45,0,4.64,195.814043
9133,The Beatles' Story on Capitol Records Part Tw...,29,0,4.72,195.814041


In [11]:
#CONTENT BASED RECOMMENDER

from sklearn.feature_extraction.text import TfidfVectorizer
cbr = TfidfVectorizer(stop_words = 'english')

# Replace empty descriptions with a blank "" value and transform the titles of books in our dataset into the matrix
df['title'] = df['title'].fillna('')
tfidf_matrix = cbr.fit_transform(df['title'])

tfidf_matrix.shape

(11123, 11090)

In [12]:
df['title'][0]

'Harry Potter and the Half-Blood Prince (Harry Potter  #6)'

In [13]:
# Assign the instance of our recommender function.
# This is a matrix with a similarity value for every book with every other book in the dataset

from sklearn.metrics.pairwise import cosine_similarity
distance_matrix = cosine_similarity(tfidf_matrix)

# Re-create the indices of our list of books by removing any duplicates if required
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [14]:
# Define a function that takes the re-indexed dataset, finds the 6 most similar titles 
#to a chosen title based on the
# similarity of the words in the titles,
# and returns the top 5, 

def ContentBasedRecommender(title, indices, distance_matrix):
    id_ = indices[title] #Fetch the index of the books we will enter
    
    #List of tuples with distance for each book to the entered book
    distances = list(enumerate(distance_matrix[id_])) 
    
    #sort by the distance function, which is in column[1]
    distances = sorted(distances, key=lambda x: x[1], reverse = True) 
    
    distances = distances[1:6] # Get the 5 best scores , not including itself
    print(distances)
    
    # get the indices of the top 5
    recommendations = [distance[0] for distance in distances] 
    
    # return those recommendation names by pulling title from the given 5 indices
    return df['title'].iloc[recommendations] 

In [15]:

import numpy as np

random_books_index = np.random.randint(low=0, high=len(df), size=1).astype(int)
print(random_books_index[0])

title = df["title"][random_books_index[0]]
print(title)

ContentBasedRecommender(title, indices, distance_matrix)



8863
Whitethorn Woods
[(5069, 0.6236460825554155), (1015, 0.4409843740353598), (10661, 0.3991236034623825), (2303, 0.39843135150516346), (2291, 0.3461940955434848)]


5069                  Out of the Woods
1015          In the Lake of the Woods
10661    Into the Woods (De Beers  #4)
2303        Christmas in the Big Woods
2291      Winter Days in the Big Woods
Name: title, dtype: object