In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Goodreads export (comma-separated, which is pandas' default
# delimiter) and preview the first rows.
df = pd.read_csv("../dataset/goodreads_dataset.csv")
df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780000000000.0,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780000000000.0,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780000000000.0,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,43965548,9780000000000.0,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,439682584,9780000000000.0,eng,2690,41428,164,9/13/2004,Scholastic


## First, we check whether the dataset contains any missing values.

In [3]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11127 non-null  int64  
 1   title               11127 non-null  object 
 2   authors             11127 non-null  object 
 3   average_rating      11127 non-null  float64
 4   isbn                11127 non-null  object 
 5   isbn13              11127 non-null  float64
 6   language_code       11127 non-null  object 
 7   num_pages           11127 non-null  int64  
 8   ratings_count       11127 non-null  int64  
 9   text_reviews_count  11127 non-null  int64  
 10  publication_date    11127 non-null  object 
 11  publisher           11127 non-null  object 
dtypes: float64(2), int64(4), object(6)
memory usage: 1.0+ MB


## Some books list several authors or publishers in a single `/`-separated string. We split each of these fields into a list.

In [4]:
def _split_names(value):
    """Split a '/'-separated string into a list of names.

    Values that are not strings (for example a list produced by an earlier
    run of this cell) are returned unchanged, so the cell can be re-run
    without raising ``AttributeError``.
    """
    return value.split('/') if isinstance(value, str) else value


# Turn the '/'-separated author and publisher strings into lists.
df['authors'] = df['authors'].apply(_split_names)
df['publisher'] = df['publisher'].apply(_split_names)

In [5]:
# Preview the first 20 rows to confirm authors/publisher now hold lists.
df.head(20)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,"[J.K. Rowling, Mary GrandPré]",4.57,439785960,9780000000000.0,eng,652,2095690,27591,9/16/2006,[Scholastic Inc.]
1,2,Harry Potter and the Order of the Phoenix (Har...,"[J.K. Rowling, Mary GrandPré]",4.49,439358078,9780000000000.0,eng,870,2153167,29221,9/1/2004,[Scholastic Inc.]
2,4,Harry Potter and the Chamber of Secrets (Harry...,[J.K. Rowling],4.42,439554896,9780000000000.0,eng,352,6333,244,11/1/2003,[Scholastic]
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,"[J.K. Rowling, Mary GrandPré]",4.56,43965548,9780000000000.0,eng,435,2339585,36325,5/1/2004,[Scholastic Inc.]
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,"[J.K. Rowling, Mary GrandPré]",4.78,439682584,9780000000000.0,eng,2690,41428,164,9/13/2004,[Scholastic]
5,9,"Unauthorized Harry Potter Book Seven News: ""Ha...",[W. Frederick Zimmerman],3.74,976540606,9780000000000.0,en-US,152,19,1,4/26/2005,[Nimble Books]
6,10,Harry Potter Collection (Harry Potter #1-6),[J.K. Rowling],4.73,439827604,9780000000000.0,eng,3342,28242,808,9/12/2005,[Scholastic]
7,12,The Ultimate Hitchhiker's Guide: Five Complete...,[Douglas Adams],4.38,517226952,9780000000000.0,eng,815,3628,254,11/1/2005,[Gramercy Books]
8,13,The Ultimate Hitchhiker's Guide to the Galaxy ...,[Douglas Adams],4.38,345453743,9780000000000.0,eng,815,249558,4080,4/30/2002,[Del Rey Books]
9,14,The Hitchhiker's Guide to the Galaxy (Hitchhik...,[Douglas Adams],4.22,1400052920,9780000000000.0,eng,215,4930,460,8/3/2004,[Crown]


## Some books in the dataset are not in English. For this project we keep only the English-language books and discard the rest.

In [6]:
# Language codes that count as English; rows with any other code are dropped.
englishBook = ["en-US", "en-GB", "eng", "en-CA"]

# Keep the English rows and collapse their codes into one 'english' label.
dataBookEng = (
    df.loc[df['language_code'].isin(englishBook)]
    .assign(language_code='english')
)
dataBookEng['language_code'].value_counts()

english    10541
Name: language_code, dtype: int64

In [7]:
# Remove every column that is not used as a feature, leaving title and authors.
booksEng = dataBookEng.drop(
    columns=[
        'bookID',
        'isbn',
        'publisher',
        'isbn13',
        'num_pages',
        'ratings_count',
        'average_rating',
        'text_reviews_count',
        'publication_date',
        'language_code',
    ]
)
booksEng.head()

Unnamed: 0,title,authors
0,Harry Potter and the Half-Blood Prince (Harry ...,"[J.K. Rowling, Mary GrandPré]"
1,Harry Potter and the Order of the Phoenix (Har...,"[J.K. Rowling, Mary GrandPré]"
2,Harry Potter and the Chamber of Secrets (Harry...,[J.K. Rowling]
3,Harry Potter and the Prisoner of Azkaban (Harr...,"[J.K. Rowling, Mary GrandPré]"
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,"[J.K. Rowling, Mary GrandPré]"


In [8]:
# Separate working copy of the two feature columns; later column
# reassignments on `feature_df` do not alter `booksEng`.
feature_df = booksEng.loc[:, ['title', 'authors']].copy()
feature_df.head()

Unnamed: 0,title,authors
0,Harry Potter and the Half-Blood Prince (Harry ...,"[J.K. Rowling, Mary GrandPré]"
1,Harry Potter and the Order of the Phoenix (Har...,"[J.K. Rowling, Mary GrandPré]"
2,Harry Potter and the Chamber of Secrets (Harry...,[J.K. Rowling]
3,Harry Potter and the Prisoner of Azkaban (Harr...,"[J.K. Rowling, Mary GrandPré]"
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,"[J.K. Rowling, Mary GrandPré]"


## Now we remove the spaces from the author names and convert them to lowercase.

In [9]:
def elimination(x):
    """Normalise one or more names by removing spaces and lower-casing.

    Parameters
    ----------
    x : str or list of str
        A single name, or a list of names.

    Returns
    -------
    list of str
        One normalised string per input name. When ``x`` is not a string or
        a list of strings, the offending value is printed and an empty list
        is returned, so the result is always a list.
    """
    names = x if isinstance(x, list) else [x]
    try:
        return [name.replace(' ', '').lower() for name in names]
    except (AttributeError, TypeError):
        # Non-string entries (e.g. NaN or None) have no usable .replace().
        # Report the value as before, but return a list instead of an
        # implicit None, and do not swallow unrelated exceptions.
        print(x)
        return []
        
# Columns holding name lists that should be normalised with `elimination`.
feature_cols = ['authors']

for col in feature_cols:
    # Element-wise: each cell's list of names is replaced by its cleaned form.
    feature_df[col] = feature_df[col].map(elimination)

In [10]:
# Preview the frame to confirm the author names were normalised.
feature_df.head()

Unnamed: 0,title,authors
0,Harry Potter and the Half-Blood Prince (Harry ...,"[j.k.rowling, marygrandpré]"
1,Harry Potter and the Order of the Phoenix (Har...,"[j.k.rowling, marygrandpré]"
2,Harry Potter and the Chamber of Secrets (Harry...,[j.k.rowling]
3,Harry Potter and the Prisoner of Azkaban (Harr...,"[j.k.rowling, marygrandpré]"
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,"[j.k.rowling, marygrandpré]"


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Bag-of-words counts over the book titles, skipping English stop words.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(feature_df['title'])

# Show the vectoriser settings, then the shape of the resulting count matrix.
print(count, count_matrix.shape, sep="\n")

CountVectorizer(stop_words='english')
(10541, 10190)


In [13]:
# Pairwise cosine similarity between every pair of title vectors. The result
# has one row and one column per book, so its size grows quadratically with
# the number of books.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Printing the array shows only NumPy's abbreviated corner view.
print(cosine_sim)

[[1.         0.76277007 0.76277007 ... 0.         0.         0.        ]
 [0.76277007 1.         0.8        ... 0.         0.         0.        ]
 [0.76277007 0.8        1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [14]:
# Titles in row order; a title's position in this list is the "id" that
# `get_id_from_keyword` prints.
books_name = feature_df['title'].tolist()
def get_id_from_keyword(keyword):
    """Print every title containing ``keyword`` together with its id.

    The id is the title's position in the module-level ``books_name`` list.
    Matching is a case-sensitive substring test.

    Parameters
    ----------
    keyword : str
        Text to look for inside each title.
    """
    # enumerate() gives each entry its own position. list.index() would
    # return the first occurrence only, repeating one id for duplicated
    # titles, and would rescan the list for every match.
    for position, name in enumerate(books_name):
        if keyword in name:
            print("Title: "+name + ",   id:" +str(position))

In [28]:
# Unique titles, still keyed by the row labels carried over from `feature_df`
# (labels, not 0..n-1 positions).
indices_id = feature_df['title'].drop_duplicates()
def content_recommender(book_id, top_n=10):
    """Return the books whose titles are most similar to the given book.

    Parameters
    ----------
    book_id : int or str
        Zero-based row position of the book in ``cosine_sim`` / ``booksEng``
        (the ``id`` printed by ``get_id_from_keyword``). Strings are
        converted with ``int()``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        Rows of ``booksEng`` for the most similar books, best match first, or
        ``None`` (after printing a message) when ``book_id`` is not a valid
        row position.
    """
    try:
        idx = int(book_id)
    except (TypeError, ValueError):
        idx = -1  # not a number: fall through to the "not found" message

    # Ids are row positions in `cosine_sim`. An `in` test against a pandas
    # Series would compare with its index labels instead, and those stop
    # matching positions once rows have been filtered out.
    if not 0 <= idx < len(cosine_sim):
        print("Sorry! No Similar Books Found. Try Again")
        return None

    # Exclude the query book by position rather than assuming it sorts first:
    # another book with an identical title also scores 1.0 and could
    # otherwise take its place at the head of the ranking.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    book_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]
    print("-------------------Recommended Books------------------------")

    return booksEng.iloc[book_indices]

In [29]:
print("*"*20)

# Step 1: list every title containing the keyword, together with its id.
title = input("Enter Keyword...\n")
print("Here are all the books containing your keyword.\n")
print("-"*20)
get_id_from_keyword(title)

# Step 2: ask for the id of the intended book and show its recommendations.
x = input("Did you find the book you mean? [yes/no]\n")
if x.strip().lower() == 'yes':
    book_id = input("Input the Book ID:\n")
    recommendations = content_recommender(book_id)
    # content_recommender prints its own message and returns None when the id
    # is rejected; skip the print so a stray "None" is not shown.
    if recommendations is not None:
        print(recommendations)
else:
    print("Sorry! We Will Update Our Collection.")

********************
Enter Keyword...
Business
Here are all the books containing your keyword.

--------------------
Title: Power of an Hour: Business and Life Mastery in One Hour a Week,   id:75
Title: How to Buy  Sell & Profit on eBay: Kick-Start Your Home-Based Business in Just Thirty Days,   id:77
Title: Starting an eBay Business for Dummies,   id:80
Title: eBay Business All-in-One Desk Reference for Dummies,   id:83
Title: Monkey Business: True Story of the Scopes Trial,   id:327
Title: Junie B. Jones and a Little Monkey Business (Junie B. Jones  #2),   id:328
Title: Monkey Business,   id:329
Title: Sun Tzu and the Art of Business: Six Strategic Principles for Managers,   id:747
Title: The Innovator's Dilemma: The Revolutionary Book that Will Change the Way You Do Business,   id:751
Title: My Movie Business: A Memoir,   id:1877
Title: Trouble Is My Business,   id:2928
Title: What You Think of Me Is None of My Business,   id:3536
Title: Naked Conversations: How Blogs Are Changing t