In [1]:
import pandas as pd

books = pd.read_csv("book.csv",encoding = "ISO-8859-1")
print(books.shape) 
books.columns

(10000, 4)


Index(['Unnamed: 0', 'User.ID', 'Book.Title', 'Book.Rating'], dtype='object')

In [2]:
books.head()

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6


In [3]:
books1 = books.iloc[:,1:]

In [4]:
books1.columns = ["UserID", "Title", "BookRating"]

In [5]:
books1.head()

Unnamed: 0,UserID,Title,BookRating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# Creating a Tfidf Vectorizer to remove all stop words
tfidf = TfidfVectorizer(stop_words="english")    #taking stop words from tfid vectorizer 

In [7]:
# replacing the NaN values in overview column with
# empty string
books1["Title"].isnull().sum() 
books1["BookRating"].isnull().sum() 
# books1["Title"] = books1["Title"].fillna(" ")   # No null values so ignore

0

In [8]:
# Preparing the Tfidf matrix by fitting and transforming

tfidf_matrix = tfidf.fit_transform(books1.Title)   #Transform a count matrix to a normalized tf or tf-idf representation
print("Shape",tfidf_matrix.shape) 

Shape (10000, 11435)


In [9]:
from sklearn.metrics.pairwise import linear_kernel

# Computing the cosine similarity on Tfidf matrix
cosine_sim_matrix = linear_kernel(tfidf_matrix,tfidf_matrix) # cos_sim_matrix is obtained by matrix * matrix
print(cosine_sim_matrix)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [10]:
# books1['Title']= books1['Title'].str.lower()
# books1.Title.value_counts()

In [11]:
# books1.drop_duplicates(subset ="Title",keep='first',inplace = True)
# # books1.reset_index(drop=True,inplace = True)

In [12]:
books1['Title']= books1['Title'].str.lower()
books1_index = pd.Series(books1.index, index = books1['Title']).drop_duplicates()
print(books1_index)

Title
classical mythology                                                                                      0
clara callan                                                                                             1
decision in normandy                                                                                     2
flu: the story of the great influenza pandemic of 1918 and the search for the virus that caused it       3
the mummies of urumchi                                                                                   4
                                                                                                      ... 
american fried: adventures of a happy eater.                                                          9995
cannibal in manhattan                                                                                 9996
how to flirt: a practical guide                                                                       9997
twilight                       

In [13]:
# books1_index = pd.Series(books1.index, index = books1['Title'])
# books1_index.sort_values(ascending=True, inplace=True)
# print(books1_index)

In [14]:
def get_book_recommendations(BookName,topN):
   
    # Getting the Books index using its title 
    books1_id = books1_index[BookName] 
    # print(book_id)
    
     # Getting the pair wise similarity score for all the books with that of eneterd book
    cosine_scores = list(enumerate(cosine_sim_matrix[books1_id])) 
    # print(cosine_scores)
    
    # Sorting the cosine_similarity scores based on scores 
    cosine_scores = sorted(cosine_scores, key=lambda x:x[1], reverse = True)
    # print(cosine_scores)
    
    # Get the scores of topN most similar books 
    cosine_score_TopN = cosine_scores[0:topN]
    print(cosine_score_TopN)

    # Above will only give index number not the book name
    # Getting the book index 
    books1_idx = [i[0] for i in cosine_score_TopN]
    books1_scores = [i[1] for i in cosine_score_TopN]
    
    # Similar books and scores
    books1_similar_show = pd.DataFrame(columns=["Title","Score"])
    
    # using book index we fetch book name from books
    books1_similar_show["Title"] = books1.loc[books1_idx,"Title"] 
    books1_similar_show["Score"]= books1_scores
    books1_similar_show.reset_index(inplace=True) 
    books1_similar_show.drop(["index"],axis=1,inplace=True)
    return books1_similar_show
    

In [15]:
# Enter the book name or index number and number of books to be recommended 
Top_N_Books = get_book_recommendations("kids say the darndest things",10)
Top_N_Books

[(9999, 1.0), (8717, 0.31995947162024035), (8865, 0.31995947162024035), (2321, 0.2931774153982582), (136, 0.27242079808630437), (5474, 0.2676153960100704), (9736, 0.266961078456532), (9428, 0.2610743104335387), (1679, 0.2601987837049816), (5409, 0.2601854861261112)]


Unnamed: 0,Title,Score
0,kids say the darndest things,1.0
1,before i say goodbye,0.319959
2,say no to joe?,0.319959
3,how to say it style guide,0.293177
4,before i say good-bye,0.272421
5,everything cat: what kids really want to know ...,0.267615
6,things not seen,0.266961
7,dinkum oil; meanings and origins of things aus...,0.261074
8,before i say good-bye : a novel,0.260199
9,ufo kids,0.260185
