In [1]:
# importing libraries
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the book descriptions; the encoding is given explicitly as Latin-1
book_description = pd.read_csv(
    'description.csv',
    encoding='latin-1',
)

In [3]:
# Sanity check: preview the first five rows of the loaded data
book_description.head(5)

Unnamed: 0,book_id,name,description
0,4833.0,The Glass Castle,"A tender, moving tale of unconditional love in..."
1,590.0,"Night (The Night Trilogy, #1)","Born into a Jewish ghetto in Hungary, as a chi..."
2,4264.0,"Angela's Ashes (Frank McCourt, #1)",Imbued on every page with Frank McCourt's asto...
3,3361.0,"Eat, Pray, Love","A celebrated writer's irresistible, candid, an..."
4,4535.0,Into Thin Air: A Personal Account of the Mount...,A bank of clouds was assembling on the not-so-...


In [4]:
# Replace missing descriptions (NaN) with empty strings so every document is a string
book_description['description'] = book_description['description'].fillna('')

# Vectorizer configured to ignore common English stop words
books_tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix used for the cosine similarity below
book_description_matrix = books_tfidf.fit_transform(book_description['description'])


In [5]:
# Check the shape of the computed TF-IDF matrix:
# one row per book description, one column per vocabulary term
book_description_matrix.shape

(143, 4186)

In [6]:
# Compute the pairwise similarity matrix between all book descriptions with
# sklearn's linear_kernel (dot product of the TF-IDF rows)
cosine_similarity = linear_kernel(
    X=book_description_matrix,
    Y=book_description_matrix,
)

In [7]:

# Rank every other book by the similarity of its description to the query
# book's description and show the closest matches.
query_index = 2  # position of the query book in book_description
top_n = 5        # number of recommendations to show

# Pair each book position with its similarity score to the query book.
# The query book is removed by position instead of by slicing off the first
# sorted entry: when another book ties with it (e.g. a duplicate description,
# or an empty description where every score is 0) the query book is not
# guaranteed to sort first, and the slice would drop the wrong book.
similarity_scores = [
    (index, score)
    for index, score in enumerate(cosine_similarity[query_index])
    if index != query_index
]

# Highest similarity first; sorted() is stable, so ties keep dataset order
similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
similarity_scores = similarity_scores[:top_n]

# Get the positions of the most similar books
books_index = [i[0] for i in similarity_scores]

# Print the titles of the most similar books using integer-location based indexing (iloc)
print(book_description['name'].iloc[books_index])


6                                 Running with Scissors 
29                            The Diary of a Young Girl 
116    It's St. Patrick's Day (Turtleback School & Li...
11     Persepolis: The Story of a Childhood (Persepol...
20     Maus I: A Survivor's Tale: My Father Bleeds Hi...
Name: name, dtype: object
