In [1]:
# importing libraries
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the book descriptions from disk, decoding the file as Latin-1
book_description = pd.read_csv(
    'description.csv',
    encoding='latin-1',
)

In [3]:
# Preview the first rows to confirm the file loaded with the expected columns
book_description.head()

Unnamed: 0,BOOK NUMBER,BOOK TITLE,DESCRIPTION
0,1,Harry Potter & The Sorcerer's Stone,Harry Potter's life is miserable. His parents ...
1,2,Harry Potter & The Chamber of Secrets,Ever since Harry Potter had come home for the ...
2,3,Harry Potter and The Prisoner of Azkaban,"For twelve long years, the dread fortress of A..."
3,4,Harry Potter and The Goblet of Fire,Harry Potter is midway through his training as...
4,5,Harry Potter and The Order of Phoenix,"Harry has a lot on his mind for this, his fift..."


In [4]:
# Replace missing descriptions with empty strings so the vectorizer only receives text
book_description['DESCRIPTION'] = book_description['DESCRIPTION'].fillna('')
# TF-IDF vectorizer that drops scikit-learn's built-in English stop words
books_tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and build the TF-IDF matrix used for the similarity computation
book_description_matrix = books_tfidf.fit_transform(book_description['DESCRIPTION'])

In [5]:
# Shape of the TF-IDF matrix: (number of descriptions, number of vocabulary terms)
book_description_matrix.shape

(106, 1258)

In [6]:
# Pairwise similarity of every description against every other one, via sklearn's
# linear_kernel. TfidfVectorizer L2-normalises each row by default, so the plain
# dot product computed here equals the cosine similarity.
# With a single argument, linear_kernel compares the matrix against itself.
cosine_similarity = linear_kernel(book_description_matrix)

In [7]:
# Lookup Series used by recommend(): keyed 0..n-1, its values are the row labels
# of book_description (taken from the index of the 'BOOK TITLE' column).
# NOTE(review): recommend() uses these values as row positions in the similarity
# matrix, which presumably relies on the frame having the default 0..n-1 index.
indices = pd.Series(book_description['BOOK TITLE'].index)

In [8]:
# Function to get the most similar books
def recommend(index, cosine_sim=cosine_similarity, top_n=5):
    """Return the BOOK NUMBERs of the books most similar to a given book.

    Parameters
    ----------
    index : int
        Key into the module-level ``indices`` Series identifying the query
        book (e.g. ``recommend(2)`` queries the book at row 2).
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of book
        ``i`` to every book. The default is the ``cosine_similarity`` matrix
        as it existed when this function was defined, so re-run this cell
        after recomputing the matrix.
    top_n : int, optional
        Maximum number of recommendations to return (default 5). Values
        below zero are treated as zero.

    Returns
    -------
    pandas.Series
        ``BOOK NUMBER`` values of the most similar books, most similar first.
        The query book itself is never part of the result.
    """
    book_pos = indices[index]
    scores = cosine_sim[book_pos]

    # Exclude the query book explicitly instead of assuming it sorts first:
    # with duplicate descriptions (ties at the top score) or an empty
    # description (all-zero row, including self-similarity) the query book is
    # not guaranteed to be ranked first, and slicing off the first entry
    # would drop a real match and recommend the book to itself.
    candidates = [pos for pos in range(len(scores)) if pos != book_pos]

    # list.sort is stable, so books with equal scores keep ascending row order.
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    # Map the selected row positions to their BOOK NUMBER via positional indexing.
    return book_description['BOOK NUMBER'].iloc[candidates[:max(top_n, 0)]]

In [9]:
# Recommendations for the book at row index 2
# (BOOK NUMBER 3, "Harry Potter and The Prisoner of Azkaban")
recommend(2)

6      7
80    81
5      6
72    73
58    59
Name: BOOK NUMBER, dtype: int64

In [10]:
# Recommendations for the book at row index 6 (BOOK NUMBER 7)
recommend(6)

1    2
3    4
5    6
2    3
4    5
Name: BOOK NUMBER, dtype: int64