In [None]:
import pandas as pd


# Read the book summaries dataset (path is relative to the notebook's folder).
df = pd.read_csv(filepath_or_buffer='../data/books_summary.csv')

# Preview the first five rows to confirm the columns loaded as expected.
df.head(5)

Unnamed: 0,Title,Author,Image_URL,Summary
0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit..."
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,https://i.gr-assets.com/images/S/compressed.ph...,Harry Potter is about to start his fifth year ...
2,Pride and Prejudice,Jane Austen,https://i.gr-assets.com/images/S/compressed.ph...,"Since its immediate success in 1813, Pride and..."
3,To Kill a Mockingbird,Harper Lee,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...
4,The Book Thief,Markus Zusak,https://i.gr-assets.com/images/S/compressed.ph...,Librarian's note: An alternate cover edition c...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Turn each summary into a TF-IDF vector: English stop words are dropped and
# the vocabulary is capped at the 5000 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# A missing summary (NaN) makes fit_transform raise, so treat it as an empty
# document; such a row becomes an all-zero vector.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Summary'].fillna(''))

print("TF-IDF Vector size : ", tfidf_matrix.shape)

TF-IDF Vector size :  (100, 3693)


In [4]:
from sklearn.metrics.pairwise import cosine_similarity


# Pairwise cosine similarity between every pair of summary vectors.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Lookup table: book title -> row index in df.
# Series.drop_duplicates() compares *values* (the row indices, which are always
# unique), so it never removed repeated titles. Deduplicate on the index
# (the titles) instead, keeping the first occurrence, so that a title lookup
# always yields a single scalar row index.
book_indices = pd.Series(df.index, index=df['Title'])
book_indices = book_indices[~book_indices.index.duplicated(keep='first')]


In [32]:
# Sanity check: look up the row index stored for a known title.
book_indices['The Hunger Games (The Hunger Games, #1)']

0

In [34]:
# Shape of the pairwise similarity matrix (one row and one column per summary).
cosine_sim.shape

(100, 100)

In [5]:
# Pair every score in this book's similarity row with its position,
# giving (row index, similarity) tuples.
list(enumerate(cosine_sim[book_indices['The Hunger Games (The Hunger Games, #1)']]))

[(0, 1.0000000000000002),
 (1, 0.03381437012769274),
 (2, 0.00994114485878779),
 (3, 0.017321764904272945),
 (4, 0.015372799919494256),
 (5, 0.029238979684375523),
 (6, 0.008494178999704139),
 (7, 0.0022024193805789682),
 (8, 0.013626055649018434),
 (9, 0.0036276313129081257),
 (10, 0.002400454489029344),
 (11, 0.024809546110202217),
 (12, 0.001757824283672759),
 (13, 0.0),
 (14, 0.010525849528176083),
 (15, 0.04039404837721644),
 (16, 0.04030701099784537),
 (17, 0.029545265402630527),
 (18, 0.014447679649288706),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.06648660806580924),
 (23, 0.011793545763751342),
 (24, 0.019155929743858678),
 (25, 0.011566477103239757),
 (26, 0.07659528456531214),
 (27, 0.0),
 (28, 0.04500900935040407),
 (29, 0.012605670209134879),
 (30, 0.028484412063072756),
 (31, 0.02790240816770439),
 (32, 0.0),
 (33, 0.025516643131398623),
 (34, 0.020006602206800873),
 (35, 0.002146688850734836),
 (36, 0.027460374363870832),
 (37, 0.018354512956704355),
 (38, 0.0310203261

In [6]:
def recommend_books(title, top_n=5):
    """Return the books whose summaries are most similar to the given title.

    Relies on the notebook-level `book_indices` (title -> row index),
    `cosine_sim` (pairwise similarity matrix) and `df` (book table).

    Args:
        title: Exact book title, as it appears in `book_indices`.
        top_n: Maximum number of recommendations to return; values <= 0
            give an empty result.

    Returns:
        A DataFrame with the 'Title' and 'Author' columns of the most
        similar books, ordered from most to least similar. The queried
        book itself is never included.

    Raises:
        KeyError: If `title` is not present in `book_indices`.
    """
    if title not in book_indices.index:
        raise KeyError(f"Book title not found: {title!r}")

    book_idx = book_indices[title]
    # A title that occurs more than once makes the lookup return a Series
    # rather than a scalar; fall back to its first occurrence.
    if hasattr(book_idx, 'iloc'):
        book_idx = book_idx.iloc[0]
    book_idx = int(book_idx)

    scores = cosine_sim[book_idx]

    # Exclude the queried book by index instead of assuming it sorts first:
    # another book with an equal similarity (e.g. an identical summary) can
    # outrank it in a stable sort, which would leak the query into its own
    # recommendations and drop a genuine match.
    candidates = [i for i in range(len(scores)) if i != book_idx]
    candidates.sort(key=lambda i: scores[i], reverse=True)

    top_matches = candidates[:max(top_n, 0)]
    return df.iloc[top_matches][['Title', 'Author']]

# Example: recommendations for one title, using the default top_n.
recommend_books('The Great Gatsby')

Unnamed: 0,Title,Author
12,Wuthering Heights,Emily Brontë
52,Frankenstein: The 1818 Text,Mary Wollstonecraft Shelley
42,"Little Women (Little Women, #1)",Louisa May Alcott
4,The Book Thief,Markus Zusak
33,Charlotte’s Web,E.B. White
