In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import ast

In [4]:
# Load the book summaries from the zipped CSV, using the file's first column as the index.
books_df = pd.read_csv('./books_summary.csv.zip', index_col=[0])

In [5]:
# Preview the first ten rows to see what each column holds.
books_df.head(10)

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,Brave New World,presents a futuristic society engineered perf...,science
4,1984,is the story of a man questioning the system ...,science
5,Stolen Focus,explains why our attention spans have been dw...,science
6,The Life-Changing Science of Detecting Bullshit,teaches its readers how to avoid falling for ...,science
7,Dopamine Nation,talks about the importance of living a balance...,science
8,The Art of Statistics,is a non-technical book that shows how statis...,science
9,No Self No Problem,is a provocative read about the implications ...,science


In [6]:
# Column dtypes and non-null counts; used to spot columns with missing values.
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5201 entries, 0 to 5244
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   book_name   5201 non-null   object
 1   summaries   5194 non-null   object
 2   categories  5201 non-null   object
dtypes: object(3)
memory usage: 162.5+ KB


In [7]:
# (rows, columns) of the raw frame before any cleaning.
books_df.shape

(5201, 3)

In [8]:
# Number of rows per category, most frequent first.
books_df.categories.value_counts()

categories
psychology       595
productivity     485
motivation       434
happiness        424
work             372
business         363
mindfulness      322
relationships    275
communication    236
science          209
creativity       195
management       187
health           187
money            157
politics         127
marketing        125
education        118
technology       107
biography         89
economics         77
environment       58
religion          39
fiction           20
Name: count, dtype: int64

In [9]:
# Drop rows without a summary: there is nothing to embed for them.
# Reassign instead of using inplace=True so the step stays chainable and explicit.
books_df = books_df.dropna(subset=['summaries'])

In [10]:
# Keep only the first row for each distinct summary text.
# Reassign instead of using inplace=True so the step stays chainable and explicit.
books_df = books_df.drop_duplicates(subset=['summaries'])

In [11]:
# Renumber rows 0..n-1 and discard the old index, which has gaps after rows were dropped.
books_df = books_df.reset_index(drop=True)

In [12]:
# Show the frame after the cleaning steps above.
books_df

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,Brave New World,presents a futuristic society engineered perf...,science
4,1984,is the story of a man questioning the system ...,science
...,...,...,...
1222,Better Than Before,breaks down the latest research on how to bre...,work
1223,The Happiness Hypothesis,is the most thorough analysis of how you can ...,work
1224,Rich Dad Poor Dad,"tells the story of a boy with two fathers, on...",work
1225,The Ruthless Elimination Of Hurry,"will teach you how to slow down, relax, and l...",mindfulness


In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

In [14]:
# Step 1: Load the BAAI/bge-small-en tokenizer and encoder from HuggingFace.
# Both objects must come from the same checkpoint, so the id is written once.
model_checkpoint = "BAAI/bge-small-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModel.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134M [00:00<?, ?B/s]

In [15]:
def get_embeddings(text):
    """Embed ``text`` by mean-pooling the encoder's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``. Inputs longer than the
    tokenizer's limit are truncated.

    Parameters
    ----------
    text : str or list of str
        A single text, or a batch of texts.

    Returns
    -------
    numpy.ndarray
        The pooled embedding with size-1 dimensions squeezed out: a 1-D vector
        for a single text, one row per text for a batch of several texts.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: every position counts as a real token.
        pooled = hidden.mean(dim=1)
    else:
        # Average over real tokens only. A plain mean would also average the
        # padding positions that appear when a batch has texts of different
        # lengths, skewing the embeddings of the shorter texts.
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)  # avoid division by zero
        pooled = (hidden * weights).sum(dim=1) / token_counts

    return pooled.squeeze().cpu().numpy()

# Embed every summary one at a time and store the resulting vector per row.
books_df['embedding'] = books_df['summaries'].map(get_embeddings)

In [17]:
# Spot-check ten random rows, including the new embedding column.
# No random_state is passed, so a different set of rows is shown on each run.
books_df.sample(10)

Unnamed: 0,book_name,summaries,categories,embedding
1144,One Simple Idea,shows you how to turn your ideas into license...,marketing,"[-0.1661913, -0.047558907, -0.057511996, -0.70..."
407,"Good Vibes, Good Life",explores ways to unlock your true potential b...,relationships,"[-0.31454104, -0.032631405, 0.5389788, -0.0352..."
1039,"Jab, Jab, Jab, Right Hook",is a message to everyone who’s not on the soc...,productivity,"[0.0068836473, -0.017536815, -0.011931073, -0...."
427,The Art of Rhetoric,"is an ancient, time-proven reference book tha...",relationships,"[-0.3225811, 0.46354115, 0.15573183, -0.121496..."
125,Against Empathy,explains the problems with society’s obsessio...,science,"[-0.24232997, -0.07629199, 0.3003001, -0.20524..."
1168,The Rebel Rules,shows you how you can run a business by being...,management,"[-0.1415425, -0.18809648, -0.047290917, -0.482..."
1001,A Curious Mind,is an homage to the power of asking questions...,productivity,"[-0.1691366, 0.451072, 0.13538055, -0.16832036..."
622,Four Thousand Weeks,explores the popularized concept of time mana...,happiness,"[-0.32544625, 0.70163804, 0.48712817, -0.25244..."
93,Brief Answers To The Big Questions,tackles some of the universe’s biggest myster...,science,"[-0.31145382, -0.121578135, -0.124654435, -0.1..."
398,A Walk In The Woods,tells the interesting story of the adventures...,environment,"[-0.41684574, 0.0908739, 0.6247539, 0.34636876..."


In [25]:
def get_cosine_simiarity(title, df):
    """Score every book in ``df`` by cosine similarity to the book named ``title``.

    This returns the full similarity vector; picking the top matches is left
    to the caller.

    Parameters
    ----------
    title : str
        Exact ``book_name`` of the reference book. If several rows share the
        name, the first one is used.
    df : pandas.DataFrame
        Frame with a ``book_name`` column and an ``embedding`` column holding
        one vector per row.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per row of ``df``, in row order. The
        reference book itself is included.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``book_name`` equal to ``title``.
    """
    # Use the frame that was passed in, not a notebook-level global, so the
    # function works on any frame with the expected columns.
    is_match = (df['book_name'] == title).to_numpy()
    if not is_match.any():
        raise IndexError(f"no book named {title!r} in the provided DataFrame")

    # argmax on a boolean array gives the position of the first True. Positional
    # lookup also works when the index labels are not unique.
    book_embedding = df['embedding'].iloc[is_match.argmax()]
    similarities = cosine_similarity([book_embedding], df['embedding'].tolist()).flatten()
    return similarities

### Finding Similar Books

 - The Bitcoin Standard
 - Measure What Matters
 - The Happiness Hypothesis

In [32]:
# Compute similarity scores between one sample book and every book in the catalogue.
# The scores are ranked and displayed in the cells that follow.
sample_name = 'The Bitcoin Standard'
similarity = get_cosine_simiarity(sample_name,
                                  books_df)

In [33]:
# Work on a copy so the similarity column added next does not end up in books_df.
rec_books = books_df.copy()

In [34]:
# Attach the scores positionally: `similarity` has one entry per row, in the same row order.
rec_books['similarity'] = list(similarity)

In [37]:
# Six highest-scoring rows: the query book itself (similarity 1.0) plus its five nearest neighbours.
rec_books.sort_values(by='similarity', ascending=False).head(6)

Unnamed: 0,book_name,summaries,categories,embedding,similarity
366,The Bitcoin Standard,uses the history of money and gold to explain...,economics,"[-0.3578873, 0.0131510375, 0.08036052, -0.1801...",1.0
370,The Age Of Cryptocurrency,"explains the past, present, and future of Bit...",economics,"[-0.4667489, -0.21616937, 0.16893215, -0.08813...",0.888518
829,Blockchain Revolution,explains how the power of this new technology...,money,"[-0.44502056, -0.18159947, 0.10160462, -0.2489...",0.845817
371,Digital Gold,"details the beginnings of Bitcoin, including ...",economics,"[-0.5247627, -0.075224645, -0.044920057, -0.45...",0.840757
367,Cryptoassets,is your guide to understanding this revolutio...,economics,"[-0.41580233, -0.2714688, 0.31340843, -0.31226...",0.827831
852,Dollars And Sense,explains why it’s so hard to manage money and...,money,"[-0.048160866, -0.37490886, 0.30639285, 0.0366...",0.813202
