## Loading the dataset

In [15]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.0


In [1]:
import warnings
# Suppress every warning for the rest of the session so cell outputs stay readable.
# NOTE(review): this also hides deprecation notices from the libraries used below — consider a narrower filter.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import ast

In [4]:
# Zipped CSV of book summaries; its first column holds the saved row labels and becomes the index.
BOOKS_CSV_URL = 'https://raw.githubusercontent.com/manaranjanp/ISB_MLUL/main/clustering/books_summary.csv.zip'
books_df = pd.read_csv(BOOKS_CSV_URL, index_col=[0])

In [5]:
# Preview the first ten rows to check that the columns loaded as expected.
books_df.head(10)

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,Brave New World,presents a futuristic society engineered perf...,science
4,1984,is the story of a man questioning the system ...,science
5,Stolen Focus,explains why our attention spans have been dw...,science
6,The Life-Changing Science of Detecting Bullshit,teaches its readers how to avoid falling for ...,science
7,Dopamine Nation,talks about the importance of living a balance...,science
8,The Art of Statistics,is a non-technical book that shows how statis...,science
9,No Self No Problem,is a provocative read about the implications ...,science


In [6]:
# Column dtypes and non-null counts — this is where rows with a missing summary show up.
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5201 entries, 0 to 5244
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   book_name   5201 non-null   object
 1   summaries   5194 non-null   object
 2   categories  5201 non-null   object
dtypes: object(3)
memory usage: 162.5+ KB


In [7]:
# (rows, columns) of the frame as loaded, before any cleaning.
books_df.shape

(5201, 3)

## Drop the books without summaries

In [11]:
# Rows without a summary cannot be embedded, so drop them.
# Reassigning the result (instead of inplace=True) keeps the step explicit and chainable.
books_df = books_df.dropna(subset=['summaries'])

In [13]:
# Keep only the first row for each distinct summary text.
# Reassigning the result (instead of inplace=True) keeps the step explicit and chainable.
books_df = books_df.drop_duplicates(subset=['summaries'])

In [12]:
# Renumber the rows 0..n-1 and discard the old labels, which have gaps once rows have been dropped.
books_df = books_df.reset_index(drop=True)

In [14]:
# Show the cleaned frame; pandas elides the middle rows of a long frame in the rendered output.
books_df

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,Brave New World,presents a futuristic society engineered perf...,science
4,1984,is the story of a man questioning the system ...,science
...,...,...,...
1222,Better Than Before,breaks down the latest research on how to bre...,work
1223,The Happiness Hypothesis,is the most thorough analysis of how you can ...,work
1224,Rich Dad Poor Dad,"tells the story of a boy with two fathers, on...",work
1225,The Ruthless Elimination Of Hurry,"will teach you how to slow down, relax, and l...",mindfulness


## Embedding the summaries

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [18]:
# Step 1: Load the all-MiniLM-L6-v2 sentence-embedding model (fetched from the Hugging Face Hub on first use)
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
# Generate embeddings for each summary.
# Passing the whole list lets encode() batch the texts instead of running one forward pass per row.
# It returns one vector per input, in input order; list() splits the 2-D result into per-row arrays.
summary_embeddings = model.encode(books_df['summaries'].tolist())
books_df['embedding'] = list(summary_embeddings)

In [20]:
# Spot-check ten random rows to confirm each one now carries an embedding vector.
# No random_state is passed, so a different set of rows appears on every run.
books_df.sample(10)

Unnamed: 0,book_name,summaries,categories,embedding
651,Under Pressure,uncovers the hidden anxieties and stresses th...,happiness,"[0.034994278, 0.010860743, 0.12587596, 0.06007..."
722,Irresistible,reveals how alarmingly stuck to our devices w...,happiness,"[-0.042299736, 0.015681317, 0.064593956, -0.01..."
1004,Hatching Twitter,details the story and human drama behind the ...,productivity,"[-0.017622633, -0.03763212, 0.009620747, 0.004..."
458,Good People,is a book about business and leadership which...,relationships,"[-0.042867072, 0.042867426, -0.03860696, -0.03..."
353,The Invincible Company,explores the secrets of a successful company ...,economics,"[-0.03013611, -0.057211988, -0.068299614, -0.0..."
699,Self-Compassion,teaches you the art of being kind to yourself...,happiness,"[-0.008300158, 0.06418497, 0.019911041, 0.0704..."
1166,The 21 Irrefutable Laws Of Leadership,shows you that leadership is learned not inhe...,management,"[-0.00033710475, 0.013579462, 0.043360002, 0.0..."
1034,The Psychology of Winning,"teaches you the 10 qualities of winners, whic...",productivity,"[-0.026224231, 0.07783246, -0.02457525, -0.087..."
437,Keep Showing Up,explores the struggles that married couples f...,relationships,"[-0.034185056, 0.07970856, 0.07080163, 0.11560..."
49,The Emperor Of All Maladies,details the beginnings and progress in our un...,science,"[0.032302193, 0.079289205, 0.011555844, 0.0205..."


## Finding the top-N most similar books

In [21]:
# Score one book (looked up by title) against every book in a DataFrame
def get_cosine_simiarity(title, df):
    """Return the cosine similarity between one book and every book in ``df``.

    Parameters
    ----------
    title : str
        Exact ``book_name`` of the query book. If several rows share the
        title, the first one is used.
    df : pandas.DataFrame
        Frame with a ``book_name`` column and an ``embedding`` column holding
        one equal-length vector per row.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per row of ``df``, in row order.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``book_name`` equal to ``title``.
    """
    is_match = (df['book_name'] == title).to_numpy()
    if not is_match.any():
        raise IndexError(f"No book titled {title!r} in the supplied DataFrame")
    # argmax on a boolean mask gives the position of the first True; positional
    # access stays correct even when df's index labels are not unique.
    book_embedding = df['embedding'].iloc[is_match.argmax()]
    similarities = cosine_similarity([book_embedding], df['embedding'].tolist()).flatten()
    return similarities

### Finding Similar Books

 - The Bitcoin Standard
 - Measure What Matters
 - The Happiness Hypothesis

In [26]:
# Draw ten random titles to pick a query book from.
books_df['book_name'].sample(10)

Unnamed: 0,book_name
172,Einstein: His Life And Universe
823,Pioneering Portfolio Management
834,From Here To Financial Happiness
1028,The Success Principles
203,Will
1015,The Art Of Learning
987,How To Read A Book
967,The 5 Second Rule
1093,Do The Work
300,Winners Take All


In [31]:
# Pick a query title and score every book in the catalogue against it
# Alternative title to try: 'Einstein: His Life And Universe'
sample_name = "The Art Of Learning"
similarity = get_cosine_simiarity(sample_name,
                                  books_df)

In [32]:
# Work on a copy so the similarity column added below does not end up in books_df.
rec_books = books_df.copy()

In [33]:
# Attach the scores positionally: `similarity` was computed from books_df, which rec_books copies row for row.
rec_books['similarity'] = list(similarity)

In [34]:
# Highest scores first. The query book matches itself with similarity 1.0,
# so six rows are kept: the query followed by its five closest books.
ranked_books = rec_books.sort_values(by='similarity', ascending=False)
ranked_books.head(6)

Unnamed: 0,book_name,summaries,categories,embedding,similarity
1015,The Art Of Learning,explains the science of becoming a top perfor...,productivity,"[0.057849336, -0.02613431, -0.01824605, -0.049...",1.0
192,Bounce,shows you that training trumps talent every t...,science,"[0.07816114, 0.006413545, 0.072669335, -0.0408...",0.627209
1064,Psyched Up,is an in-depth look at the science behind men...,psychology,"[0.09361926, -0.0667208, -0.0047581135, -0.043...",0.5939
985,Mind Gym,explains why the performance of world-class a...,productivity,"[0.08948292, 0.052655842, 0.0183305, -0.077798...",0.581116
994,The Rise Of Superman,"decodes the science of ultimate, human perfor...",productivity,"[0.017936006, 0.04562372, 0.007561816, -0.0560...",0.576935
956,Peak Performance,shows you how to perform at your highest leve...,productivity,"[0.03880304, 0.0336778, 0.0039549246, -0.01320...",0.559047
