In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import ast

In [3]:
books_df = pd.read_csv('https://raw.githubusercontent.com/manaranjanp/ISB_MLUL2/main/cf/books_summary.csv.zip', index_col=[0])

In [4]:
books_df.head(10)

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,Brave New World,presents a futuristic society engineered perf...,science
4,1984,is the story of a man questioning the system ...,science
5,Stolen Focus,explains why our attention spans have been dw...,science
6,The Life-Changing Science of Detecting Bullshit,teaches its readers how to avoid falling for ...,science
7,Dopamine Nation,talks about the importance of living a balance...,science
8,The Art of Statistics,is a non-technical book that shows how statis...,science
9,No Self No Problem,is a provocative read about the implications ...,science


In [5]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5201 entries, 0 to 5244
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   book_name   5201 non-null   object
 1   summaries   5194 non-null   object
 2   categories  5201 non-null   object
dtypes: object(3)
memory usage: 162.5+ KB


In [6]:
books_df.shape

(5201, 3)

In [7]:
books_df.categories.value_counts()

categories
psychology       595
productivity     485
motivation       434
happiness        424
work             372
business         363
mindfulness      322
relationships    275
communication    236
science          209
creativity       195
management       187
health           187
money            157
politics         127
marketing        125
education        118
technology       107
biography         89
economics         77
environment       58
religion          39
fiction           20
Name: count, dtype: int64

In [8]:
books_df.dropna(subset=['summaries'], inplace=True)

In [9]:
books_df.drop_duplicates(subset=['summaries'], inplace=True)

In [10]:
books_df = books_df.reset_index(drop=True)

In [11]:
books_df

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,Brave New World,presents a futuristic society engineered perf...,science
4,1984,is the story of a man questioning the system ...,science
...,...,...,...
1222,Better Than Before,breaks down the latest research on how to bre...,work
1223,The Happiness Hypothesis,is the most thorough analysis of how you can ...,work
1224,Rich Dad Poor Dad,"tells the story of a boy with two fathers, on...",work
1225,The Ruthless Elimination Of Hurry,"will teach you how to slow down, relax, and l...",mindfulness


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Create tf-idf vectors using the overview column
tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                   min_df = 5,
                                   max_df = 0.8)

In [14]:
tfidf_matrix = tfidf_vectorizer.fit_transform(books_df['summaries'])

In [15]:
tfidf_matrix.shape

(1227, 900)

In [16]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [18]:
books_sim = 1 - pairwise_distances( tfidf_matrix, metric="cosine" )

In [21]:
books_sim_df = pd.DataFrame( books_sim )

In [22]:
books_sim_df.head( 10 )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1217,1218,1219,1220,1221,1222,1223,1224,1225,1226
0,1.0,0.0,0.0,0.0,0.0,0.077582,0.037083,0.0,0.0,0.07213,...,0.0,0.0,0.043648,0.0,0.0,0.0,0.0,0.0,0.105632,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081296,0.0
2,0.0,0.0,1.0,0.0,0.076252,0.078215,0.0,0.023923,0.0,0.0,...,0.0,0.0,0.0,0.07951,0.0,0.0,0.022141,0.16425,0.034685,0.0
3,0.0,0.0,0.0,1.0,0.126309,0.0,0.0,0.0,0.0,0.0,...,0.097214,0.0,0.0,0.0,0.0,0.0,0.226319,0.0,0.0,0.0
4,0.0,0.0,0.076252,0.126309,1.0,0.0,0.0,0.0,0.0,0.0,...,0.140918,0.0,0.0,0.0,0.126487,0.0,0.084076,0.059337,0.0,0.0
5,0.077582,0.0,0.078215,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.123349,0.0,0.195044,0.0,0.0,0.0,0.0,0.100842,0.0
6,0.037083,0.0,0.0,0.0,0.0,0.0,1.0,0.078384,0.157998,0.0,...,0.078421,0.104375,0.079798,0.086492,0.0,0.0,0.169283,0.044592,0.0,0.111226
7,0.0,0.0,0.023923,0.0,0.0,0.0,0.078384,1.0,0.0,0.0,...,0.0,0.0,0.0,0.271572,0.0,0.0,0.020419,0.018616,0.031989,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.157998,0.0,1.0,0.0,...,0.083361,0.110949,0.030791,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.07213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
def get_similar_books( title, topN = 5 ):
    book_idx = books_df[books_df['book_name'] == title].index[0]
    books_df['similarity'] = books_sim_df.iloc[book_idx]
    top_n = books_df.sort_values( ["similarity"], ascending = False )[0:topN+1]
    #print( "Similar Movies to: ", )
    return top_n

### Finding Similar Books

 - The Bitcoin Standard
 - Measure What Matters
 - The Happiness Hypothesis

In [51]:
books_df[books_df.book_name.str.contains("Mindfulness")]

Unnamed: 0,book_name,summaries,categories,similarity
731,The Miracle of Mindfulness,teaches the ancient Buddhist practice of mind...,happiness,0.0


In [53]:
get_similar_books( 'The Miracle of Mindfulness' )

Unnamed: 0,book_name,summaries,categories,similarity
731,The Miracle of Mindfulness,teaches the ancient Buddhist practice of mind...,happiness,1.0
129,Aware,is a comprehensive overview of the far-reachi...,science,0.349732
796,10% Happier,"gives skeptics an easy “in” to meditation, by...",happiness,0.281242
641,Journey of Awakening,explains the basics of meditation using ideas...,happiness,0.273187
797,The Power of Now,shows you that every minute you spend worryin...,happiness,0.268207
546,How Successful People Think,lays out eleven specific ways of thinking you...,relationships,0.256036
