In [6]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
df = pd.read_csv("books_summary.csv")

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,book_name,summaries,categories
0,0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,3,Brave New World,presents a futuristic society engineered perf...,science
4,4,1984,is the story of a man questioning the system ...,science


In [10]:
df.info()
df.shape
df.rename(columns={"Unnamed: 0":"index"}, inplace = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5201 entries, 0 to 5200
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5201 non-null   int64 
 1   book_name   5201 non-null   object
 2   summaries   5194 non-null   object
 3   categories  5201 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.7+ KB


In [11]:
df.isna().sum()

index         0
book_name     0
summaries     7
categories    0
dtype: int64

In [12]:
df.dropna(axis=0,inplace=True)

In [13]:
df["categories"].unique()

array(['science', 'biography', 'politics', 'economics', 'environment',
       'relationships', 'happiness', 'money', 'productivity',
       'psychology', 'motivation', 'marketing', 'management', 'health',
       'business', 'creativity', 'education', 'fiction', 'communication',
       'religion', 'technology', 'work', 'mindfulness'], dtype=object)

In [14]:
combined_features = df['book_name'] + ' ' + df['categories'] + ' ' + df['summaries']
combined_features

0       The Highly Sensitive Person science  is a self...
1       Why Has Nobody Told Me This Before? science  i...
2       The Midnight Library science  tells the story ...
3       Brave New World science  presents a futuristic...
4       1984 science  is the story of a man questionin...
                              ...                        
5196    Essentialism mindfulness  will show you a new,...
5197    The Art Of Happiness mindfulness  is the resul...
5198    The Paradox Of Choice mindfulness  shows you h...
5199    Stumbling On Happiness mindfulness  examines t...
5200    The 7 Habits Of Highly Effective People mindfu...
Length: 5194, dtype: object

In [15]:
combined_features.duplicated().sum()

np.int64(222)

In [16]:
combined_features.drop_duplicates(inplace=True)

In [17]:
combined_features.shape

(4972,)

In [18]:
vectorizer = TfidfVectorizer(analyzer="char_wb")
# CountVectorizer
feature_vectors = vectorizer.fit_transform(combined_features)

In [19]:
print(feature_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 129343 stored elements and shape (4972, 61)>
  Coords	Values
  (0, 0)	0.7956529047165014
  (0, 46)	0.16877485857622757
  (0, 34)	0.09650097372196471
  (0, 31)	0.361733143343864
  (0, 35)	0.12057771444795469
  (0, 33)	0.024437739587322427
  (0, 38)	0.13282231529311528
  (0, 51)	0.03675996208976262
  (0, 45)	0.20502333771356776
  (0, 40)	0.13263548589275015
  (0, 48)	0.039925642303285304
  (0, 42)	0.07362349882660389
  (0, 44)	0.13260881745275024
  (0, 41)	0.16877485857622757
  (0, 29)	0.04847416214914716
  (0, 27)	0.13260881745275024
  (0, 32)	0.08611895379585822
  (0, 11)	0.08012099217952393
  (0, 39)	0.0610820638902638
  (0, 47)	0.060641460412211474
  (0, 30)	0.060775747877832714
  (0, 49)	0.03757490064899096
  (0, 10)	0.06009004094830902
  (0, 43)	0.03745053956449216
  (0, 12)	0.012233622412725798
  :	:
  (4971, 46)	0.10864040710882611
  (4971, 34)	0.1304471580423
  (4971, 31)	0.24992318731806118
  (4971, 35)	0.16299338303

In [48]:
similarity = cosine_similarity(feature_vectors, feature_vectors)
similarity[-1]

array([0.9696033 , 0.9691096 , 0.97568329, ..., 0.98144978, 0.97740486,
       1.        ], shape=(4972,))

In [21]:
list_of_all_titles = df['book_name'].tolist()
print(list_of_all_titles)

['The Highly Sensitive Person', 'Why Has Nobody Told Me This Before?', 'The Midnight Library', 'Brave New World', '1984', 'Stolen Focus', 'The Life-Changing Science of Detecting Bullshit', 'Dopamine Nation', 'The Art of Statistics', 'No Self No Problem', 'The Highly Sensitive Person', 'Why Has Nobody Told Me This Before?', 'The Midnight Library', 'Brave New World', '1984', 'Stolen Focus', 'The Life-Changing Science of Detecting Bullshit', 'Dopamine Nation', 'The Art of Statistics', 'No Self No Problem', 'The Code Breaker', 'Super Human', 'This Is Your Mind On Plants', 'The Undoing Project', 'The Dawn of Everything', 'One Decision', 'Napoleon’s Buttons', 'The Sovereign Individual', 'Keto Answers', 'Joyful', 'Invisible Women', 'The Worldly Philosophers', 'Brain Maker', 'Tesla: Man Out of Time', 'The God Equation', 'Astrophysics for People in a Hurry', 'Lifespan', 'The Inner Life of Animals', 'Chaos', 'Unlimited Memory', 'Wintering', 'The Shallows', 'Rationality', 'The High 5 Habit', 'Why

In [22]:
book_name = "My Age Of Anxiety"  # input(' Enter your favourite movie name : ') #My Age Of Anxiety

In [34]:
find_close_match = difflib.get_close_matches(book_name, list_of_all_titles, n=10, cutoff=0.5)
print(find_close_match)

['My Age Of Anxiety', 'My Age Of Anxiety', 'My Age Of Anxiety', 'Age Of Anger', 'Age Of Ambition', 'My Stroke Of Insight', 'My Stroke Of Insight', 'My Stroke Of Insight', 'Status Anxiety', 'Status Anxiety']


In [43]:
close_match = find_close_match[0]
index_of_the_movie = df[df.book_name == close_match].index.values[0]
index_of_the_movie

np.int64(91)

In [49]:
# getting a list of similar movies
similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, np.float64(0.9602040512702652)), (1, np.float64(0.9711593621593826)), (2, np.float64(0.9665542143165338)), (3, np.float64(0.9553743054829499)), (4, np.float64(0.9741361226612786)), (5, np.float64(0.9802182257916439)), (6, np.float64(0.9629789540157608)), (7, np.float64(0.9793483696139812)), (8, np.float64(0.9738689553542677)), (9, np.float64(0.974857543438934)), (10, np.float64(0.9607248780215434)), (11, np.float64(0.9713998853920891)), (12, np.float64(0.972292564509218)), (13, np.float64(0.9737816735570329)), (14, np.float64(0.978565069814417)), (15, np.float64(0.9696850115728886)), (16, np.float64(0.9815434966848863)), (17, np.float64(0.9575061850906628)), (18, np.float64(0.9768863722907805)), (19, np.float64(0.9831235729980154)), (20, np.float64(0.9839812914516517)), (21, np.float64(0.9825656344281049)), (22, np.float64(0.981939816187661)), (23, np.float64(0.9715035941351909)), (24, np.float64(0.9617887052267892)), (25, np.float64(0.9786098259603921)), (26, np.float64(0.9700264

In [26]:
# sorting the movies based on their similarity score
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_movies)

[(91, np.float64(1.0000000000000002)), (2608, np.float64(0.996661886654547)), (3089, np.float64(0.9931327117375354)), (3569, np.float64(0.9930149805874278)), (2252, np.float64(0.992953788268368)), (175, np.float64(0.9923399389414973)), (4600, np.float64(0.9923332003470877)), (3366, np.float64(0.9909521376178744)), (4387, np.float64(0.990535186826203)), (3625, np.float64(0.9902775410662047)), (4390, np.float64(0.9901809830795388)), (1747, np.float64(0.9901321380415952)), (2310, np.float64(0.9900242470140959)), (4165, np.float64(0.9899776415165213)), (1223, np.float64(0.9899637175942535)), (3599, np.float64(0.989937371817743)), (2320, np.float64(0.9898496039128608)), (2542, np.float64(0.9897049183462697)), (3371, np.float64(0.9895862306959946)), (2844, np.float64(0.9893735146689403)), (4027, np.float64(0.9892514010109994)), (3397, np.float64(0.9892482688414791)), (4152, np.float64(0.9892123977608624)), (3363, np.float64(0.9891637472396638)), (4630, np.float64(0.9889893595012941)), (4237,

In [27]:
top_sim = sorted_similar_movies[:5]
top_sim

[(91, np.float64(1.0000000000000002)),
 (2608, np.float64(0.996661886654547)),
 (3089, np.float64(0.9931327117375354)),
 (3569, np.float64(0.9930149805874278)),
 (2252, np.float64(0.992953788268368))]

In [None]:

for i, book in enumerate(top_sim):
    index = book[0]
    book_name_from_index = df.loc[index, 'book_name'] if index in df.index else "no similar book available"
    print(i, "-", book_name_from_index)


for i, movie in enumerate(sorted_similar_movies[:6]):  # Limit to the first 6 iterations directly
    index = movie[0]
    title_from_index = df.loc[index, 'book_name'] if index in df.index else "Title Not Found"
    print(i, '-', title_from_index)


0 - My Age Of Anxiety
1 - How to Take Smart Notes
2 - Effortless
3 - The Hero Factor
4 - The Four Tendencies
5 - The Botany Of Desire
