In [38]:
import os 
import requests
import pandas as pd
import numpy as np
import thefuzz as fuzz
import re
import string

from utils import check_to_run_initial_data_load, pull_from_google_books, create_library
from utils import titles_l # Input data
from utils import authors_l # Input data

MATCH_SCORE = 70
LAST_N_BOOKS = 10
TERMS_IN_SEARCH_QUERY = 7

final_books_df = pd.read_csv('library.csv')
final_books_df = final_books_df.tail(LAST_N_BOOKS)

final_books_df['description'] = final_books_df['description'].astype(str).apply(
    lambda x: re.sub(f"[{re.escape(string.punctuation)}]", "", x)
)

final_books_df.head()


Unnamed: 0.1,Unnamed: 0,title,subtitle,authors,pulishedDate,pageCount,categories,description,full_title
3,0,SUMMARY - Bullshit Jobs: A Theory By David Gra...,,['Shortcut Edition'],2021-06-17,24,['Business & Economics'],Our summary is short simple and pragmatic It ...,SUMMARY - Bullshit Jobs: A Theory By David Gra...
4,0,The Science of Self-Learning,"How to Teach Yourself Anything, Learn More in ...",['Peter Hollins'],2019-10-22,202,['Education'],How to learn effectively when you have to be b...,The Science of Self-Learning How to Teach Your...
5,0,Zen Golf,Mastering the Mental Game,['Joseph Parent'],2002-06-18,226,['Sports & Recreation'],A highly original and groundbreaking book from...,Zen Golf Mastering the Mental Game
6,0,Mighty Numbers,,['Marvel Press Book Group'],2016-05-03,0,['Juvenile Fiction'],Young children will learn to count from one to...,Mighty Numbers
7,0,Happier Hour,"How to Beat Distraction, Expand Your Time, and...",['Cassie Holmes'],2023-06-20,320,['Biography & Autobiography'],We live in a culture where most of us suffer f...,"Happier Hour How to Beat Distraction, Expand Y..."


## TFIDF as a method to generate Search Query

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(final_books_df['description'])

# Get feature names and sum TF-IDF scores across all documents
feature_names = tfidf.get_feature_names_out()
tfidf_scores = X.sum(axis=0).A1  # Flatten the matrix to 1D array

# Get indices of top 7 keywords
top_indices = tfidf_scores.argsort()[-TERMS_IN_SEARCH_QUERY:][::-1]
top_keywords = [feature_names[i] for i in top_indices]

print("Top keywords in the whole dataset:", top_keywords)
tfidf_search_query = " ".join(top_keywords)
print(tfidf_search_query)


Top keywords in the whole dataset: ['business', 'humor', 'time', 'book', 'golf', 'jobs', 'life']
business humor time book golf jobs life


##  Rake

In [45]:
from rake_nltk import Rake

corpus = " ".join(final_books_df['description'].astype(str))

rake = Rake(max_length=2)
rake.extract_keywords_from_text(corpus)
keywords = rake.get_ranked_phrases()[:TERMS_IN_SEARCH_QUERY]

rake_search_query = " ".join(keywords)
print(rake_search_query)

zone ” writing draws would pave world ran world ran whatever direction wealthiest country


In [47]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = " ".join(final_books_df['description'].astype(str))

vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english')
X = vectorizer.fit_transform([corpus])
sum_words = X.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
sorted_keywords = sorted(words_freq, key=lambda x: x[1], reverse=True)

sorted_keywords

[('book', 14),
 ('business', 13),
 ('humor', 12),
 ('life', 9),
 ('golf', 8),
 ('time', 8),
 ('authors', 8),
 ('work', 7),
 ('generational', 7),
 ('summary', 6),
 ('learn', 6),
 ('learning', 6),
 ('history', 6),
 ('supply', 6),
 ('chain', 6),
 ('simple', 5),
 ('jobs', 5),
 ('author', 5),
 ('mental', 5),
 ('new', 5),
 ('years', 5),
 ('lives', 5),
 ('zen', 5),
 ('global', 5),
 ('job', 4),
 ('popular', 4),
 ('information', 4),
 ('research', 4),
 ('complex', 4),
 ('human', 4),
 ('like', 4),
 ('game', 4),
 ('golfers', 4),
 ('feel', 4),
 ('clear', 4),
 ('including', 4),
 ('spend', 4),
 ('theory', 4),
 ('today', 4),
 ('times', 4),
 ('world', 4),
 ('stupid', 3),
 ('david', 3),
 ('bestselling', 3),
 ('american', 3),
 ('loss', 3),
 ('selflearning', 3),
 ('methods', 3),
 ('unlock', 3),
 ('topics', 3),
 ('approach', 3),
 ('help', 3),
 ('people', 3),
 ('pga', 3),
 ('buddhist', 3),
 ('lessons', 3),
 ('dr', 3),
 ('parent', 3),
 ('working', 3),
 ('important', 3),
 ('thought', 3),
 ('america', 3),
 ('p