In [9]:
import os 
import requests
import pandas as pd
import numpy as np
import thefuzz as fuzz
import re
import string

from utils import check_to_run_initial_data_load, pull_from_google_books, create_library
from utils import titles_l # Input data
from utils import authors_l # Input data

MATCH_SCORE = 70
LAST_N_BOOKS = 10
TERMS_IN_SEARCH_QUERY = 7

final_books_df = pd.read_csv('library.csv')
final_books_df = final_books_df.tail(LAST_N_BOOKS)

final_books_df['description'] = final_books_df['description'].astype(str).apply(
    lambda x: re.sub(f"[{re.escape(string.punctuation)}]", "", x)
)

final_books_df.head()


Unnamed: 0.1,Unnamed: 0,title,subtitle,authors,pulishedDate,pageCount,categories,description,full_title
3,0,SUMMARY - Bullshit Jobs: A Theory By David Gra...,,['Shortcut Edition'],2021-06-17,24,['Business & Economics'],Our summary is short simple and pragmatic It ...,SUMMARY - Bullshit Jobs: A Theory By David Gra...
4,0,The Science of Self-Learning,"How to Teach Yourself Anything, Learn More in ...",['Peter Hollins'],2019-10-22,202,['Education'],How to learn effectively when you have to be b...,The Science of Self-Learning How to Teach Your...
5,0,Zen Golf,Mastering the Mental Game,['Joseph Parent'],2002-06-18,226,['Sports & Recreation'],A highly original and groundbreaking book from...,Zen Golf Mastering the Mental Game
6,0,Mighty Numbers,,['Marvel Press Book Group'],2016-05-03,0,['Juvenile Fiction'],Young children will learn to count from one to...,Mighty Numbers
7,0,Happier Hour,"How to Beat Distraction, Expand Your Time, and...",['Cassie Holmes'],2023-06-20,320,['Biography & Autobiography'],We live in a culture where most of us suffer f...,"Happier Hour How to Beat Distraction, Expand Y..."


In [None]:
# %pip install -r requirements.txt

Collecting thefuzz (from -r requirements.txt (line 4))
  Using cached thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pyarrow (from -r requirements.txt (line 5))
  Using cached pyarrow-19.0.1-cp313-cp313-win_amd64.whl.metadata (3.4 kB)
Collecting scikit-learn (from -r requirements.txt (line 7))
  Using cached scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting rake-nltk (from -r requirements.txt (line 8))
  Using cached rake_nltk-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz->-r requirements.txt (line 4))
  Downloading rapidfuzz-3.13.0-cp313-cp313-win_amd64.whl.metadata (12 kB)
Collecting scipy>=1.6.0 (from scikit-learn->-r requirements.txt (line 7))
  Downloading scipy-1.15.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn->-r requirements.txt (line 7))
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->-r r


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## TFIDF as a method to generate Search Query - GOING WITH THIS APPROACH

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(final_books_df['description'])

# Get feature names and sum TF-IDF scores across all documents
feature_names = tfidf.get_feature_names_out()
tfidf_scores = X.sum(axis=0).A1  # Flatten the matrix to 1D array

# Get indices of top 7 keywords
top_indices = tfidf_scores.argsort()[-TERMS_IN_SEARCH_QUERY:][::-1]
top_keywords = [feature_names[i] for i in top_indices]

print("Top keywords in the whole dataset:", top_keywords)
tfidf_search_query = " ".join(top_keywords)
print(tfidf_search_query)


Top keywords in the whole dataset: ['business', 'humor', 'time', 'book', 'golf', 'jobs', 'life']
business humor time book golf jobs life


##  Rake

In [10]:
from rake_nltk import Rake

corpus = " ".join(final_books_df['description'].astype(str))

rake = Rake(max_length=2)
rake.extract_keywords_from_text(corpus)
keywords = rake.get_ranked_phrases()[:TERMS_IN_SEARCH_QUERY]

rake_search_query = " ".join(keywords)
print(rake_search_query)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\matt/nltk_data'
    - 'c:\\Users\\matt\\AppData\\Local\\Programs\\Python\\Python313\\nltk_data'
    - 'c:\\Users\\matt\\AppData\\Local\\Programs\\Python\\Python313\\share\\nltk_data'
    - 'c:\\Users\\matt\\AppData\\Local\\Programs\\Python\\Python313\\lib\\nltk_data'
    - 'C:\\Users\\matt\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


## Count Vectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = " ".join(final_books_df['description'].astype(str))

vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform([corpus])
sum_words = X.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
sorted_keywords = sorted(words_freq, key=lambda x: x[1], reverse=True)

top_keywords = [word for word, freq in sorted_keywords[:TERMS_IN_SEARCH_QUERY]]
count_search_query = " ".join(top_keywords)

top_keywords

['book', 'business', 'humor', 'life', 'golf', 'time', 'authors']

## SpaCy - For the life of me I cannot get this to install

## KeyBERT (BERT w/ cosine similarity)

In [28]:
# %pip install keybert
from keybert import KeyBERT

corpus = " ".join(final_books_df['description'].astype(str))
kw_model = KeyBERT()
bert_keywords = kw_model.extract_keywords(corpus,
                                          keyphrase_ngram_range=(1, 2),
                                          top_n=TERMS_IN_SEARCH_QUERY)

bert_keywords_ = [word for word, freq in bert_keywords[:TERMS_IN_SEARCH_QUERY]]
bert_search_query = " ".join(bert_keywords_)
bert_search_query

'work smarter dumb jobs stupid jobs jobs useless jobs prove jobs consequences consequences jobs'

## TextRank

In [40]:
# %pip install summa
from summa import keywords

corpus = " ".join(final_books_df['description'].astype(str))
summa_extracted_keywords = keywords.keywords(corpus, split=True)
summa_extracted_keywords[:TERMS_IN_SEARCH_QUERY]

['generational',
 'generation',
 'author',
 'authors',
 'work',
 'worked',
 'working']