In [1]:
!pip install googlesearch-python





[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import time
import random
import numpy as np
from googlesearch import search
import pandas as pd
from typing import Optional
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def fetch_first_google_link_with_backoff(query: str, max_retries: int = 3) -> Optional[str]:
    """
    Fetch first Google search result with exponential backoff and retry logic
    """
    for attempt in range(max_retries + 1):
        try:
            # Add random delay between 1-5 seconds
            base_delay = random.uniform(1, 5)
            time.sleep(base_delay)
            
            results = search(query, num_results=1, lang="en")
            url = list(results)[0]
            
            if url.startswith("https://books.google.com"):
                return url
            else:
                # Additional delay before backup search
                time.sleep(random.uniform(2, 4))
                results = search(query.replace("google", "amazon"), num_results=1, lang="en")
                return list(results)[0]
                
        except Exception as e:
            if "429" in str(e) or "Too Many Requests" in str(e):
                if attempt < max_retries:
                    # Exponential backoff: 2^attempt * base_time + random jitter
                    wait_time = (2 ** attempt) * 10 + random.uniform(5, 15)
                    logger.warning(f"Rate limited on attempt {attempt + 1}. Waiting {wait_time:.1f} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    logger.error(f"Max retries exceeded for query: {query}")
                    return None
            else:
                logger.error(f"Unexpected error for query '{query}': {e}")
                return None
    
    return None

def process_queries_in_batches(queries_df: pd.DataFrame, batch_size: int = 50, 
                              batch_delay: int = 300) -> pd.Series:
    """
    Process queries in batches with delays between batches
    """
    results = []
    total_queries = len(queries_df)
    
    for i in range(0, total_queries, batch_size):
        batch_end = min(i + batch_size, total_queries)
        batch_queries = queries_df.iloc[i:batch_end]
        
        logger.info(f"Processing batch {i//batch_size + 1}: queries {i+1}-{batch_end} of {total_queries}")
        
        # Process batch
        batch_results = batch_queries.apply(fetch_first_google_link_with_backoff)
        results.extend(batch_results.tolist())
        
        # Delay between batches (except for the last batch)
        if batch_end < total_queries:
            logger.info(f"Batch complete. Waiting {batch_delay} seconds before next batch...")
            time.sleep(batch_delay)
    
    return pd.Series(results, index=queries_df.index)

# Alternative approach: Save progress incrementally
def process_queries_with_checkpoints(queries_df: pd.DataFrame, 
                                   checkpoint_file: str = "search_progress.csv",
                                   start_index: int = 0) -> pd.Series:
    """
    Process queries with periodic checkpoints to resume if interrupted
    """
    results = [None] * len(queries_df)
    
    # Load existing progress if checkpoint exists
    try:
        checkpoint_df = pd.read_csv(checkpoint_file)
        for idx, row in checkpoint_df.iterrows():
            if row['result'] is not np.nan:
                results[row['query_index']] = row['result']
        logger.info(f"Loaded {len(checkpoint_df)} previous results from checkpoint")
    except FileNotFoundError:
        logger.info("No checkpoint file found, starting fresh")
    
    for i in range(start_index, len(queries_df)):
        if results[i] is not None:  # Skip if already processed
            continue
            
        query = queries_df.iloc[i]
        logger.info(f"Processing query {i+1}/{len(queries_df)}: {query}")
        
        result = fetch_first_google_link_with_backoff(query)
        results[i] = result
        
        # Save checkpoint every 10 queries
        if (i + 1) % 10 == 0:
            checkpoint_data = {
                'query_index': range(len(results)),
                'query': queries_df.tolist(),
                'result': results
            }
            pd.DataFrame(checkpoint_data).to_csv(checkpoint_file, index=False)
            logger.info(f"Checkpoint saved at query {i+1}")
    
    return pd.Series(results, index=queries_df.index)



In [3]:
df = pd.read_csv("books_cleaned.csv", encoding="utf-8")

In [4]:
queries_df = df["title_and_subtitle"] + " by " + df["authors"] + "- google books"

In [None]:
process_queries_with_checkpoints(queries_df)

INFO:__main__:Loaded 6397 previous results from checkpoint
INFO:__main__:Processing query 807/6397: City of God  by Augustine;Henry Scowcroft Bettenson;Gillian Rosemary Evans- google books
INFO:__main__:Processing query 1618/6397: The Complete Stories of Evelyn Waugh  by Evelyn Waugh- google books
INFO:__main__:Processing query 2355/6397: Hawthorne's Short Stories  by Nathaniel Hawthorne- google books
INFO:__main__:Processing query 5781/6397: Little Butterfly  by Hinako Takanaga;Sachiko Sato- google books
INFO:__main__:Processing query 6391/6397: Aspects of the Novel  by E. M. Forster- google books
INFO:__main__:Processing query 6392/6397: Mistaken Identity  by Nayantara Sahgal- google books
INFO:__main__:Processing query 6393/6397: Journey to the East  by Hermann Hesse- google books
INFO:__main__:Processing query 6394/6397: The Monk Who Sold His Ferrari: A Fable About Fulfilling Your Dreams & Reaching Your Destiny  by Robin Sharma- google books
INFO:__main__:Processing query 6395/6397

In [5]:
queries_df = pd.concat([queries_df,pd.read_csv("search_progress.csv")["result"]], axis=1)

In [6]:
queries_df.columns = ["title", "url"]

In [18]:
unfinished = queries_df[(queries_df.isnull().any(axis=1)) | ~((queries_df["url"].str.contains("amazon", na=False)) | (queries_df["url"].str.contains("google", na=False)))]

In [19]:
unfinished

Unnamed: 0,title,url
73,I Can Read with Me Eyes Shut! by Dr. Seuss- g...,/search?num=3
101,Tyranny of the Majority Funamental Fairness in...,/search?num=3
126,Mars and Venus Book of Days 365 Inspriations t...,/search?num=3
128,Today I Feel Silly & Other Moods That Make My ...,/search?num=3
314,DREAM & THE UNDERWOR by James Hillman- google...,/search?num=3
...,...,...
6392,Journey to the East by Hermann Hesse- google ...,
6393,The Monk Who Sold His Ferrari: A Fable About F...,
6394,I Am that Talks with Sri Nisargadatta Maharaj ...,
6395,The Berlin Phenomenology by Georg Wilhelm Fri...,


In [20]:
unfinished_urls

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [None]:
fetch_first_google_link_with_backoff(unfinished["title"].tolist()[0])

'/search?num=3'

In [21]:
unfinished["hhh"] = unfinished_urls

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unfinished["hhh"] = unfinished_urls


In [22]:
unfinished.head()

Unnamed: 0,title,url,hhh
73,I Can Read with Me Eyes Shut! by Dr. Seuss- g...,/search?num=3,
101,Tyranny of the Majority Funamental Fairness in...,/search?num=3,
126,Mars and Venus Book of Days 365 Inspriations t...,/search?num=3,
128,Today I Feel Silly & Other Moods That Make My ...,/search?num=3,
314,DREAM & THE UNDERWOR by James Hillman- google...,/search?num=3,


In [23]:
df1 = pd.read_csv("search_progress1.csv")

In [30]:
df1["url"][df1["url"].isna()]

806    NaN
Name: url, dtype: object

In [31]:
df.head()

Unnamed: 0,isbn13,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006163831,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0,The One Tree,9780006163831 Volume Two of Stephen Donaldson'...
3,9780006178736,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
4,9780006280897,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...


Unnamed: 0,title,url
0,Gilead by Marilynne Robinson- google books,https://books.google.com/books/about/Gilead.ht...
1,Spider's Web A Novel by Charles Osborne;Agatha...,https://books.google.com/books/about/Spider_s_...
2,The One Tree by Stephen R. Donaldson- google ...,https://books.google.com/books/about/The_One_T...
3,Rage of angels by Sidney Sheldon- google books,https://books.google.com/books/about/Rage_of_A...
4,The Four Loves by Clive Staples Lewis- google...,https://books.google.com/books/about/The_Four_...


In [33]:
df1.head()

Unnamed: 0,title,url
0,Gilead by Marilynne Robinson- google books,https://books.google.com/books/about/Gilead.ht...
1,Spider's Web A Novel by Charles Osborne;Agatha...,https://books.google.com/books/about/Spider_s_...
2,The One Tree by Stephen R. Donaldson- google ...,https://books.google.com/books/about/The_One_T...
3,Rage of angels by Sidney Sheldon- google books,https://books.google.com/books/about/Rage_of_A...
4,The Four Loves by Clive Staples Lewis- google...,https://books.google.com/books/about/The_Four_...


In [35]:
queries_df[(queries_df.isnull().any(axis=1)) | ~((queries_df["url"].str.contains("amazon", na=False)) | (queries_df["url"].str.contains("google", na=False)))]

Unnamed: 0,title,url
73,I Can Read with Me Eyes Shut! by Dr. Seuss- g...,/search?num=3
101,Tyranny of the Majority Funamental Fairness in...,/search?num=3
126,Mars and Venus Book of Days 365 Inspriations t...,/search?num=3
128,Today I Feel Silly & Other Moods That Make My ...,/search?num=3
314,DREAM & THE UNDERWOR by James Hillman- google...,/search?num=3
...,...,...
6392,Journey to the East by Hermann Hesse- google ...,
6393,The Monk Who Sold His Ferrari: A Fable About F...,
6394,I Am that Talks with Sri Nisargadatta Maharaj ...,
6395,The Berlin Phenomenology by Georg Wilhelm Fri...,


In [None]:
# i'll drop dataframes without a good url

In [39]:
to_drop = df1[(df1.isnull().any(axis=1)) | ~((df1["url"].str.contains("amazon", na=False)) | (df1["url"].str.contains("google", na=False)))].index
to_drop

Index([ 768,  806, 1170, 1269, 1311, 1343, 2311, 2389, 2536, 3270, 3572, 4228,
       4941, 5292, 5293, 6085],
      dtype='int64')

In [40]:
df1 = df1.drop(index=to_drop)

In [41]:
df1

Unnamed: 0,title,url
0,Gilead by Marilynne Robinson- google books,https://books.google.com/books/about/Gilead.ht...
1,Spider's Web A Novel by Charles Osborne;Agatha...,https://books.google.com/books/about/Spider_s_...
2,The One Tree by Stephen R. Donaldson- google ...,https://books.google.com/books/about/The_One_T...
3,Rage of angels by Sidney Sheldon- google books,https://books.google.com/books/about/Rage_of_A...
4,The Four Loves by Clive Staples Lewis- google...,https://books.google.com/books/about/The_Four_...
...,...,...
6392,Journey to the East by Hermann Hesse- google ...,https://books.google.com/books/about/The_Journ...
6393,The Monk Who Sold His Ferrari: A Fable About F...,https://books.google.com/books/about/The_Monk_...
6394,I Am that Talks with Sri Nisargadatta Maharaj ...,https://books.google.com/books/about/I_Am_that...
6395,The Berlin Phenomenology by Georg Wilhelm Fri...,https://books.google.com/books/about/The_Berli...


In [46]:
with open("to_drop.txt","w") as f:
    f.write("\n".join(to_drop.astype(str).tolist()))

In [48]:
df1.to_csv("books_with_urls.csv", index=False)