## Dataset: scraping books

In [9]:
import random
import time
from pathlib import Path

import pandas as pd
import requests as r

### Import dataset

In [10]:
# Load the prepared Amazon purchases table from the local parquet file.
df = pd.read_parquet(
    path="data/ready/amazon_purchases.parquet",
)

### Data cleaning


In [11]:
# Keep only rows in the "Abis book" category whose product code is 10 or 13
# characters long (the lengths of ISBN-10 / ISBN-13). Only the length is
# checked here; the characters themselves are not validated as digits.
df = df.loc[
    df["ASIN/ISBN (Product Code)"].str.len().isin([10, 13])
    & df["Category"].eq("Abis book")
]

In [12]:
# Number of distinct product codes left after filtering (missing values excluded).
df["ASIN/ISBN (Product Code)"].nunique(dropna=True)

56455

In [13]:
# Distinct product codes, in order of first appearance, as a plain Python list.
isbn_list = df["ASIN/ISBN (Product Code)"].drop_duplicates().tolist()

### ISBN Search - Google Books API

In [14]:
def get_book_info_from_google(isbn, timeout=10):
    """Look up a single ISBN in the Google Books volumes API.

    Parameters
    ----------
    isbn : str
        Product code searched with the ``isbn:`` query prefix.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).
        Without a timeout a single stalled connection would block the whole
        scraping run.

    Returns
    -------
    dict or str
        On success, a dict with the keys ``isbn``, ``title``, ``authors``,
        ``categories`` and ``description`` taken from the first matching
        volume. ``authors`` and ``categories`` are lists; ``title`` and
        ``description`` are strings. Otherwise a message string:
        ``"Book not found"`` when the API returns no items, or
        ``"Error accessing Google Books API: ..."`` when the request fails,
        the body is not valid JSON, or the status code is not 200.
    """
    try:
        # Let requests build and encode the query string instead of
        # interpolating the ISBN into the URL by hand.
        response = r.get(
            "https://www.googleapis.com/books/v1/volumes",
            params={"q": f"isbn:{isbn}"},
            timeout=timeout,
        )
    except r.RequestException as exc:
        # Report network failures the same way as HTTP errors so that one bad
        # request does not abort a long batch of lookups.
        return f"Error accessing Google Books API: {exc}"

    if response.status_code != 200:
        return f"Error accessing Google Books API: {response.status_code}"

    try:
        data = response.json()
    except ValueError:
        return "Error accessing Google Books API: invalid JSON response"

    # A missing or empty "items" list both mean no match.
    items = data.get("items")
    if not items:
        return "Book not found"

    book_info = items[0].get("volumeInfo") or {}
    return {
        "isbn": isbn,
        "title": book_info.get("title", "Title not available"),
        "authors": book_info.get("authors", ["Authors not available"]),
        "categories": book_info.get("categories", ["Categories not available"]),
        # Descriptions are plain strings, so the placeholder must be a string
        # too; a list here would give the column mixed types.
        "description": book_info.get("description", "Description not available"),
    }

In [15]:
# Estimate how long the full ISBN lookup will take by timing a small random sample.

# subset of ISBNs for testing (up to 10; capped so the cell also works when
# fewer than 10 ISBNs are available)
test_isbn_list = random.sample(isbn_list, min(10, len(isbn_list)))

# perf_counter is a monotonic clock intended for measuring elapsed time,
# unlike time.time(), which follows the (adjustable) wall clock
start_time = time.perf_counter()

# run the code for the subset
book_info_list = [get_book_info_from_google(isbn) for isbn in test_isbn_list]

end_time = time.perf_counter()

# average time/request (max(..., 1) avoids dividing by zero on an empty list)
average_time_per_request = (end_time - start_time) / max(len(test_isbn_list), 1)
print(f"Tempo médio por requisição: {average_time_per_request:.2f} segundos")

# estimated total time for all ISBNs
estimated_total_time = average_time_per_request * len(isbn_list)
print(f"Tempo estimado total: {estimated_total_time / 60:.2f} minutos")

Tempo médio por requisição: 0.60 segundos
Tempo estimado total: 568.06 minutos


In [16]:
# Look up every unique ISBN, one request at a time, keeping the results in
# the same order as isbn_list.
book_info_list = list(map(get_book_info_from_google, isbn_list))

In [17]:
# keep only successful lookups: hits are dicts, while "not found" and API
# errors come back as plain strings
filtered_book_info_list = [book for book in book_info_list if isinstance(book, dict)]

# convert to df
df = pd.DataFrame(filtered_book_info_list)

# make sure the output folder exists, so the scraped results are not lost to
# a missing-directory error at the very last step
Path("data/raw").mkdir(parents=True, exist_ok=True)

# save parquet file
df.to_parquet("data/raw/book_info.parquet", engine="pyarrow", index=False)