In [1]:
import requests
import pandas as pd

import time

import os
from dotenv import load_dotenv

load_dotenv()

google_key = os.getenv("GOOGLE_BOOKS")
nyt_key = os.getenv("NYT")

In [2]:
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

In [3]:
df_cached = pd.read_csv('df_cached.csv')
df_cached['primary_isbn13'] = df_cached['primary_isbn13'].apply(str)

In [4]:
df_cached

Unnamed: 0.1,age_group,amazon_product_url,article_chapter_link,asterisk,author,book_image,book_image_height,book_image_width,book_review_link,book_uri,...,publisher,rank,rank_last_week,sunday_review_link,title,updated_date,weeks_on_list,category,published_date,Unnamed: 0
0,,https://www.amazon.com/dp/B0DTYKCJC9?tag=thene...,,0,Jake Tapper and Alex Thompson,https://static01.nyt.com/bestsellers/images/97...,500,329,,nyt://book/786e31c1-bc83-50f1-bd8d-995926458641,...,Penguin Press,1,0,,ORIGINAL SIN,2025-06-02T23:13:46.798Z,1,Political Science,2025-06-08,
1,,https://www.amazon.com/dp/1668023369?tag=thene...,,0,Dawn Staley,https://static01.nyt.com/bestsellers/images/97...,500,329,,nyt://book/93e1c5cf-1a3f-5ede-b081-ca9bbd7d30a2,...,Atria/Black Privilege,2,0,,UNCOMMON FAVOR,2025-06-02T23:13:46.873Z,1,Biography & Autobiography,2025-06-08,
2,,https://www.amazon.com/dp/0525561722?tag=thene...,,0,Ron Chernow,https://static01.nyt.com/bestsellers/images/97...,500,329,,nyt://book/b92f68c9-76ad-5510-8520-c5864d663b19,...,Penguin Press,3,1,,MARK TWAIN,2025-06-02T23:13:46.947Z,2,Biography & Autobiography,2025-06-08,
3,,https://www.amazon.com/dp/1668053373?tag=thene...,,0,Patrick McGee,https://static01.nyt.com/bestsellers/images/97...,500,331,,nyt://book/82ca74aa-bab8-5b05-9a1f-f8f7e383e1e2,...,Scribner,4,0,,APPLE IN CHINA,2025-06-02T23:13:47.02Z,1,Business & Economics,2025-06-08,
4,,https://www.amazon.com/dp/0593655036?tag=thene...,,0,Jonathan Haidt,https://static01.nyt.com/bestsellers/images/97...,500,329,,nyt://book/7557cf43-7888-5c15-8206-d3541cccd89b,...,Penguin Press,5,2,,THE ANXIOUS GENERATION,2025-06-02T23:13:47.093Z,61,Psychology,2025-06-08,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,,http://www.amazon.com/Fifty-Shades-Trilogy-Bun...,,0,E. L. James,https://static01.nyt.com/bestsellers/images/97...,197,128,,nyt://book/dd89aefc-ae79-5850-ac96-f940ba2cc4fb,...,Vintage,16,0,,FIFTY SHADES TRILOGY,2025-05-14T21:54:40.666Z,0,,2012-08-12,
3036,,http://www.amazon.com/The-Marriage-Bargain-Bil...,,0,Jennifer Probst,https://static01.nyt.com/bestsellers/images/A0...,203,128,,nyt://book/cac1eae6-106b-5a28-88cf-73cdf4dd4471,...,Entangled Publishing,17,0,,THE MARRIAGE BARGAIN,2025-05-14T06:55:56.627Z,0,,2012-08-12,
3037,,http://www.amazon.com/The-Litigators-Novel-Joh...,,0,John Grisham,https://static01.nyt.com/bestsellers/images/97...,229,128,,nyt://book/7c934be9-85db-5ab2-b6a8-89c07600fa24,...,Random House Publishing,18,0,,THE LITIGATORS,2025-05-14T06:55:56.769Z,0,,2012-08-12,
3038,,http://www.amazon.com/Backfire-FBI-Thriller-Ca...,,0,Catherine Coulter,https://static01.nyt.com/bestsellers/images/97...,495,327,,nyt://book/c3da01f5-1a39-5460-a8b5-ae4e2e5015bf,...,Penguin Group,19,0,,BACKFIRE,2025-05-14T06:55:56.698Z,0,,2012-08-12,


<!-- This is kind of interesting, but it doesn't show change over time. I've decided to use the NYT best sellers list, specifically non-fiction becuase I'm able to crossreference more specific categories from the Google Books API. The question becomes:

**How have American tastes in non-fiction changed over time?** -->

In [5]:
# Able to calculate date as loop through calls. Neet to remember to always call strftime.
from datetime import datetime
from datetime import timedelta, date
import dateutil.relativedelta


date = datetime.now()
print(date.strftime("%Y-%m-%d"))

while (date.weekday() != 6):
    date = date - timedelta(1)

print(date)
pub_date = date.strftime("%Y-%m-%d")
print(f"pubdate: {pub_date}")
date.weekday()


2025-06-15
2025-06-15 15:54:33.322648
pubdate: 2025-06-15


6

In [6]:
# Be sure to test that this works before running the full pull! Google Books limits api at 1000 per day.


def get_category(isbn):
    if (len(isbn)==13):
        # First, check cache
        try:
            category = df_cached[df_cached['primary_isbn13'] == isbn]['category'].iloc[0]
            print('isbn from cache')
            return category
        
        # If not in cache, call Google Books API
        except:

            response = requests.get(f'https://books.googleapis.com/books/v1/volumes?q=isbn%3A{isbn}&key={google_key}')
            data = response.json()

            if(response.status_code != 200):
                print(response.status_code)
                print(response.headers)

            try:
                category = data['items'][0]['volumeInfo']['categories'][0]
                print('isbn from api')
                return category
            except:
                category = data['items'][0]['volumeInfo']['categories']
                print('isbn from api')
                return category
            finally:
                return None

# uncomment to test   
get_category('9798217060672')

isbn from cache


'Political Science'

In [7]:
# Get NYT

dfs = []

weeks = 520
time_interval = 30

# start_date = '03/28/2025'
# start_date = datetime.strptime(start_date, '%m%d,%y')

# pub_date = start_date

for week in range(1,weeks+1):
    print(pub_date)
     # First, check cache
    if (df_cached[df_cached['published_date']==pub_date].shape[0] > 1):
            df = df_cached[df_cached['published_date']==pub_date]
            print('nyt list from cache')
            
    
    # If not in cache, call Google Books API
    else:
        response = requests.get(f'https://api.nytimes.com/svc/books/v3/lists/overview.json?api-key={nyt_key}&published-date={pub_date}')
        print('nyt list from api')

        if(response.status_code != 200):
            print(response.status_code)
            print(response.headers)
        data = response.json()['results']

        nf_combined=data['lists'][1]['books']

        df = pd.DataFrame(nf_combined)


        # Add categories by calling Google Books API
        df['category'] = df['primary_isbn13'].apply(get_category)
        df['published_date'] = data['published_date']

        # NYT API call limit of 5 per minute, 400 per day.
        time.sleep(12)

    dfs.append(df)

    date = (date - timedelta(time_interval))
    while (date.weekday() != 6):
        date = date - timedelta(1)
    pub_date = date.strftime("%Y-%m-%d")



2025-06-15
nyt list from cache
2025-05-11
nyt list from cache
2025-04-06
nyt list from cache
2025-03-02
nyt list from cache
2025-01-26
nyt list from cache
2024-12-22
nyt list from cache
2024-11-17
nyt list from cache
2024-10-13
nyt list from cache
2024-09-08
nyt list from cache
2024-08-04
nyt list from cache
2024-06-30
nyt list from cache
2024-05-26
nyt list from cache
2024-04-21
nyt list from cache
2024-03-17
nyt list from cache
2024-02-11
nyt list from cache
2024-01-07
nyt list from cache
2023-12-03
nyt list from cache
2023-10-29
nyt list from cache
2023-09-24
nyt list from cache
2023-08-20
nyt list from cache
2023-07-16
nyt list from cache
2023-06-11
nyt list from cache
2023-05-07
nyt list from cache
2023-04-02
nyt list from cache
2023-02-26
nyt list from cache
2023-01-22
nyt list from cache
2022-12-18
nyt list from cache
2022-11-13
nyt list from cache
2022-10-09
nyt list from cache
2022-09-04
nyt list from cache
2022-07-31
nyt list from cache
2022-06-26
nyt list from cache
2022-05-

KeyError: 'results'

In [8]:
df = pd.concat(dfs)
df.dropna(subset='category', inplace=True)
df['category']=df['category'].str.lower()

df['category']

# df.to_csv('year_sample.csv', index=False)

915             political science
916                    psychology
917                       medical
918     biography & autobiography
919          business & economics
                  ...            
2956                      fiction
2960                      fiction
2976                      fiction
2990                      fiction
2999                      fiction
Name: category, Length: 1837, dtype: object

In [None]:
# def cut_strings(x):
#     if (type(x) == str):
#         new_string = x.replace("['",'')
#         new_string = new_string.replace("']",'')
#         return new_string

# df['category'] = df['category'].apply(cut_strings)
# df_cached['category'] = df_cached['category'].apply(cut_strings)

In [9]:
# Combine current df with cached
df_cached = pd.concat([df_cached,df])

try:
    df_cached.drop(axis='columns', columns=['buy_links','isbns'], inplace=True)
except:
    pass


df_cached.drop_duplicates(subset=['published_date','primary_isbn13'], inplace=True)
df_cached.to_csv('df_cached.csv', index=False)


In [None]:
df['title'].value_counts()


title
THE BODY KEEPS THE SCORE      52
EDUCATED                      28
THE BOYS IN THE BOAT          26
KILLERS OF THE FLOWER MOON    26
HILLBILLY ELEGY               24
                              ..
THE ROUND HOUSE                1
THE PERFECT HOPE               1
THE SINS OF THE MOTHER         1
A WANTED MAN                   1
EMPIRE OF AI                   1
Name: count, Length: 890, dtype: int64

In [14]:
top10_cat = df['category'].value_counts().reset_index().head(10)


In [15]:
fig = px.bar(top10_cat, x = 'category', y = 'count')
fig.show()

In [16]:
fig = px.scatter(df, x = 'published_date', y = 'category', color='category',opacity=.25, hover_data='title')
fig.show()

In [19]:
top_cats = top10_cat['category'].to_list()

In [20]:
top_cats

['biography & autobiography',
 'history',
 'political science',
 'social science',
 'true crime',
 'science',
 'business & economics',
 'medical',
 'nature',
 'psychology']

In [23]:
df.columns

Index(['age_group', 'amazon_product_url', 'article_chapter_link', 'asterisk',
       'author', 'book_image', 'book_image_height', 'book_image_width',
       'book_review_link', 'book_uri', 'contributor', 'contributor_note',
       'created_date', 'dagger', 'description', 'first_chapter_link', 'price',
       'primary_isbn10', 'primary_isbn13', 'publisher', 'rank',
       'rank_last_week', 'sunday_review_link', 'title', 'updated_date',
       'weeks_on_list', 'category', 'published_date', 'Unnamed: 0'],
      dtype='object')

In [25]:
clean_df = df[['title','published_date','category','rank']]

In [26]:
clean_df.to_csv('clean_df.csv',index=False)