In [1]:
import requests
import pandas as pd

import time

import os
from dotenv import load_dotenv

load_dotenv()

google_key = os.getenv("GOOGLE_BOOKS")
nyt_key = os.getenv("NYT")

In [2]:
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
pio.templates.default = "plotly_white"

In [3]:
df_cached = pd.read_csv('df_cached.csv')
df_cached['primary_isbn13'] = df_cached['primary_isbn13'].apply(str)

In [4]:
df_cached

Unnamed: 0.1,age_group,amazon_product_url,article_chapter_link,asterisk,author,book_image,book_image_height,book_image_width,book_review_link,book_uri,...,rank,rank_last_week,sunday_review_link,title,updated_date,weeks_on_list,category,published_date,Unnamed: 0,date_datetime
0,,https://www.amazon.com/dp/B0DTYKCJC9?tag=thene...,,0,Jake Tapper and Alex Thompson,https://static01.nyt.com/bestsellers/images/97...,500,329,,nyt://book/786e31c1-bc83-50f1-bd8d-995926458641,...,1,0,,ORIGINAL SIN,2025-06-02T23:13:46.798Z,1,Political Science,2025-06-08,,
1,,https://www.amazon.com/dp/1668023369?tag=thene...,,0,Dawn Staley,https://static01.nyt.com/bestsellers/images/97...,500,329,,nyt://book/93e1c5cf-1a3f-5ede-b081-ca9bbd7d30a2,...,2,0,,UNCOMMON FAVOR,2025-06-02T23:13:46.873Z,1,Biography & Autobiography,2025-06-08,,
2,,https://www.amazon.com/dp/0525561722?tag=thene...,,0,Ron Chernow,https://static01.nyt.com/bestsellers/images/97...,500,329,,nyt://book/b92f68c9-76ad-5510-8520-c5864d663b19,...,3,1,,MARK TWAIN,2025-06-02T23:13:46.947Z,2,Biography & Autobiography,2025-06-08,,
3,,https://www.amazon.com/dp/1668053373?tag=thene...,,0,Patrick McGee,https://static01.nyt.com/bestsellers/images/97...,500,331,,nyt://book/82ca74aa-bab8-5b05-9a1f-f8f7e383e1e2,...,4,0,,APPLE IN CHINA,2025-06-02T23:13:47.02Z,1,Business & Economics,2025-06-08,,
4,,https://www.amazon.com/dp/0593655036?tag=thene...,,0,Jonathan Haidt,https://static01.nyt.com/bestsellers/images/97...,500,329,,nyt://book/7557cf43-7888-5c15-8206-d3541cccd89b,...,5,2,,THE ANXIOUS GENERATION,2025-06-02T23:13:47.093Z,61,Psychology,2025-06-08,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4191,,https://www.amazon.com/We-Were-Eight-Years-Pow...,,0,Ta-Nehisi Coates,https://static01.nyt.com/bestsellers/images/97...,495,326,https://www.nytimes.com/2017/11/03/books/revie...,nyt://book/a9a88c8e-563f-5c2a-b16f-b0c3bb9a0c3e,...,11,10,,WE WERE EIGHT YEARS IN POWER,2025-05-14T09:34:13.54Z,6,biography & autobiography,2017-11-26,,2017-04-30 13:01:34.672838
4192,,https://www.amazon.com/What-Happened-Hillary-R...,,0,Hillary Clinton,https://static01.nyt.com/bestsellers/images/97...,495,328,,nyt://book/daadee57-173f-5ce7-926e-1e46bec21171,...,12,7,,WHAT HAPPENED,2025-05-14T09:34:13.468Z,9,biography & autobiography,2017-11-26,,2017-04-30 13:01:34.672838
4193,,https://www.amazon.com/Endurance-Year-Space-Li...,,0,Scott Kelly,https://static01.nyt.com/bestsellers/images/97...,482,330,https://www.nytimes.com/2017/12/07/books/revie...,nyt://book/c7e1f813-6d8d-5dd1-bcc6-4975f4f1f6f9,...,13,14,,ENDURANCE,2025-05-14T09:34:18.976Z,4,biography & autobiography,2017-11-26,,2017-04-30 13:01:34.672838
4194,,https://www.amazon.com/You-Cant-Spell-America-...,,0,Alec Baldwin and Kurt Andersen,https://static01.nyt.com/bestsellers/images/97...,495,325,,nyt://book/5c041301-c2fe-52ca-9993-999a49448282,...,14,0,,YOU CAN'T SPELL AMERICA WITHOUT ME,2025-05-14T09:29:17.821Z,1,humor,2017-11-26,,2017-04-30 13:01:34.672838


<!-- This is kind of interesting, but it doesn't show change over time. I've decided to use the NYT best sellers list, specifically non-fiction becuase I'm able to crossreference more specific categories from the Google Books API. The question becomes:

**How have American tastes in non-fiction changed over time?** -->

In [5]:
# Able to calculate date as loop through calls. Neet to remember to always call strftime.
from datetime import datetime
from datetime import timedelta, date
import dateutil.relativedelta


date = datetime.now()
print(date.strftime("%Y-%m-%d"))

while (date.weekday() != 6):
    date = date - timedelta(1)

print(date)
pub_date = date.strftime("%Y-%m-%d")
print(f"pubdate: {pub_date}")
date.weekday()


2025-06-26
2025-06-22 17:34:46.978746
pubdate: 2025-06-22


6

In [6]:
# Be sure to test that this works before running the full pull! Google Books limits api at 1000 per day.


def get_category(isbn):
    if (len(isbn)==13):
        # First, check cache
        try:
            category = df_cached[df_cached['primary_isbn13'] == isbn]['category'].iloc[0]
            print('isbn from cache')
            return category
        
        # If not in cache, call Google Books API
        except:

            response = requests.get(f'https://books.googleapis.com/books/v1/volumes?q=isbn%3A{isbn}&key={google_key}')
            data = response.json()

            if(response.status_code != 200):
                print(response.status_code)
                print(response.headers)

            try:
                category = data['items'][0]['volumeInfo']['categories'][0]
                print('isbn from api')
                return category
            except:
                category = data['items'][0]['volumeInfo']['categories']
                print('isbn from api')
                return category


# uncomment to test   
get_category('9798217060672')

isbn from cache


'Political Science'

In [7]:
# Get NYT

# This now returns an error, as the interval now runs longer than there is data to call. You might figure out a way to bypass this with except while still telling you what the error is.

dfs = []

weeks = 120
time_interval = 30

# start_date = '03/28/2025'
# start_date = datetime.strptime(start_date, '%m%d,%y')

# pub_date = start_date

for week in range(1,weeks+1):
    print(pub_date)
     # First, check cache
    if (df_cached[df_cached['published_date']==pub_date].shape[0] > 1):
            df = df_cached[df_cached['published_date']==pub_date]
            print('nyt list from cache')
            
    
    # If not in cache, call Google Books API
    else:
        try:
            response = requests.get(f'https://api.nytimes.com/svc/books/v3/lists/overview.json?api-key={nyt_key}&published-date={pub_date}')
            print('nyt list from api')

            if(response.status_code != 200):
                print(response.status_code)
                print(response.headers)
            data = response.json()['results']

            nf_combined=data['lists'][1]['books']

            df = pd.DataFrame(nf_combined)


            # Add categories by calling Google Books API
            df['category'] = df['primary_isbn13'].apply(get_category)
            df['published_date'] = data['published_date']

            # NYT API call limit of 5 per minute, 400 per day.
            time.sleep(12)
        except:
            if(response.status_code != 200):
                print(response.status_code)
                print(response.headers)
    
    df['date_datetime']=date
    dfs.append(df)


    date = (date - timedelta(time_interval))
    while (date.weekday() != 6):
        date = date - timedelta(1)
    pub_date = date.strftime("%Y-%m-%d")



2025-06-22
nyt list from cache
2025-05-18
nyt list from cache
2025-04-13
nyt list from cache
2025-03-09
nyt list from cache
2025-02-02
nyt list from cache
2024-12-29
nyt list from cache
2024-11-24
nyt list from cache
2024-10-20
nyt list from cache
2024-09-15
nyt list from cache
2024-08-11
nyt list from cache
2024-07-07
nyt list from cache
2024-06-02
nyt list from cache
2024-04-28
nyt list from cache
2024-03-24
nyt list from cache
2024-02-18
nyt list from cache
2024-01-14
nyt list from cache
2023-12-10
nyt list from cache
2023-11-05


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
isbn from cache
isbn from cache
isbn from cache
2023-10-01
nyt list from api
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
2023-08-27
nyt list from cache
2023-07-23
nyt list from cache
2023-06-18
nyt list from cache
2023-05-14
nyt list from cache
2023-04-09
nyt list from cache
2023-03-05
nyt list from cache
2023-01-29
nyt list from cache
2022-12-25
nyt list from cache
2022-11-20
nyt list from cache
2022-10-16
nyt list from cache
2022-09-11
nyt list from cache
2022-08-07
nyt list from cache
2022-07-03
nyt list from cache
2022-05-29
nyt list from cache
2022-04-24


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
isbn from cache
isbn from cache
isbn from cache
2022-03-20
nyt list from cache
2022-02-13
nyt list from cache
2022-01-09


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
2021-12-05
nyt list from cache
2021-10-31
nyt list from cache
2021-09-26
nyt list from cache
2021-08-22
nyt list from cache
2021-07-18
nyt list from cache
2021-06-13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from api
isbn from api
isbn from cache
2021-05-09
nyt list from api
isbn from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from cache
2021-04-04
nyt list from cache
2021-02-28
nyt list from cache
2021-01-24
nyt list from cache
2020-12-20
nyt list from cache
2020-11-15
nyt list from cache
2020-10-11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from api
2020-09-06
nyt list from cache
2020-08-02


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
2020-06-28
nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
2020-05-24
nyt list from api
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from api
isbn from api
isbn from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
2020-04-19
nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from api
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from cache

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
2018-07-29
nyt list from cache
2018-06-24


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
isbn from api
isbn from cache
isbn from api
isbn from api
2018-05-20
nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
2018-04-15
nyt list from cache
2018-03-11
nyt list from cache
2018-02-04
nyt list from cache
2017-12-31
nyt list from cache
2017-11-26
nyt list from cache
2017-10-22


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from api
isbn from cache
2017-09-17
nyt list from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
2017-08-13
nyt list from api
429
{'Date': 'Thu, 26 Jun 2025 21:38:23 GMT', 'Content-Type': 'application/json', 'Content-Length': '187', 'Connection': 'keep-alive', 'Access-Control-Allow-Headers': 'Accept, Content-Type, Origin, X-Forwarded-For, X-Prototype-Version, X-Requested-With', 'Access-Control-Allow-Methods': 'GET, OPTIONS', 'Access-Control-Allow-Origin': '*', 'Access-Control-Expose-Headers': 'Content-Length, X-JSON'}
429
{'Date': 'Thu, 26 Jun 2025 21:38:23 GMT', 'Content-Type': 'application/json', 'Content-Length': '187', 'Connection': 'keep-alive', 'Access-Control-Allow-Headers': 'Accept, Content-Ty

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_datetime']=date


nyt list from api
429
{'Date': 'Thu, 26 Jun 2025 21:38:25 GMT', 'Content-Type': 'application/json', 'Content-Length': '187', 'Connection': 'keep-alive', 'Access-Control-Allow-Headers': 'Accept, Content-Type, Origin, X-Forwarded-For, X-Prototype-Version, X-Requested-With', 'Access-Control-Allow-Methods': 'GET, OPTIONS', 'Access-Control-Allow-Origin': '*', 'Access-Control-Expose-Headers': 'Content-Length, X-JSON'}
429
{'Date': 'Thu, 26 Jun 2025 21:38:25 GMT', 'Content-Type': 'application/json', 'Content-Length': '187', 'Connection': 'keep-alive', 'Access-Control-Allow-Headers': 'Accept, Content-Type, Origin, X-Forwarded-For, X-Prototype-Version, X-Requested-With', 'Access-Control-Allow-Methods': 'GET, OPTIONS', 'Access-Control-Allow-Origin': '*', 'Access-Control-Expose-Headers': 'Content-Length, X-JSON'}
2016-01-31
nyt list from cache
2015-12-27
nyt list from cache
2015-11-22
nyt list from cache
2015-10-18
nyt list from cache
2015-09-13
nyt list from cache
2015-08-09
nyt list from cache


In [8]:
df = pd.concat(dfs)
df.dropna(subset='category', inplace=True)
df['category']=df['category'].str.lower()

df['category']

# df.to_csv('year_sample.csv', index=False)

3040         business & economics
3041            political science
3042    biography & autobiography
3043                         pets
3044                      history
                  ...            
3898                    education
3899    biography & autobiography
3900    biography & autobiography
3901    biography & autobiography
3902            political science
Name: category, Length: 1681, dtype: object

In [9]:
df.tail()

Unnamed: 0.1,age_group,amazon_product_url,article_chapter_link,asterisk,author,book_image,book_image_height,book_image_width,book_review_link,book_uri,...,sunday_review_link,title,updated_date,weeks_on_list,category,published_date,Unnamed: 0,date_datetime,isbns,buy_links
3898,,http://www.amazon.com/One-Nation-Under-Taught-...,,0,Vince M. Bertram,https://static01.nyt.com/bestsellers/images/97...,495,323,,nyt://book/744a666c-c8af-5fc6-a95e-ba6fd27356cf,...,,ONE NATION UNDER TAUGHT,2025-05-14T21:43:26.346Z,1,education,2014-12-07,,2014-01-26 17:34:46.978746,,
3899,,http://www.amazon.com/The-Innovators-Hackers-G...,,0,Walter Isaacson,https://static01.nyt.com/bestsellers/images/97...,193,128,https://www.nytimes.com/2014/10/09/arts/walter...,nyt://book/74ec3a0a-e7dd-51ea-a372-e0f99f575220,...,,THE INNOVATORS,2025-05-14T21:54:22.175Z,0,biography & autobiography,2014-12-07,,2014-01-26 17:34:46.978746,,
3900,,http://www.amazon.com/So-Anyway-John-Cleese-eb...,,0,John Cleese,https://static01.nyt.com/bestsellers/images/97...,192,128,,nyt://book/2ac79002-dd66-5355-aa33-9fa687c751f5,...,,SO ANYWAY . . .,2025-05-14T21:54:30.3Z,0,biography & autobiography,2014-12-07,,2014-01-26 17:34:46.978746,,
3901,,http://www.amazon.com/As-You-Wish-Inconceivabl...,,0,Cary Elwes with Joe Layden,https://static01.nyt.com/bestsellers/images/97...,194,128,,nyt://book/d6048d24-b0fe-5c00-867c-54466d1a8521,...,,AS YOU WISH,2025-05-14T21:54:32.7Z,0,biography & autobiography,2014-12-07,,2014-01-26 17:34:46.978746,,
3902,,http://www.amazon.com/Stonewalled-Obstruction-...,,0,Sharyl Attkisson,https://static01.nyt.com/bestsellers/images/97...,183,128,,nyt://book/86318b39-c538-519c-9ff9-4678f380d848,...,,STONEWALLED,2025-05-14T21:43:22.166Z,0,political science,2014-12-07,,2014-01-26 17:34:46.978746,,


In [10]:
# def cut_strings(x):
#     if (type(x) == str):
#         new_string = x.replace("['",'')
#         new_string = new_string.replace("']",'')
#         return new_string

# df['category'] = df['category'].apply(cut_strings)
# df_cached['category'] = df_cached['category'].apply(cut_strings)

In [11]:
# Combine current df with cached
df_cached = pd.concat([df_cached,df])

try:
    df_cached.drop(axis='columns', columns=['buy_links','isbns'], inplace=True)
except:
    pass


df_cached.drop_duplicates(subset=['published_date','primary_isbn13'], inplace=True)
df_cached.to_csv('df_cached.csv', index=False)


In [12]:
best_books = df.groupby(['title','category']).size().reset_index(name='count').sort_values(by='count',ascending=False).head(10)



In [13]:
def find_description(title):
    return df[df['title']==title]['description'].iloc[0]

test = find_description('BRIEF ANSWERS TO THE BIG QUESTIONS')
print(test)

A collection of essays from the late scientist’s personal archive that address 10 imponderables.


In [14]:
best_books['description'] = best_books['title'].apply(find_description)
best_books.head()

Unnamed: 0,title,category,count,description
517,THE BODY KEEPS THE SCORE,medical,45,"How trauma affects the body and mind, and inno..."
526,THE BOYS IN THE BOAT,sports & recreation,31,
169,EDUCATED,biography & autobiography,24,"The daughter of survivalists, who is kept out ..."
85,BEING MORTAL,family & relationships,24,The surgeon and New Yorker writer considers ho...
80,BECOMING,biography & autobiography,21,The former first lady describes her journey fr...


In [15]:
fig = px.bar(best_books, x='title',y='count', title="Weeks on NYT Best Sellers List", hover_data=['category', 'description'], color='category')
fig.show()

fig.write_image("images/top_titles.svg")

In [16]:
best_books

Unnamed: 0,title,category,count,description
517,THE BODY KEEPS THE SCORE,medical,45,"How trauma affects the body and mind, and inno..."
526,THE BOYS IN THE BOAT,sports & recreation,31,
169,EDUCATED,biography & autobiography,24,"The daughter of survivalists, who is kept out ..."
85,BEING MORTAL,family & relationships,24,The surgeon and New Yorker writer considers ho...
80,BECOMING,biography & autobiography,21,The former first lady describes her journey fr...
748,WILD,biography & autobiography,18,"A woman’s account of a life-changing 1,100-mil..."
242,HILLBILLY ELEGY,social science,18,"The vice president, in a memoir written shortl..."
707,UNBROKEN,biography & autobiography,17,An Olympic runner’s story of survival as a pri...
111,BORN A CRIME,biography & autobiography,16,A memoir about growing up biracial in aparthei...
272,I'M GLAD MY MOM DIED,biography & autobiography,15,The actress and filmmaker describes her eating...


In [17]:
best_books.head()

Unnamed: 0,title,category,count,description
517,THE BODY KEEPS THE SCORE,medical,45,"How trauma affects the body and mind, and inno..."
526,THE BOYS IN THE BOAT,sports & recreation,31,
169,EDUCATED,biography & autobiography,24,"The daughter of survivalists, who is kept out ..."
85,BEING MORTAL,family & relationships,24,The surgeon and New Yorker writer considers ho...
80,BECOMING,biography & autobiography,21,The former first lady describes her journey fr...


If biographies are not the best books, why are they on the list so often?

In [18]:
top10_cat = df['category'].value_counts().reset_index().head(10)


In [19]:
fig = px.bar(top10_cat, x = 'category', y = 'count')
fig.show()

fig.write_image("images/top10_count.svg")
fig.write_image("images/top10_count.png", width='1000px')



In [20]:
fig = px.scatter(df[(df['rank']<6)], x = 'published_date', y = 'rank', color='category',opacity=1, hover_data='title',)
fig.update_traces(marker=dict(size=10))
fig.show()

fig.write_image("images/dots.svg")
fig.write_html("images/dots.html")

In [21]:
top_cats = top10_cat['category'].to_list()

In [22]:
top_cats

['biography & autobiography',
 'history',
 'political science',
 'social science',
 'science',
 'true crime',
 'medical',
 'business & economics',
 'family & relationships',
 'sports & recreation']

In [23]:
df.columns

Index(['age_group', 'amazon_product_url', 'article_chapter_link', 'asterisk',
       'author', 'book_image', 'book_image_height', 'book_image_width',
       'book_review_link', 'book_uri', 'contributor', 'contributor_note',
       'created_date', 'dagger', 'description', 'first_chapter_link', 'price',
       'primary_isbn10', 'primary_isbn13', 'publisher', 'rank',
       'rank_last_week', 'sunday_review_link', 'title', 'updated_date',
       'weeks_on_list', 'category', 'published_date', 'Unnamed: 0',
       'date_datetime', 'isbns', 'buy_links'],
      dtype='object')

In [24]:
clean_df = df[['title','published_date','category','rank', 'date_datetime']]
clean_df=clean_df[clean_df['category'].str.contains('fiction')==False]
clean_df.tail()

Unnamed: 0,title,published_date,category,rank,date_datetime
3898,ONE NATION UNDER TAUGHT,2014-12-07,education,15,2014-01-26 17:34:46.978746
3899,THE INNOVATORS,2014-12-07,biography & autobiography,16,2014-01-26 17:34:46.978746
3900,SO ANYWAY . . .,2014-12-07,biography & autobiography,18,2014-01-26 17:34:46.978746
3901,AS YOU WISH,2014-12-07,biography & autobiography,19,2014-01-26 17:34:46.978746
3902,STONEWALLED,2014-12-07,political science,20,2014-01-26 17:34:46.978746


In [25]:
nyt_cats=clean_df['category'].unique()
nyt_cats

array(['business & economics', 'political science',
       'biography & autobiography', 'pets', 'history', 'psychology',
       'medical', 'art', 'religion', 'social science', 'health & fitness',
       'true crime', 'communicable diseases', 'nature', 'performing arts',
       'humor', 'science', 'body, mind & spirit', 'law',
       'sports & recreation', 'family & relationships', 'soldiers',
       'photography', 'music', 'literary collections', 'education',
       'self-help', 'language arts & disciplines', 'philosophy',
       'mathematics', 'cold war', 'equality', 'computers', 'biography',
       'anti-communist movements', 'south african war, 1899-1902',
       'cooking', 'olympic games (11th : 1936 : berlin, germany)',
       'technology & engineering', 'billionaires', 'travel',
       'autobiographies'], dtype=object)

In [26]:
clean_df.to_csv('clean_df.csv',index=False)

Could it be there are just more biographies? This is a 5 year old scrape of 46k books from Goodreads. I can run the ISBNs to get the same category information but this will take a few weeks even if it runs every day due to Google Books API limitation. There is no good way to filter out fiction before running them.

In [27]:
books = pd.read_csv('books.csv')

In [28]:
sample_cached = pd.read_csv('sample_cached.csv')
sample_cached.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,category
0,28195,Inkspell (Inkworld #2),Cornelia Funke/Anthea Bell,3.92,439554004,9780440000000.0,juvenile fiction
1,20001,Interfictions: An Anthology of Interstitial Wr...,Delia Sherman/Theodora Goss/K. Tempest Bradfor...,3.77,1931520240,9781930000000.0,
2,32286,A Mad People's History of Madness,Dale Peterson,4.33,822953315,9780820000000.0,psychology
3,38583,Bite Me!: An Unofficial Guide to the World of ...,Nikki Stafford,4.13,1550225405,9781550000000.0,young adult fiction
4,38175,The South Beach Diet: The Delicious Doctor-De...,Arthur Agatston,3.45,1579546463,9781580000000.0,


In [29]:
# Be sure to test that this works before running the full pull! Google Books limits api at 1000 per day.


def get_category_by_title(book):

# First, check cache
    try:
        category = sample_cached[sample_cached['isbn'] == book['isbn']]['category'].iloc[0]
        print('isbn from cache')
        return category

    # If not in cache, call Google Books API
    except:
        query =(book['title']+" "+book['authors']).replace(" ","_")
        

        response = requests.get(f'https://books.googleapis.com/books/v1/volumes?q={query}&key={google_key}')
        data = response.json()

        if(response.status_code != 200):
            print(response.status_code)
            print(response.headers)

        if(data['totalItems']!=0):

            try:
                category = data['items'][0]['volumeInfo']['categories'][0]
                print('isbn from api')
                return category
            except:
                pass

# uncomment to test   
# test_cat = get_category_by_title(sample_cached.iloc[0])
# print(test_cat)


In [30]:
#sample 100 rows to start with

books_sample = books.sample(900)

books_sample['category'] = books_sample.apply(get_category_by_title, axis=1)
books_sample['category'] = books_sample['category'].str.lower()

isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from api
isbn from api
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from api
isbn from api
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from api
isbn from api
isbn from api
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from cache
isbn from api
isbn from api
isbn from cache
isbn from api
isbn from cache
isbn from api
isbn from api
isbn from api
isbn from api
isbn from cache
isbn from cache
isbn from api
isbn from cache
isbn from cache
isbn from api
isbn from api
isbn from api
isbn from cache
isbn from cache
isbn from api
isbn from 

In [31]:
sample_cached['category'] = sample_cached['category'].str.lower()


In [32]:
# Combine current df with cached
sample_cached = pd.concat([sample_cached,books_sample])

sample_cached.drop_duplicates(subset=['isbn','category'], inplace=True)
sample_cached.to_csv('sample_cached.csv', index=False)


In [33]:
sample_cached['category'].value_counts()

category
fiction                      1181
juvenile fiction              177
biography & autobiography     151
literary criticism             91
history                        90
                             ... 
poets, american                 1
photography of the nude         1
change                          1
ireland                         1
dysfunctional families          1
Name: count, Length: 302, dtype: int64

In [34]:
# nf_sample = sample_cached[sample_cached["category"].isin(drop_cols) == False]
nf_sample = sample_cached[sample_cached['category'].isin(nyt_cats)]

In [35]:
nf_percents = nf_sample['category'].value_counts(normalize=True).reset_index()
nf_percents.rename(columns={'proportion':'percent'},inplace=True)

In [36]:
nf_sample['category'].info

<bound method Series.info of 2                      psychology
7                  social science
11                     philosophy
22                        history
27      biography & autobiography
                  ...            
1469                   philosophy
7761          body, mind & spirit
3014             health & fitness
8302          body, mind & spirit
4688               social science
Name: category, Length: 811, dtype: object>

In [37]:
nf_percents

Unnamed: 0,category,percent
0,biography & autobiography,0.18619
1,history,0.110974
2,philosophy,0.094945
3,religion,0.071517
4,business & economics,0.045623
5,social science,0.039457
6,language arts & disciplines,0.035758
7,self-help,0.030826
8,political science,0.030826
9,science,0.030826


In [38]:
nyt_percents = clean_df['category'].value_counts(normalize=True).reset_index()
nyt_percents.rename(columns={'proportion':'percent'},inplace=True)

In [39]:
nyt_percents

Unnamed: 0,category,percent
0,biography & autobiography,0.464907
1,history,0.105579
2,political science,0.092981
3,social science,0.043191
4,science,0.028794
5,medical,0.026995
6,true crime,0.026995
7,business & economics,0.025795
8,family & relationships,0.023995
9,sports & recreation,0.019196


In [40]:
nf_percents['list']='nf'
nyt_percents['list']='nyt'

combined_percents = pd.concat([nf_percents,nyt_percents])

In [41]:
combined_percents.head()

Unnamed: 0,category,percent,list
0,biography & autobiography,0.18619,nf
1,history,0.110974,nf
2,philosophy,0.094945,nf
3,religion,0.071517,nf
4,business & economics,0.045623,nf


In [42]:
nf_sample.shape

(811, 7)

In [43]:
fig = px.bar(combined_percents[combined_percents['category'].isin(top_cats)], x='category', y='percent', color='list', barmode='group', template='plotly_white')

fig.show()

fig.write_image("images/percents.svg")
fig.write_image("images/percents.png")

In [44]:
df.columns

Index(['age_group', 'amazon_product_url', 'article_chapter_link', 'asterisk',
       'author', 'book_image', 'book_image_height', 'book_image_width',
       'book_review_link', 'book_uri', 'contributor', 'contributor_note',
       'created_date', 'dagger', 'description', 'first_chapter_link', 'price',
       'primary_isbn10', 'primary_isbn13', 'publisher', 'rank',
       'rank_last_week', 'sunday_review_link', 'title', 'updated_date',
       'weeks_on_list', 'category', 'published_date', 'Unnamed: 0',
       'date_datetime', 'isbns', 'buy_links'],
      dtype='object')

In [45]:
sample=df.sample(5)[['author','title','category','rank','published_date']].to_csv('sample.csv')

In [46]:
clean_df

Unnamed: 0,title,published_date,category,rank,date_datetime
3040,HOW COUNTRIES GO BROKE,2025-06-22,business & economics,1,2025-06-22 17:34:46.978746
3041,ORIGINAL SIN,2025-06-22,political science,2,2025-06-22 17:34:46.978746
3042,A DIFFERENT KIND OF POWER,2025-06-22,biography & autobiography,3,2025-06-22 17:34:46.978746
3043,THIS DOG WILL CHANGE YOUR LIFE,2025-06-22,pets,4,2025-06-22 17:34:46.978746
3044,TRUMP'S TRIUMPH,2025-06-22,history,5,2025-06-22 17:34:46.978746
...,...,...,...,...,...
3898,ONE NATION UNDER TAUGHT,2014-12-07,education,15,2014-01-26 17:34:46.978746
3899,THE INNOVATORS,2014-12-07,biography & autobiography,16,2014-01-26 17:34:46.978746
3900,SO ANYWAY . . .,2014-12-07,biography & autobiography,18,2014-01-26 17:34:46.978746
3901,AS YOU WISH,2014-12-07,biography & autobiography,19,2014-01-26 17:34:46.978746


In [47]:
fig = px.bar(clean_df.iloc[:150],x='published_date',color='category')

fig.show()

In [48]:
cat_counts = clean_df['category'].value_counts()

In [49]:
# useful libraries, including pyWaffle
import matplotlib.pyplot as plt
from pywaffle import Waffle


In [50]:
clean_df.groupby(['date_datetime','published_date','category']).size().reset_index(name='count')

Unnamed: 0,date_datetime,published_date,category,count
0,2014-01-26 17:34:46.978746,2014-12-07,biography & autobiography,110
1,2014-01-26 17:34:46.978746,2014-12-07,education,10
2,2014-01-26 17:34:46.978746,2014-12-07,family & relationships,10
3,2014-01-26 17:34:46.978746,2014-12-07,history,10
4,2014-01-26 17:34:46.978746,2014-12-07,literary collections,10
...,...,...,...,...
681,2025-06-22 17:34:46.978746,2025-06-22,history,2
682,2025-06-22 17:34:46.978746,2025-06-22,medical,1
683,2025-06-22 17:34:46.978746,2025-06-22,pets,1
684,2025-06-22 17:34:46.978746,2025-06-22,political science,1


In [51]:
past_year = datetime.now() - timedelta(days=365)
past_year

datetime.datetime(2024, 6, 26, 17, 42, 46, 59796)

In [52]:
clean_df[clean_df['date_datetime']>past_year]

Unnamed: 0,title,published_date,category,rank,date_datetime
3040,HOW COUNTRIES GO BROKE,2025-06-22,business & economics,1,2025-06-22 17:34:46.978746
3041,ORIGINAL SIN,2025-06-22,political science,2,2025-06-22 17:34:46.978746
3042,A DIFFERENT KIND OF POWER,2025-06-22,biography & autobiography,3,2025-06-22 17:34:46.978746
3043,THIS DOG WILL CHANGE YOUR LIFE,2025-06-22,pets,4,2025-06-22 17:34:46.978746
3044,TRUMP'S TRIUMPH,2025-06-22,history,5,2025-06-22 17:34:46.978746
...,...,...,...,...,...
3169,A WALK IN THE PARK,2024-07-07,biography & autobiography,11,2024-07-07 17:34:46.978746
3170,THE SITUATION ROOM,2024-07-07,history,12,2024-07-07 17:34:46.978746
3171,WHEN THE CLOCK BROKE,2024-07-07,history,13,2024-07-07 17:34:46.978746
3172,OUTLIVE,2024-07-07,health & fitness,14,2024-07-07 17:34:46.978746


In [53]:
past_year_counts = clean_df[clean_df['date_datetime']>past_year].groupby(['date_datetime','published_date','category']).size().reset_index(name='count')

In [54]:
past_year_counts

Unnamed: 0,date_datetime,published_date,category,count
0,2024-07-07 17:34:46.978746,2024-07-07,biography & autobiography,7
1,2024-07-07 17:34:46.978746,2024-07-07,health & fitness,1
2,2024-07-07 17:34:46.978746,2024-07-07,history,3
3,2024-07-07 17:34:46.978746,2024-07-07,humor,1
4,2024-07-07 17:34:46.978746,2024-07-07,medical,1
...,...,...,...,...
85,2025-06-22 17:34:46.978746,2025-06-22,history,2
86,2025-06-22 17:34:46.978746,2025-06-22,medical,1
87,2025-06-22 17:34:46.978746,2025-06-22,pets,1
88,2025-06-22 17:34:46.978746,2025-06-22,political science,1


In [55]:
past_year_counts.to_csv('past_year_counts.csv', index=False)

In [56]:
type(past_year_counts)

pandas.core.frame.DataFrame

In [57]:
past_year_counts.to_json('past_year_counts.json')

In [58]:
year_counts_list = []
for week in past_year_counts['published_date'].unique():
    print(f"week:{week}")
    week_object = {}
    week_object["week"] = week
    week_object["data"] = []


    for category in past_year_counts[past_year_counts['published_date']== week]['category'].unique():
        row = {}
        row["category"] = category
        print(row["category"])
        row['count'] = past_year_counts[(past_year_counts['published_date']== week) & (past_year_counts["category"]== category)]['count'].iloc[0]
        row["count"] = int(row['count'])
        print(row['count'])
        week_object['data'].append(row)


    
    year_counts_list.append(week_object)


week:2024-07-07
biography & autobiography
7
health & fitness
1
history
3
humor
1
medical
1
psychology
1
true crime
1
week:2024-08-11
biography & autobiography
6
health & fitness
1
history
3
medical
1
psychology
1
social science
1
true crime
1
week:2024-09-15
biography & autobiography
5
business & economics
1
health & fitness
1
history
2
medical
1
nature
1
performing arts
1
psychology
1
social science
1
true crime
1
week:2024-10-20
biography & autobiography
4
business & economics
1
history
4
medical
1
political science
1
psychology
1
social science
3
week:2024-11-24
biography & autobiography
8
business & economics
1
medical
1
political science
1
psychology
1
social science
2
true crime
1
week:2024-12-29
biography & autobiography
5
business & economics
1
history
2
medical
1
nature
2
psychology
1
social science
1
true crime
2
week:2025-02-02
biography & autobiography
7
health & fitness
1
medical
1
nature
1
political science
2
psychology
1
religion
1
true crime
1
week:2025-03-09
biography 

In [59]:
year_counts_list

[{'week': '2024-07-07',
  'data': [{'category': 'biography & autobiography', 'count': 7},
   {'category': 'health & fitness', 'count': 1},
   {'category': 'history', 'count': 3},
   {'category': 'humor', 'count': 1},
   {'category': 'medical', 'count': 1},
   {'category': 'psychology', 'count': 1},
   {'category': 'true crime', 'count': 1}]},
 {'week': '2024-08-11',
  'data': [{'category': 'biography & autobiography', 'count': 6},
   {'category': 'health & fitness', 'count': 1},
   {'category': 'history', 'count': 3},
   {'category': 'medical', 'count': 1},
   {'category': 'psychology', 'count': 1},
   {'category': 'social science', 'count': 1},
   {'category': 'true crime', 'count': 1}]},
 {'week': '2024-09-15',
  'data': [{'category': 'biography & autobiography', 'count': 5},
   {'category': 'business & economics', 'count': 1},
   {'category': 'health & fitness', 'count': 1},
   {'category': 'history', 'count': 2},
   {'category': 'medical', 'count': 1},
   {'category': 'nature', 'co

In [60]:
import json
with open('year_counts_list.json', 'w') as f:
    json.dump(year_counts_list, f)