In [1]:
from gazpacho import get, Soup

In [2]:
# Goodreads review-list page for user id 16626766 (the id is the last path segment).
url = 'https://www.goodreads.com/review/list/16626766'

In [3]:
# Query-string parameters sent along with the review-list request.
params = {
    'shelf': 'read',
    'order': 'd',
    'sort': 'date_read',
    'per_page': 30, # presumably ignored by the site: changing it reportedly has no effect
    'page': 1
}

# Download the raw HTML of the first page of the shelf.
html = get(url, params)

In [4]:
# Wrap the downloaded HTML in a gazpacho Soup so it can be queried with `find`.
soup = Soup(html)

In [5]:
# Table rows whose class matches 'bookalike review' (presumably one row per book).
trs = soup.find('tr', {'class': 'bookalike review'})
# Row count on this page.
len(trs)

30

In [6]:
# Use the first row as a sample while working out the selectors below.
tr = trs[0]

In [7]:
# Several links match '/book/show/'; the second one (index 1) is the one read
# for its `title` attribute, which holds the book title.
tr.find('a', {'href': '/book/show/'})[1].attrs['title']

'Anatomy'

In [8]:
# Text of the link whose href matches '/author/show' -- the author name.
tr.find('a', {'href': '/author/show'}).text

'Vigil, Karina'

In [9]:
# Text of the 'date_started_value' span (the date reading began).
tr.find('span', {'class': 'date_started_value'}).text

'Feb 29, 2020'

In [10]:
# Text of the 'date_read_value' span (the finish date). The value is free text
# and is not always a full date -- it can be month and year only.
tr.find('span', {'class': 'date_read_value'}).text

'Mar 2020'

In [11]:
# Contents of the <nobr> element with the markup stripped -- the page count.
tr.find('nobr').remove_tags()

'70 pp'

In [12]:
def parse_tr(tr):
    """Extract the title, author and finish date from one review-table row.

    Parameters
    ----------
    tr : object with a gazpacho-style ``find(tag, attrs)`` method
        A single ``<tr>`` row of the Goodreads review list.

    Returns
    -------
    dict
        ``{'title': str, 'author': str or None, 'end': str or None}``.
        ``'author'`` and ``'end'`` are ``None`` when the row has no matching
        element (e.g. a book with no finish date set).
    """
    def first_text(tag, attrs):
        # gazpacho's `find` yields nothing, a single node, or a list of nodes
        # depending on how many elements match, so normalise before reading
        # `.text`. With several matches (e.g. more than one read date) the
        # first one in document order is used.
        found = tr.find(tag, attrs)
        if isinstance(found, list):
            found = found[0] if found else None
        return None if found is None else found.text

    return {
        # Index 1: the second '/book/show/' link is the one carrying `title`.
        'title': tr.find('a', {'href': '/book/show/'})[1].attrs['title'],
        'author': first_text('a', {'href': '/author/show'}),
        'end': first_text('span', {'class': 'date_read_value'}),
    }

In [13]:
# Parse every row of the page fetched above into a dict.
books = [parse_tr(tr) for tr in trs]

In [14]:
# Peek at the first three parsed records.
books[:3]

[{'title': 'Anatomy', 'author': 'Vigil, Karina', 'end': 'Mar 2020'},
 {'title': 'Virtue Signaling: Essays on Darwinian Politics & Free Speech',
  'author': 'Miller, Geoffrey',
  'end': 'Feb 29, 2020'},
 {'title': 'The Shoe on the Roof',
  'author': 'Ferguson, Will',
  'end': 'Jan 17, 2020'}]

In [15]:
def scrape_page(user=16626766, page=1):
    """Download one page of a user's 'read' shelf and parse its rows.

    Parameters
    ----------
    user : int
        Goodreads user id, used as the last path segment of the URL.
    page : int
        Page number of the shelf to request.

    Returns
    -------
    list of dict
        One ``parse_tr`` result per review row; an empty list when the page
        contains no rows (e.g. a page number past the end of the shelf).
    """
    url = f'https://www.goodreads.com/review/list/{user}'
    params = {
        'shelf': 'read',
        'order': 'd',
        'sort': 'date_read',
        'page': page
    }
    html = get(url, params)

    # "a honking good idea": keep a local copy of the raw HTML so a page can
    # be re-parsed without another request. Left disabled:
    # with open(f'data/{user}-{page}.html', 'w') as f:
    #     f.write(html)

    soup = Soup(html)
    trs = soup.find('tr', {'class': 'bookalike review'})
    # gazpacho's `find` yields nothing for zero matches and a bare node for a
    # single match; normalise both to a list so the loop below never breaks.
    if trs is None:
        trs = []
    elif not isinstance(trs, list):
        trs = [trs]
    return [parse_tr(tr) for tr in trs]

In [16]:
# Trial run: gather the records from the first two pages only.
books = []
for page in (1, 2):
    books += scrape_page(page=page)

In [17]:
from tqdm import tqdm
import time

In [18]:
# Number of shelf pages to request (evaluates to 11).
pages = 2 * 52 // 10 + 1
print(pages)

# Fetch the pages one at a time, pausing a second between requests.
books = []
for page in tqdm(range(1, pages + 1)):
    books += scrape_page(page=page)
    time.sleep(1)

  0%|          | 0/11 [00:00<?, ?it/s]

11


100%|██████████| 11/11 [01:29<00:00,  8.17s/it]


In [19]:
import pandas as pd

In [21]:
# One row per scraped book; the dict keys become the columns.
df = pd.DataFrame(books)

# Persist the scrape so later cells can start from the CSV instead of the network.
df.to_csv('../data/books.csv', index=False)

In [32]:
# Reload the saved scrape; the 'end' column comes back as plain strings.
df = pd.read_csv('../data/books.csv')

In [33]:
# Build the per-book frame in one chain so no column is ever assigned on a
# filtered slice (which triggers pandas' SettingWithCopyWarning).
# NOTE(review): 'end' holds differently formatted date strings; recent pandas
# releases may need `format='mixed'` in `to_datetime` -- confirm.
reads = (
    df
    .assign(end=lambda d: pd.to_datetime(d['end']))
    .sort_values('end', ascending=True)
    .assign(year=lambda d: d['end'].dt.year)
    .loc[lambda d: d['year'] >= 2018]
    .assign(
        # dayofyear is 1-based: subtract 1 so days 1-7 fall in week 1,
        # days 8-14 in week 2, and so on.
        week=lambda d: (d['end'].dt.dayofyear - 1) // 7 + 1,
        read=1,
    )
)

# Books finished per (year, week), accumulated within each year.
stats = reads.groupby(['year', 'week'])['read'].count().groupby(['year']).cumsum()
df = stats.reset_index()

In [34]:
import altair as alt

In [35]:
# Line chart of the running count of books read, by week, one line per year.
chart = alt.Chart(df).mark_line(interpolate='basis')
chart = chart.encode(x='week', y='read', color='year:O')
chart = chart.properties(title='goodreads Challenge', width=300, height=300)

chart

In [41]:
from utils import interactive_chart

# `interactive_chart` comes from the local `utils` module; it receives the
# aggregated year/week/read frame built above.
interactive_chart(df)

### Exercise 

Use Wikipedia to scrape the ages for: Andrew Yang, Joe Biden, and Donald Trump (or another 2020 Presidential Candidate!)

In [None]:
%load ../solutions/1-code.py