In [1]:
# Selenium imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
import time
# Local webdriver functions
from src.data.setup_webdriver import initialize_webdriver
from src.data.goodreads_login import login
# Goodreads User ID scraping functions
from src.data.scrape_users import gather_user_ids
from src.data.save_user_ids import save_raw_ids
from src.data.adjust_shelf import adjust_shelf_settings
from src.data.scroll_down import scroll_to_bottom
from src.data.read_review_elements import get_review
from src.data.scrape_shelf import scrape_user_read_shelf
# Other imports
from configparser import ConfigParser
import polars as pl
from pathlib import Path
import numpy as np
import pickle
from src import log
import time
import os
import re

In [2]:
# Pickle file holding the previously gathered user IDs; the path is relative,
# so it resolves against the notebook's current working directory.
user_ids_path = '../data/raw/user_ids.pkl'

In [3]:
def load_user_ids(filepath):
    """Return the object stored in the pickle file at ``filepath``.

    Args:
        filepath: Path to a pickle file; it is opened in binary read mode.

    Returns:
        Whatever object was pickled into the file (the notebook expects a
        sequence of user-ID strings).

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        produced by this project.
    """
    with open(filepath, 'rb') as pickle_handle:
        return pickle.load(pickle_handle)

In [4]:
# Load the saved IDs and print a quick sanity check: the total count and the
# first five entries.
user_ids = load_user_ids(user_ids_path)
print(f'There are {len(user_ids)} IDs, the first 5 are {user_ids[0:5]}.')

There are 316 IDs, the first 5 are ('33674708', '3990914', '51224285', '25683251', '4717304').


In [5]:
# Directory handed to initialize_webdriver below (relative to the working
# directory). Presumably holds the Chrome binary/driver -- TODO confirm.
chrome_tools_path = '../chrome_tools'

In [6]:
# Configure driver: start the webdriver through the project helper, pointing
# it at the chrome_tools directory.
driver = initialize_webdriver(chrome_tools_path)

Successfully launched webdriver


In [7]:
# Find credentials
# The login details live in a local INI file (section "credentials") so they
# are never written into the notebook itself.
config = ConfigParser()
config_path = '../src/data/config.ini'
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed, so fail here with the offending path instead of
# with a confusing NoSectionError on the lookups below.
if not config.read(config_path):
    raise FileNotFoundError(f'Could not read config file: {config_path}')
username = config.get('credentials', 'username')
password = config.get('credentials', 'password')

In [8]:
# Sign in with the credentials read above; the value returned by login() is
# the driver handle used for the scraping calls that follow.
authenticated_driver = login(driver, username, password)

Opened Goodreads successfully.
Reached sign-in options page.
Reached sign-in with email page.
Login attempted.
Login successful.


In [9]:
# Trial run of the shelf scraper on a single hard-coded user ID before
# scraping the full ID list.
test = scrape_user_read_shelf(authenticated_driver, '161493250')

Opened shelf settings.
Settings adjusted. Closed settings panel.
100%|██████████| 5/5 [00:00<00:00, 11.00it/s]
Compiled reviews. 00:00


In [10]:
# Preview the first rows of the trial scrape to check the columns look right.
test.head()

title,author,avg_rating,user_rating,cover_url,user_id
str,str,f64,i64,str,str
"""Verity""","""""",4.33,1,"""https://i.gr-a…","""161493250"""
"""Daisy Jones & …","""""",4.22,4,"""https://i.gr-a…","""161493250"""
"""You Are a Bada…","""""",3.95,4,"""https://i.gr-a…","""161493250"""
"""One More Thing…","""""",3.67,2,"""https://i.gr-a…","""161493250"""
"""Bull Mountain …","""""",3.98,0,"""https://i.gr-a…","""161493250"""


In [11]:
# Track which users have been scraped
# The tracking file is read one entry per line into a set; when it does not
# exist yet we start from an empty set. The path is relative to the current
# working directory.
scraped_users_file = Path('scraped_users.txt')
scraped_users = (
    set(scraped_users_file.read_text().splitlines())
    if scraped_users_file.exists()
    else set()
)

In [12]:
def save_dataframe(df, filepath):
    """Write ``df`` to ``filepath`` as CSV, logging any failure before re-raising.

    Args:
        df: Object exposing ``write_csv(filepath)`` (the notebook passes the
            frame returned by ``scrape_user_read_shelf``).
        filepath: Destination handed unchanged to ``df.write_csv``.

    Returns:
        None.

    Raises:
        Exception: Whatever ``df.write_csv`` raises is re-raised unchanged
            after being logged.
    """
    try:
        df.write_csv(filepath)
    except Exception as e:
        # Include the underlying error so the log records why the write
        # failed, not just where.
        log.debug(f'Could not save to {filepath}: {e!r}')
        raise

In [13]:
# Example (left disabled): write the trial scrape to a CSV named after the user ID.
# save_dataframe(test, '161493250.csv')