# OUTLINE

1. Find the list of items (the checklist) that we want scraped in the same way, including a URL and an ID for each item (the ID can simply be the URL). Building this checklist is outside the scope of this file/scraper.
2. For each item X we want to scrape:
    * Run scraping function on X (and store scrape of X in folder)
        * ```scrape_and_save()```
    * Update checklist to reflect that X has been scraped (and save updated version of checklist)
        * ```update_checklist_table()```



# CONFIG

In [72]:
# Relative path to the CSV checklist of URLs to scrape.
CHECKLIST_REL_PATH = '../checklist/checklist.csv'

# Relative path to the folder that receives the scraped data.
SCRAPED_DATA_FOLDER_REL_PATH = '../scraped_data/'

# Names of the relevant columns in the checklist dataframe.
# Each `...` is a placeholder: replace it with the real column name.
CHECKLIST_COL_NAMES = {
    'ID': ...,
    'URL': ...,
    'SCRAPED_BOOL': ...,
}

# IMPORT PACKAGES

In [73]:
import pandas as pd
import numpy as np

# SCRAPING UNIQUE TO THIS PROJECT

In [36]:
# import requests
# from bs4 import BeautifulSoup

# import time

def scrape_and_save(URL_to_scrape:str,
                    scraped_data_save_path:str,
                    scraped_filename:str) -> float: 
    """
    Scrape one URL and report whether the scrape succeeded.

    This is a template: the project-specific scraping code goes where marked
    below. Per the file outline, the scrape is meant to be stored under
    `scraped_data_save_path` using `scraped_filename`; the template itself
    does not write anything.

    Args:
        URL_to_scrape: URL of the item to scrape.
        scraped_data_save_path: folder intended to receive the scraped data.
        scraped_filename: file name intended for the saved scrape.

    Returns:
        1 if the scrape succeeded, np.nan if it did not. The value is what
        gets logged in the checklist's "scraped_bool" column, so a failed
        scrape shows up there as NaN rather than 0.
    """

    # scrape here. leave the rest of this block's code untouched

    # figure out what value to log in the checklist (i.e., was this successfully scraped?)
    # NOTE: `...` is a placeholder. Ellipsis is truthy, so until it is replaced
    # with a real success check this function always reports success (1).
    conditions = ...
    if conditions:
        bool_to_log = 1
    else:
        bool_to_log = np.nan
    return bool_to_log

# UPDATE CHECKLIST TABLE

In [44]:
def update_checklist_table(checklist_path:str, 
                        #    checklist:pd.DataFrame,
                           scraped_bool_col_:str,
                           URL_col:str,
                           URL_of_newly_scraped:str,
                           bool_to_log_=False
                           ):
    """
    Record the outcome of one scrape in the checklist CSV on disk.

    The checklist is re-read from `checklist_path`, validated, updated for the
    row whose `URL_col` equals `URL_of_newly_scraped`, and written back to the
    same path with `URL_col` as the first (index) column.

    Args:
        checklist_path: path of the checklist CSV (read, then overwritten).
        scraped_bool_col_: name of the column holding the scraped flag.
        URL_col: name of the column holding the URLs.
        URL_of_newly_scraped: URL of the row to update.
        bool_to_log_: value to store (1, 0 or np.nan). A Python bool is
            stored as its integer equivalent.

    Raises:
        Exception: if the flag column holds anything other than 0, 1 or NaN
            (True/False compare equal to 1/0 and are accepted), or if
            `URL_col` contains duplicate URLs.
        KeyError: if `URL_of_newly_scraped` is not present in `URL_col`.
    """

    # This slows the code down a bit but unless we're scraping thousands of pages this shouldn't be the biggest bottleneck.
    # Reading and saving it every time is also more robust to error.
    checklist = pd.read_csv(checklist_path) 

    # confirm that the flag column only contains 0, 1, or NaN.
    # NaN is dropped before the comparison: NaN never compares equal to
    # itself, so it cannot be filtered out through set/`in` membership.
    flags = checklist[scraped_bool_col_]
    unexpected = [value for value in flags.dropna().unique().tolist()
                  if value not in (0, 1)]
    if unexpected:
        # str() each value: join() only accepts strings
        unexpected_chars = ', '.join(str(value) for value in unexpected)
        raise Exception(f'Unexpected Characters in {scraped_bool_col_}: {unexpected_chars}')

    # confirm that there's only one entry corresponding to this URL
    if checklist[URL_col].nunique() != len(checklist):
        raise Exception(f'Duplicate URLs in {URL_col}')

    # confirm the URL exists; assigning through .loc with an unknown label
    # would otherwise silently append a brand-new row to the checklist
    if not (checklist[URL_col] == URL_of_newly_scraped).any():
        raise KeyError(f'{URL_of_newly_scraped} not found in {URL_col}')

    # store the flags as floats so that 0, 1 and NaN all fit the column dtype,
    # whatever dtype read_csv inferred (int, bool, object, float)
    checklist[scraped_bool_col_] = flags.astype(float)

    # log True/False as 1/0 so the column stays numerical
    if isinstance(bool_to_log_, bool):
        bool_to_log_ = int(bool_to_log_)

    # update value
    checklist = checklist.set_index(URL_col)
    checklist.loc[URL_of_newly_scraped, scraped_bool_col_] = bool_to_log_

    # save/export
    checklist.to_csv(checklist_path,
                     index=True)

# MAIN FNC

In [70]:
def main():
    """
    Scrape every checklist entry that has not been scraped yet.

    Reads the checklist CSV, normalises True/False flags to 1/0, and then, for
    each row whose scraped flag equals 0, runs `scrape_and_save` and logs the
    returned value through `update_checklist_table`. Rows whose flag is 1 or
    NaN are skipped.
    """
    # column names, pulled out once for readability
    id_col = CHECKLIST_COL_NAMES['ID']
    url_col = CHECKLIST_COL_NAMES['URL']
    scraped_bool_col = CHECKLIST_COL_NAMES['SCRAPED_BOOL']

    # read the checklist and turn boolean flags into 0/1; values the mapping
    # does not cover fall back to what was originally in the column
    checklist = pd.read_csv(CHECKLIST_REL_PATH)
    raw_flags = checklist[scraped_bool_col]
    checklist[scraped_bool_col] = raw_flags.map({False:0, True:1}).fillna(raw_flags)

    # keep only the entries still flagged as not scraped
    pending = checklist[checklist[scraped_bool_col] == 0]

    for _, entry in pending.iterrows():
        target_url = entry[url_col]
        item_id = entry[id_col]

        outcome = scrape_and_save(URL_to_scrape=target_url,
                                  scraped_data_save_path=SCRAPED_DATA_FOLDER_REL_PATH,
                                  scraped_filename=item_id)

        # persist the result straight away so progress survives an interruption
        update_checklist_table(checklist_path=CHECKLIST_REL_PATH,
                               scraped_bool_col_=scraped_bool_col,
                               URL_col=url_col,
                               URL_of_newly_scraped=target_url,
                               bool_to_log_=outcome)


# RUN

In [None]:
# Run the scraper only when this file is executed directly (a notebook cell or
# a script), not when it is imported for its functions.
if __name__ == "__main__":
    main()