# OUTLINE

1. Find the list of items (the checklist) we want scraped in the same way, including URLs and IDs (an ID can simply be the URL). Building this checklist is outside the scope of this file/scraper.
2. For each item X we want to scrape:
    * Run scraping function on X (and store scrape of X in folder)
        * ```scrape_and_save()```
    * Update checklist to reflect that X has been scraped (and save updated version of checklist)
        * ```update_checklist_table()```



# CONFIG

In [1]:
# Location of the checklist CSV, relative to the working directory.
CHECKLIST_REL_PATH = 'Test_checklist/Checklist.csv'
# Folder (relative, with trailing slash) that receives one CSV per scraped item.
SCRAPED_DATA_FOLDER_REL_PATH = 'scraped_data/'

# Logical field name -> actual column header used in the checklist CSV.
CHECKLIST_COL_NAMES = {
    'ID': 'ID',
    'URL': 'URL',
    'SCRAPED_BOOL': 'Scraped?',
}

# IMPORT PACKAGES

In [2]:
from IPython.display import display

import pandas as pd
import numpy as np

# SCRAPING UNIQUE TO THIS PROJECT

In [36]:
# import requests
# from bs4 import BeautifulSoup

# import time


def scrape_and_save(URL_to_scrape:str,
                    scraped_data_save_path:str,
                    scraped_filename:str) -> float:
    """
    Scrape the first HTML table found at a URL and save it as a CSV file.

    Args:
        URL_to_scrape: Page handed to ``pd.read_html``.
        scraped_data_save_path: Folder the CSV is written into.
        scraped_filename: Name of the CSV file, without the ``.csv`` extension.

    Returns:
        ``1`` if a non-empty table was scraped, ``np.nan`` if the scrape
        didn't work (no table on the page, or the first table is empty).
        This is the value to log under the "scraped_bool" field in 'checklist'.
    """
    import os

    try:
        df = pd.read_html(URL_to_scrape)[0]
    except ValueError:
        # pd.read_html raises ValueError when the page contains no tables;
        # treat that as a failed scrape instead of aborting the whole run.
        print(False)
        return np.nan

    print('scraped df =')
    display(df)

    # os.path.join gives the same path as plain concatenation when the folder
    # already ends with a separator, and a correct one when it does not.
    csv_path = os.path.join(scraped_data_save_path, f'{scraped_filename}.csv')
    df.to_csv(csv_path)

    conditions = len(df) != 0
    print(conditions)

    return 1 if conditions else np.nan

# UPDATE CHECKLIST TABLE

In [44]:
def update_checklist_table(checklist_path:str,
                           scraped_bool_col_:str,
                           URL_col:str,
                           URL_of_newly_scraped:str,
                           bool_to_log_=False
                           ):
    """
    Record the scrape result for one URL in the checklist CSV on disk.

    The checklist is re-read and re-saved on every call. This slows the code
    down a bit, but unless we're scraping 000s of pages it shouldn't be the
    biggest bottleneck, and it is more robust to errors mid-run.

    Args:
        checklist_path: Path of the checklist CSV (read, then overwritten).
        scraped_bool_col_: Name of the column holding the scraped flag.
        URL_col: Name of the column holding the URLs.
        URL_of_newly_scraped: URL whose flag is being updated.
        bool_to_log_: Value to store in the flag column for that URL.

    Raises:
        ValueError: If the flag column holds anything other than 0, 1 or NaN,
            or if the URL column contains duplicates.
    """
    checklist = pd.read_csv(checklist_path)

    # Confirm the flag column only holds values equal to 0 or 1, or NaN.
    # NaN is dropped first because NaN != NaN, so it can't be matched by a
    # set/tuple membership test.
    observed_flags = checklist[scraped_bool_col_].dropna().unique().tolist()
    unexpected_values = [flag for flag in observed_flags if flag not in (0, 1)]
    if unexpected_values:
        unexpected_chars = ', '.join(map(repr, unexpected_values))
        raise ValueError(f'Unexpected Characters in {scraped_bool_col_}: {unexpected_chars}')

    # confirm that there's only one entry corresponding to each URL
    if checklist[URL_col].nunique() != len(checklist):
        raise ValueError(f'Duplicate URLs in {URL_col}')

    # update value (URL becomes the index so the row can be addressed by URL)
    checklist.set_index(URL_col,
                        inplace=True)
    checklist.loc[URL_of_newly_scraped, scraped_bool_col_] = bool_to_log_

    # save/export; index=True writes the URL index back out as a column
    checklist.to_csv(checklist_path,
                     index=True)

# MAIN FNC

In [70]:
def main():
    """
    Scrape every checklist entry whose scraped flag is 0 and log each result.

    Reads the checklist at ``CHECKLIST_REL_PATH``, then for each not-yet-scraped
    row calls ``scrape_and_save`` followed by ``update_checklist_table`` so the
    checklist on disk is updated after every single scrape.
    """
    # config for readability
    scraped_bool_col = CHECKLIST_COL_NAMES['SCRAPED_BOOL']
    url_col = CHECKLIST_COL_NAMES['URL']
    id_col = CHECKLIST_COL_NAMES['ID']

    # read & clean: turn True/False into 1/0; values without a mapping
    # (e.g. NaN) fall back to their original value via fillna
    checklist = pd.read_csv(CHECKLIST_REL_PATH)
    checklist[scraped_bool_col] = checklist[scraped_bool_col].map({False:0, True:1}).fillna(checklist[scraped_bool_col])

    display(checklist)

    # filter: only rows explicitly flagged 0 are scraped (NaN rows are skipped)
    pending = checklist[checklist[scraped_bool_col] == 0]

    display(pending)

    for url_to_scrape, item_id in zip(pending[url_col], pending[id_col]):
        bool_to_log = scrape_and_save(URL_to_scrape=url_to_scrape,
                                      scraped_data_save_path=SCRAPED_DATA_FOLDER_REL_PATH,
                                      scraped_filename=str(item_id))

        update_checklist_table(checklist_path=CHECKLIST_REL_PATH,
                               scraped_bool_col_=scraped_bool_col,
                               URL_col=url_col,
                               URL_of_newly_scraped=url_to_scrape,
                               bool_to_log_=bool_to_log)


In [71]:
main()

Unnamed: 0,URL,Scraped?,ID
0,https://en.wikipedia.org/wiki/100_metres,1.0,100
1,https://en.wikipedia.org/wiki/200_metres,0.0,200
2,https://en.wikipedia.org/wiki/400_metres,0.0,400


Unnamed: 0,URL,Scraped?,ID
1,https://en.wikipedia.org/wiki/200_metres,0.0,200
2,https://en.wikipedia.org/wiki/400_metres,0.0,400


https://en.wikipedia.org/wiki/200_metres
scraped df =


Unnamed: 0,Athletics200 metres,Athletics200 metres.1
0,Athletes leaving starting blocks for a 200 met...,Athletes leaving starting blocks for a 200 met...
1,World records,World records
2,Men,Usain Bolt 19.19 (2009)
3,Women,Florence Griffith-Joyner 21.34 (1988)
4,Olympic records,Olympic records
5,Men,Usain Bolt 19.30 (2008)
6,Women,Florence Griffith-Joyner 21.34 (1988)
7,World Championship records,World Championship records
8,Men,Usain Bolt 19.19 (2009)
9,Women,Dafne Schippers 21.63 (2015)


True
bool_to_log=1
https://en.wikipedia.org/wiki/400_metres
scraped df =


Unnamed: 0,Athletics400 metres,Athletics400 metres.1
0,The closing stages of a men's 400 m race.,The closing stages of a men's 400 m race.
1,World records,World records
2,Men,Wayde van Niekerk 43.03 (2016)
3,Women,Marita Koch 47.60 (1985)
4,Olympic records,Olympic records
5,Men,Wayde van Niekerk 43.03 (2016)
6,Women,Marie-José Pérec 48.25 (1996)
7,World Championship records,World Championship records
8,Men,Michael Johnson 43.18 (1999)
9,Women,Jarmila Kratochvílová 47.99 (1983)


True
bool_to_log=1
