# OUTLINE

1. Find the list of items (the "checklist") that we want scraped in the same way. Each item needs a URL and an ID (the ID can simply be the URL). Building this checklist is outside the scope of this file/scraper.
2. For each item X we want to scrape:
    * Run scraping function on X (and store scrape of X in folder)
        * ```scrape_and_save()```
    * Update checklist to reflect that X has been scraped (and save updated version of checklist)
        * ```update_checklist_table()```



# CONFIG

In [None]:
# Relative path of the checklist CSV that lists the URLs to scrape.
CHECKLIST_REL_PATH = '../checklist/checklist.csv'

# Relative path of the folder that receives the scraped data.
SCRAPED_DATA_FOLDER_REL_PATH = '../scraped_data/'

# Logical field name -> actual column header used in the checklist dataframe.
CHECKLIST_COL_NAMES = {
    'ID': 'ID',
    'URL': 'Link',
    'SCRAPED_BOOL': 'has_been_scraped',
}

# IMPORT PACKAGES

In [None]:
import pandas as pd
import numpy as np

# SCRAPING UNIQUE TO THIS PROJECT

In [None]:
# import requests
# from bs4 import BeautifulSoup

import time

def get_a_markets_transactions(market_ID:str, 
                               delay_mean=.5,
                               delay_std=.2) -> pd.DataFrame:
    """
    Download every page of the transactions table for one market.

    Pages are requested one after another (`&page=1`, `&page=2`, ...) until a
    page comes back with an empty table.

    Parameters
    ----------
    market_ID : value appended to the `market=` query parameter of the URL.
    delay_mean, delay_std : mean / standard deviation (in seconds) of the
        normally-distributed pause taken before each request.

    Returns
    -------
    All pages concatenated in download order (row labels restart on every
    page), or an empty DataFrame when the very first page has no rows.

    Raises
    ------
    Whatever `pd.read_html` raises (e.g. network errors, or ValueError when
    the page contains no table at all).
    """

    market_url = 'https://polymarketwhales.info/transactions?orderBy=timestamp&market=' + market_ID

    pages = []
    page_count = 1

    while True:
        # A normal draw can be negative, and time.sleep() raises ValueError
        # for negative values, so clamp the pause at zero.
        delay = max(np.random.normal(loc=delay_mean, scale=delay_std, size=1)[0], 0)
        time.sleep(delay)

        page = pd.read_html(f'{market_url}&page={page_count}')[0]

        # an empty table marks the end of the pagination
        if len(page) == 0:
            break

        pages.append(page)
        page_count += 1

    # Nothing was downloaded: hand back an empty frame instead of indexing
    # into an empty list (which raised IndexError).
    if not pages:
        return pd.DataFrame()

    return pd.concat(pages)

def clean_a_markets_transactions(table:pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of a raw transactions table.

    The input frame is left untouched. Expected input columns: 'Type',
    'Outcome', 'Direction', 'Amount' (strings such as '$12.50'), 'Price'
    (numeric) and 'Timestamp'.

    Steps:
      * 'Type' ('Buy'/'Sell'), 'Outcome' ('Yes'/'No') and 'Direction'
        ('✅'/'❌') are mapped to 1/0; any other value becomes NaN.
      * 'Amount' is converted from a '$'-prefixed string to float.
      * 'Yes Price' is added: 'Price' when Outcome is Yes, else 1 - 'Price'.
      * Row order is reversed and the rows are renumbered from 0.
      * 'Timestamp' and the header-less 'Unnamed: 8' / 'Unnamed: 9' columns
        are dropped when present.

    Raises KeyError if one of the columns that is read is missing.
    """

    # work on a copy so the caller's dataframe is not modified in place
    cleaned = table.copy()

    # map inefficient datatypes to ints
    cleaned['Type'] = cleaned['Type'].map({'Buy':1, 'Sell':0})
    cleaned['Outcome'] = cleaned['Outcome'].map({'Yes':1, 'No':0})
    cleaned['Direction'] = cleaned['Direction'].map({'✅':1, '❌':0})

    # clean 'Amount' (str-->float); thousands separators ("$1,234.50") are
    # stripped first because float() cannot parse them
    cleaned['Amount'] = (cleaned['Amount']
                         .str.replace(',', '', regex=False)
                         .str.split('$').str[1]
                         .astype(float))

    # convert to Yes Price
    cleaned['Yes Price'] = np.where(cleaned['Outcome']==1,
                                    cleaned['Price'],
                                    1-cleaned['Price'])

    # reverse order of table to go from first trades to last; drop=True
    # discards the old row labels instead of materialising an 'index' column
    cleaned = cleaned[::-1].reset_index(drop=True)

    # errors='ignore': the 'Unnamed: N' columns are parsing artifacts and may
    # not be present in every table
    return cleaned.drop(columns=['Timestamp', 'Unnamed: 8', 'Unnamed: 9'],
                        errors='ignore')

def scrape_and_save(URL_to_scrape:str,
                    scraped_data_save_path:str,
                    scraped_filename:str,
                    ID:str) -> float: 
    """
    Scrape one market, clean the result and save it as a CSV file.

    Parameters
    ----------
    URL_to_scrape : kept for interface compatibility; it is not used here
        because the page address is built from `ID`.
    scraped_data_save_path : folder prefix of the output file (it is
        concatenated as-is, so it must end with a path separator).
    scraped_filename : output file name without the '.csv' extension.
    ID : market identifier passed to `get_a_markets_transactions`.

    Returns
    -------
    The value to log in the checklist's "scraped" column:
    `1.0` when a non-empty table was scraped and saved, `np.nan` otherwise
    (an error occurred or no rows were found).
    """

    # scrape
    try:
        df = get_a_markets_transactions(market_ID=ID)
        df = clean_a_markets_transactions(df)

        # only write a file when there is actually something to store
        if len(df) > 0:
            df.to_csv(f'{scraped_data_save_path}{scraped_filename}.csv',
                      index=False)
    except Exception as err:
        # `Exception` (rather than a bare except) lets Ctrl-C / SystemExit
        # still stop a long scraping run; the error is reported, not hidden.
        print(f'something went wrong with scraping: {err!r}')
        return np.nan

    # figure out what value to log in the checklist (i.e., was this successfully scraped?)
    return 1.0 if len(df) > 0 else np.nan

# UPDATE CHECKLIST TABLE

In [None]:
def update_checklist_table(checklist_path:str, 
                           scraped_bool_col_:str,
                           URL_col:str,
                           URL_of_newly_scraped:str,
                           bool_to_log_=False
                           ):
    """
    Record the outcome of one scrape in the checklist CSV and save it.

    The checklist is re-read from disk and written back on every call. This
    slows the code down a bit, but unless we're scraping thousands of pages
    it shouldn't be the biggest bottleneck, and it is more robust to errors.

    Parameters
    ----------
    checklist_path : path of the checklist CSV (read, then overwritten).
    scraped_bool_col_ : name of the column holding the scraped flag.
    URL_col : name of the column holding the URLs. It is used as the index
        when saving, so it ends up as the first column of the file.
    URL_of_newly_scraped : URL of the row to update.
    bool_to_log_ : value to store (0/1/np.nan, or a bool); stored as float.

    Raises
    ------
    Exception : the flag column holds something other than 0, 1, True,
        False or NaN (the offending values are also stored in the global
        `unexpected_chars` for inspection), or the URL column has duplicates.
    KeyError : `URL_of_newly_scraped` is not present in the checklist.
    """

    global unexpected_chars

    checklist = pd.read_csv(checklist_path) 

    # A file containing True/False is parsed as booleans; convert them to 1/0
    # (values that are neither are left as they are) so the column stays
    # numeric once 1.0 / NaN are written into it.
    raw_flags = checklist[scraped_bool_col_]
    checklist[scraped_bool_col_] = raw_flags.map({False:0, True:1}).fillna(raw_flags)

    # confirm that the flags only contain 0, 1, or NaN
    unique_bools = checklist[scraped_bool_col_].dropna().unique().tolist()
    if set(unique_bools) - {0, 1}:
        unexpected_chars = unique_bools
        raise Exception(f'Unexpected Characters in {scraped_bool_col_}: {unique_bools}')

    # float can hold 0, 1 and NaN alike, so later assignments never need a
    # dtype change
    checklist[scraped_bool_col_] = checklist[scraped_bool_col_].astype(float)

    # confirm that there's only one entry corresponding to each URL
    if checklist[URL_col].dropna().duplicated().any():
        raise Exception(f'Duplicate URLs in {URL_col}')

    checklist.set_index(URL_col, 
                        inplace=True)

    # `.loc[label] = value` with an unknown label would silently append a
    # brand-new row, so refuse URLs that are not in the checklist.
    if URL_of_newly_scraped not in checklist.index:
        raise KeyError(f'{URL_of_newly_scraped!r} not found in {URL_col}')

    # update value
    checklist.loc[URL_of_newly_scraped, scraped_bool_col_] = float(bool_to_log_)

    # save/export
    checklist.to_csv(checklist_path,
                     index=True)

# MAIN FNC

In [None]:
def main():
    """
    Scrape every checklist entry whose "scraped" flag is 0.

    The checklist is loaded once to decide which rows are pending. For each
    pending row the market is scraped and saved, and the result is written
    back to the checklist file straight away.
    """
    # column headers, looked up once for readability
    id_col = CHECKLIST_COL_NAMES['ID']
    url_col = CHECKLIST_COL_NAMES['URL']
    scraped_bool_col = CHECKLIST_COL_NAMES['SCRAPED_BOOL']

    # read the checklist and turn True/False flags into 1/0 (other values stay as they are)
    checklist = pd.read_csv(CHECKLIST_REL_PATH)
    raw_flags = checklist[scraped_bool_col]
    checklist[scraped_bool_col] = raw_flags.map({False:0, True:1}).fillna(raw_flags)

    # keep only the rows whose flag is 0
    pending = checklist.loc[checklist[scraped_bool_col] == 0]

    for row_label, entry in pending.iterrows():
        # short random pause (never negative) before handling the next market
        pause = max(np.random.normal(loc=.2, scale=.1, size=1)[0], 0)
        time.sleep(pause)
        print(row_label, entry['Question'])

        target_url = entry[url_col]
        market_id = entry[id_col]

        # the market ID doubles as the output file name
        outcome = scrape_and_save(URL_to_scrape=target_url,
                                  scraped_data_save_path=SCRAPED_DATA_FOLDER_REL_PATH,
                                  scraped_filename=market_id,
                                  ID=market_id)

        # persist the result immediately
        update_checklist_table(checklist_path=CHECKLIST_REL_PATH,
                               scraped_bool_col_=scraped_bool_col,
                               URL_col=url_col,
                               URL_of_newly_scraped=target_url,
                               bool_to_log_=outcome)


# RUN

In [None]:
# Start the scrape only when this code is executed directly (a notebook cell
# or a script run), not when it is imported from another module.
if __name__ == '__main__':
    main()