In [1]:
import glob
import os
import warnings
from itertools import chain
from os import path

import pandas as pd

from scrapers.afnor import AfnorScraper
from scrapers.scraper import ScraperPool

# Silence only the pandas warning about mixed-type columns, which read_csv emits
# all the time for these files. Every other warning (deprecations, runtime
# problems in the scrapers, ...) stays visible instead of being swallowed.
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

In [2]:
# Directory that holds the input files and the scraper output directory.
# Built with path.join so the notebook also works where "\\" is not a path separator;
# on Windows this yields exactly '.\\data\\afnor'.
workspace = path.join('.', 'data', 'afnor')
# Sub-directory of `workspace` containing the scraper result files.
write_dir = 'out'
# CSV (inside `workspace`) listing the standards that are already known.
file_found = 'found.csv'
# CSV (inside `workspace`) whose id column defines the standards to look for.
file_extended = 'perinorm.csv'
# Name of the id column in the found/result CSV files.
col_found_id = 'Official identifier'
# Name of the id column in the extended CSV file.
col_extended_id = 'Dokumentnummer'
# If True, search for the document family (identifier up to the first "/" or "-")
# instead of the full identifier.
search_for_document_families = True
# Maximum number of keywords handed to the scrapers per iteration.
search_batch_size = 500
# Used to split each batch of keywords into chunks for the scraper pool.
num_threads = 10

In [3]:
# scraper settings
#
# Credentials are read from the environment so that they never end up in the
# notebook or its version history. Set AFNOR_USER and AFNOR_PASSWORD before
# starting the kernel; a missing variable raises a KeyError naming it.
# PHANTOMJS_EXEC optionally overrides the location of the PhantomJS binary.
args = {
    'phantomjs_exec': os.environ.get('PHANTOMJS_EXEC', 'C:\\phantomjs.exe'),
    # keep the output directory in sync with the one refresh() reads from
    'save_to': path.join(workspace, write_dir),
    'user': os.environ['AFNOR_USER'],
    'password': os.environ['AFNOR_PASSWORD'],
    'urls': {
        'base': "https://sagaweb.afnor.org",
        'results': "/en-US/sw/Recherche/Resultat/1/?offset=10000&selectall=false",
        'logout': "/en-US/sw/Identification/Deconnexion"
    },
    'states': AfnorScraper.STATES_FIND_ALL
}

In [4]:
def refresh(target):
    """Recompute which target standards are still missing.

    Reads every ``*.csv`` file in the output directory plus the ``file_found``
    CSV, collects the values of the ``col_found_id`` column as the set of found
    standards, and derives the set of keywords already searched for from the
    names of the files in the output directory.

    :param target: set of identifiers that should be found
    :return: tuple ``(target, found, missing)`` where ``missing`` contains the
             targets that are neither found nor already searched for
    """
    print(">> refreshing...")
    out_dir = path.join(workspace, write_dir)

    # everything the scrapers have written so far, plus the initial list
    csv_files = glob.glob(path.join(out_dir, "*.csv"))
    csv_files.append(path.join(workspace, file_found))
    frames = [pd.read_csv(csv_file, sep=";", quotechar="\"", encoding="ISO-8859-1") for csv_file in csv_files]
    combined = pd.concat(frames, axis=0, ignore_index=True)
    found = set(combined[col_found_id])

    # Recover the searched keyword from each file name: drop the extension, drop
    # everything from the last "_" on, and undo the character substitutions
    # ('#' stands for '/', '=' stands for ':').
    # NOTE(review): presumably the scraper names its output "<keyword>_<suffix>.<ext>" - confirm.
    searched_for = set()
    for result_file in glob.glob(path.join(out_dir, "*.*")):
        stem = path.basename(result_file).rpartition(".")[0]
        keyword = stem.rpartition("_")[0]
        searched_for.add(keyword.replace('#', '/').replace('=', ':'))

    missing = target - found - searched_for
    summary = "".join([
        ">> target: ", str(len(target)),
        ", found: ", str(len(found)),
        ", missing: ", str(len(missing)),
    ])
    print(summary)

    return target, found, missing

In [5]:
# Load the extended list; a cell of the id column may hold several identifiers separated by "*".
extended = pd.read_csv(path.join(workspace, file_extended), sep=";", quotechar="\"", encoding="ISO-8859-1")
# Empty cells are read as NaN (a float), which has no .split(); drop them before splitting.
target = set(chain.from_iterable(ref.split("*") for ref in extended[col_extended_id].dropna()))

In [None]:
_, found, missing = refresh(target)

# Keywords already handed to the scrapers during this run. refresh() only removes
# searched keywords from `missing` by exact identifier, so in document-family mode
# (where the keyword is a truncated identifier) standards that cannot be found
# would otherwise be searched for again on every iteration and the loop would
# never terminate.
attempted = set()

while len(missing) > 0:

    # Only look for the next batch of missing standards, then refresh the keyword list.
    # This prevents searches for standards that have already been found while looking for other standards.
    keywords = []
    for standard in missing:
        # a document family is the identifier up to the first "/" or "-"
        keyword = standard.split("/")[0].split("-")[0] if search_for_document_families else standard
        # many standards share one family: search for each keyword only once
        if keyword in attempted:
            continue
        attempted.add(keyword)
        keywords.append(keyword)
        if len(keywords) >= search_batch_size:
            break

    # every remaining standard maps to a keyword that was already tried: give up
    if not keywords:
        print(">> nothing left to search for, " + str(len(missing)) + " standards remain missing")
        break

    # output
    first_next = min(len(keywords), 3)
    second_next = len(keywords) - first_next
    print(">> looking for: " + ", ".join(keywords[:first_next]) + " (" + str(second_next) + " more)")

    # run scrapers; round() yields 0 for small batches, so keep the chunk size at 1 or more
    chunk_size = max(1, round(len(keywords) / num_threads))
    s = ScraperPool(AfnorScraper, keywords, args, chunk_size=chunk_size)
    s.run()

    _, found, missing = refresh(target)

>> refreshing...
>> target: 89226, found: 32860, missing: 39528
>> looking for: PR NF C15, NF C18, NF EN 50272 (497 more)


connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
logging in
logging in
logging in
logging in
searching for NF C61
searching for PR NF C15
searching for NF E86
searching for NF EN 28733
logging in
logging in
logging in
searching for NF F01
searching for NF EN 15939
searching for NF X35
logging in
searching for PR NF C73
logging in
logging in
searching for NF C93
searching for NF X41
