In [15]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import getpass
import glob
import os
import warnings
from itertools import chain
from os import path

import pandas as pd

from scrapers.scrapepool import ScrapePool
from scrapers.afnor import AfnorScraper

# Disable all warnings, because pandas emits warnings about mixed-type columns on almost every read.
warnings.filterwarnings('ignore')

%autoreload

In [17]:
workspace = '.\\data\\afnor'
write_dir = 'out'
file_found = 'found.csv'
file_extended = 'perinorm.csv'
col_found_id = 'Official identifier'
col_extended_id = 'Dokumentnummer'
search_for_document_families = True
search_batch_size = 500
num_threads = 10

In [18]:
# scraper settings
args = {
    'phantomjs_exec': 'C:\\phantomjs.exe',
    'save_to': '.\\data\\afnor\\out',
    'user': 'knut.blind@tu-berlin.de',
    'password': 'kblind',
    'urls': {
        'base': "https://sagaweb.afnor.org",
        'results': "/en-US/sw/Recherche/Resultat/1/?offset=10000&selectall=false",
        'logout': "/en-US/sw/Identification/Deconnexion"
    },
    'states': AfnorScraper.STATES_FIND_ALL
}

In [19]:
def refresh(target):
    print(">> refreshing...")
    inputs = glob.glob(path.join(workspace, write_dir, "*.csv"))
    inputs.append(path.join(workspace, file_found))
    df = pd.concat((pd.read_csv(f, sep=";", quotechar="\"", encoding="ISO-8859-1") for f in inputs), axis=0, ignore_index=True)
    found = set(df[col_found_id])
    searched_for = set(["_".join(".".join(path.basename(f).split(".")[:-1]).split("_")[:-1]).replace('#', '/').replace('=',':') for f in glob.glob(path.join(workspace, write_dir, "*.*"))])
    missing = (target - found) - searched_for
    print(">> target: " + str(len(target)) + ", found: " + str(len(found)) + ", missing: " + str(len(missing)))
    
    return target, found, missing

In [None]:
extended = pd.read_csv(path.join(workspace, file_extended), sep=";", quotechar="\"", encoding="ISO-8859-1")
target = set(chain.from_iterable([ref.split("*") for ref in extended[col_extended_id].tolist()]))
# manipulate keywords to search for document families
if search_for_document_families:
    target = set([k.split("/")[0].split("-")[0] for k in target])

In [None]:
_, found, missing = refresh(target)

last_missing = None

# make sure we're not repeatedly looking for the same thing
while len(missing) > 0 and last_missing != missing:
    
    last_missing = set(missing)
    
    # Always only look for first n missing standards, then refresh keyword-list.
    # This prevent searches for standards that have already been found while looking for other standards.
    keywords = list(missing)[:min(len(missing), search_batch_size)]
    
    # output
    first_next = min(len(keywords), 3)
    second_next = len(keywords) - first_next
    print(">> looking for: " + ", ".join(keywords[:first_next]) + " (" + str(second_next) + " more)")
    
    # run scrapers
    s = ScrapePool(AfnorScraper, keywords, args, chunk_size=round(len(keywords)/num_threads))
    s.run()
    
    _, found, missing = refresh(target)

>> refreshing...
>> target: 20532, found: 58542, missing: 918
>> looking for: NF EN ISO 22434, NF EN 54, NF EN ISO 10536 (497 more)


connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
logging in
logging in
logging in
logging in
searching for NF EN ISO 6974
searching for NF EN ISP 11190
searching for NF EN 15436
searching for NF EN ISO 22434
logging in
logging in
searching for NF ISO 17586
searching for NF EN 54
logging in
searching for NF EN 12412
logging in
searching for NF L01
logging in
searching for NF EN 60931
logging in
searching for NF EN 15878
logging in
logging in
searching for PR NF EN 17054
searching for PR NF EN 13374
logging in
logging in
searching for NF EN 300185
searching for NF ENV 12836
logging in
logging in
searching for NF ISO 8917
searching for NF EN 26599
logging in
searching for PR NF EN ISO 10139
logging in
searching for NF EN I

searching for PR NF EN 15746
logging in
logging in
searching for NF U41
searching for NF EN 61918
logging in
searching for NF EN 1452
logging in
searching for NF EN ISO 7093
logging in
searching for NF EN 60598
logging in
searching for NF EN 4137
logging in
logging in
searching for NF EN ISO 14405
searching for NF ISO 2248
logging in
searching for NF EN 2484
logging in
searching for PR NF EN 16719
logging in
searching for NF EN 1322
logging in
searching for NF EN 60953
logging in
logging in
searching for NF V21
searching for NF ISO 3462
logging in
searching for NF ISO 16290
logging in
logging in
searching for NF ISO 13381
logging in
searching for PR NF E29
searching for PR NF EN 16777
logging in
logging in
searching for NF EN 301001
searching for NF ISO 7641
logging in
searching for NF EN 60687
logging in
logging in
searching for NF Q32
searching for NF EN 14775
logging in
searching for PR NF ISO 16254
logging in
searching for NF EN ISO 4126
logging in
searching for NF EN 844
logging i

logging in
logging in
searching for NF R19
searching for NF EN 14636
logging in
searching for NF EN ISO 9233
logging in
searching for NF ISO 3408
logging in
searching for NF EN 15583
logging in
searching for NF ISO 7982
logging in
logging in
searching for NF EN 13321
logging in
searching for NF EN ISO 17636
searching for NF EN ISO 12732
logging in
searching for NF EN 13966
logging in
logging in
searching for NF EN ISO 4287
searching for NF DTU 36.5 P1
logging in
logging in
searching for PR NF S72
searching for PR NF EN 12807
logging in
searching for NF S82
logging in
searching for NF EN ISO 1989
logging in
searching for NF EN 4462
logging in
searching for NF EN ISO 15216
logging in
logging in
searching for NF ISO 7663
searching for NF EN ISO 16634
logging in
searching for NF ISO 23830
logging in
searching for NF ISO 5389
logging in
searching for NF EN 62044
logging in
searching for NF EN 26591
logging in
searching for PR NF EN 716
logging in
searching for NF ISO 16733
logging in
search

In [None]:
extended = pd.read_csv(path.join(workspace, file_extended), sep=";", quotechar="\"", encoding="ISO-8859-1")
target = set(chain.from_iterable([ref.split("*") for ref in extended[col_extended_id].tolist()]))

In [None]:
_, found, missing = refresh(target)

last_missing = None

# make sure we're not repeatedly looking for the same thing
while len(missing) > 0 and last_missing != missing:
    
    last_missing = set(missing)
    
    # Always only look for first n missing standards, then refresh keyword-list.
    # This prevent searches for standards that have already been found while looking for other standards.
    keywords = list(missing)[:min(len(missing), search_batch_size)]
    
    # output
    first_next = min(len(keywords), 3)
    second_next = len(keywords) - first_next
    print(">> looking for: " + ", ".join(keywords[:first_next]) + " (" + str(second_next) + " more)")
    
    # run scrapers
    s = ScrapePool(AfnorScraper, keywords, args, chunk_size=round(len(keywords)/num_threads))
    s.run()
    
    _, found, missing = refresh(target)