In [1]:
import getpass
import glob
import os
import warnings
from itertools import chain
from os import path

import pandas as pd

from scrapers.afnor import AfnorScraper
from scrapers.scraper import ScraperPool

# disable warnings, because pandas throws warnings for mixed type columns all the time..
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the input files; built with path.join so the notebook also works
# on systems that do not use "\\" as path separator.
workspace = path.join('.', 'data', 'afnor')
# Sub-directory of `workspace` that is scanned for result files.
write_dir = 'out'
# CSV (inside `workspace`) with the documents that are already known.
file_found = 'found.csv'
# CSV (inside `workspace`) whose identifier column defines what to search for.
file_extended = 'perinorm.csv'
# Identifier column names in the found/result files and in the extended file.
col_found_id = 'Official identifier'
col_extended_id = 'Dokumentnummer'
# If True, keywords are shortened to the part before the first "/" or "-".
search_for_document_families = True
# Maximum number of keywords handed to the scrapers per loop iteration.
search_batch_size = 500
# Used to derive the chunk size passed to the scraper pool.
num_threads = 10

In [3]:
# scraper settings (handed to ScraperPool together with AfnorScraper)
#
# Credentials must not live in the notebook: they are read from the environment
# variables AFNOR_USER / AFNOR_PASSWORD, and only if those are unset (or empty) the
# user is prompted interactively. The PhantomJS location can be overridden with
# PHANTOMJS_EXEC; the previous hard-coded path stays the default.
args = {
    'phantomjs_exec': os.environ.get('PHANTOMJS_EXEC', 'C:\\phantomjs.exe'),
    # Derived from the configuration instead of repeating the literal path, so it
    # always points at the directory that refresh() scans for result files.
    'save_to': path.join(workspace, write_dir),
    'user': os.environ.get('AFNOR_USER') or input('AFNOR user: '),
    'password': os.environ.get('AFNOR_PASSWORD') or getpass.getpass('AFNOR password: '),
    'urls': {
        'base': "https://sagaweb.afnor.org",
        'results': "/en-US/sw/Recherche/Resultat/1/?offset=10000&selectall=false",
        'logout': "/en-US/sw/Identification/Deconnexion"
    },
    'states': AfnorScraper.STATES_FIND_ALL
}

In [4]:
def refresh(target):
    """Recompute which target identifiers are still missing.

    Reads every ``*.csv`` in ``workspace/write_dir`` plus ``workspace/file_found``
    and collects the values of the ``col_found_id`` column as the found set.
    The keywords that were already searched for are derived from the names of
    all files (``*.*``) in ``workspace/write_dir``: the extension and the part
    after the last ``_`` are dropped, then ``#`` is mapped back to ``/`` and
    ``=`` to ``:``. (Presumably the scraper names its files
    ``<keyword>_<suffix>.<ext>`` -- confirm against the scraper code.)

    Args:
        target: set of identifiers/keywords that should be available.

    Returns:
        Tuple ``(target, found, missing)`` where ``missing`` contains the
        targets that are neither found nor already searched for.
    """
    print(">> refreshing...")
    out_dir = path.join(workspace, write_dir)

    # Result files first, the "found" file last.
    csv_paths = glob.glob(path.join(out_dir, "*.csv"))
    csv_paths.append(path.join(workspace, file_found))
    frames = [
        pd.read_csv(csv_path, sep=";", quotechar="\"", encoding="ISO-8859-1")
        for csv_path in csv_paths
    ]
    combined = pd.concat(frames, axis=0, ignore_index=True)
    found = set(combined[col_found_id])

    # Rebuild the searched keywords from the result file names.
    searched_for = set()
    for result_path in glob.glob(path.join(out_dir, "*.*")):
        stem = path.basename(result_path).rpartition(".")[0]   # drop the extension
        keyword = stem.rpartition("_")[0]                      # drop the trailing "_<suffix>"
        searched_for.add(keyword.replace('#', '/').replace('=', ':'))

    missing = (target - found) - searched_for
    print(">> target: ", len(target), ", found: ", len(found), ", missing: ", len(missing), sep="")

    return target, found, missing

In [None]:
# Load the extended export; its identifier column defines what has to be searched for.
extended = pd.read_csv(path.join(workspace, file_extended), sep=";", quotechar="\"", encoding="ISO-8859-1")
# A cell may hold several references separated by "*". Empty cells (NaN) are skipped
# and all values are converted to str, otherwise `.split` fails on non-string cells.
references = extended[col_extended_id].dropna().astype(str)
target = set(chain.from_iterable(ref.split("*") for ref in references))
# manipulate keywords to search for document families:
# keep only the part before the first "/" and before the first "-"
if search_for_document_families:
    target = {k.split("/")[0].split("-")[0] for k in target}

In [None]:
# Not unpacked into `_`: in a notebook that name is IPython's "last output" variable.
_target, found, missing = refresh(target)

while len(missing) > 0:

    # Always only look for the first n missing standards, then refresh the keyword list.
    # This prevents searches for standards that have already been found while looking
    # for other standards. (Slicing already clamps to the length of the list.)
    keywords = list(missing)[:search_batch_size]
    if not keywords:
        # Only possible with a non-positive search_batch_size; nothing can be searched.
        print(">> empty batch (check search_batch_size), stopping")
        break

    # output
    first_next = min(len(keywords), 3)
    second_next = len(keywords) - first_next
    print(">> looking for: " + ", ".join(keywords[:first_next]) + " (" + str(second_next) + " more)")

    # run scrapers
    # Ceiling division, at least 1: `round()` yields a chunk size of 0 for small
    # batches and can produce more chunks than `num_threads`.
    # NOTE(review): presumably ScraperPool starts one worker per chunk -- confirm.
    chunk_size = max(1, (len(keywords) + num_threads - 1) // num_threads)
    s = ScraperPool(AfnorScraper, keywords, args, chunk_size=chunk_size)
    s.run()

    previous_missing = missing
    _target, found, missing = refresh(target)
    if missing == previous_missing:
        # The batch produced no new result files; repeating it would loop forever.
        print(">> no progress in the last batch, stopping to avoid an endless loop")
        break

>> refreshing...
>> target: 20532, found: 48428, missing: 6085
>> looking for: PR NF EN 1645, NF EN 1463, NF ISO 19229 (497 more)


connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
logging in
logging in
logging in
logging in
searching for NF EN 3682
searching for NF EN 50344
searching for PR NF EN 1645
searching for NF EN 62429
logging in
logging in
logging in
searching for NF EN ISO 3611
searching for NF EN 1463
searching for NF EN 28662
logging in
searching for NF ISO 19229
logging in
searching for NF EN 1611
logging in
searching for NF EN ISO 4674
logging in
searching for NF ISO 7190
logging in
logging in
searching for NF EN 62475
searching for NF EN ISO 25237
logging in
logging in
searching for NF EN ISO 3377
searching for NF EN ISO 6271
logging in
logging in
searching for NF S81
searching for NF V29
logging in
logging in
searching for PR NF EN 

logging in
searching for NF EN 459
logging in
searching for NF EN 353
logging in
searching for NF EN ISO 14174
logging in
searching for PR NF S76
logging in
searching for NF ISO 3070
logging in
searching for NF EN 13201
logging in
searching for NF EN 301490
logging in
logging in
searching for PR NF L46
searching for NF EN 4267
logging in
searching for NF EN ISO 7278
logging in
searching for NF ISO 23470
logging in
searching for NF EN ISO 10705
logging in
logging in
searching for NF EN ISO 2884
searching for PR NF EN 12814
logging in
searching for NF EN 12546
logging in
searching for NF ISO 9467
logging in
searching for NF EN ISO 14530
logging in
searching for NF EN ISO 9117
logging in
searching for NF EN 15705
logging in
logging in
searching for NF EN ISO 12006
searching for NF EN 123300
logging in
searching for NF ISO 8791
logging in
logging in
searching for PR NF EN 1459
searching for NF ISO 8613
logging in
searching for NF EN ISO 21415
logging in
searching for PR NF EN 14478
logging

logging in
searching for NF EN 12505+A1
logging in
searching for NF ISO 5414
logging in
searching for PR NF M40
logging in
searching for NF EN 12820
logging in
searching for PR NF EN ISO 5801
logging in
searching for PR NF EN ISO 20166
logging in
searching for NF DTU 25.41 P1
logging in
searching for PR NF ISO 12917
logging in
searching for NF P85
logging in
searching for NF EN 16604
logging in
searching for NF EN 1425
logging in
searching for PR NF EN ISO 13694
logging in
searching for PR NF D60
logging in
searching for NF EN ISO 7622
logging in
searching for NF ISO 6550
logging in
searching for NF EN 4626
logging in
searching for PR NF EN 15471
logging in
searching for NF EN 6037
logging in
searching for NF ISO 9220
logging in
searching for PR NF EN ISO 13260
logging in
searching for NF ISO 14401
logging in
searching for NF EN 14033
logging in
searching for NF EN 300402
logging in
searching for NF EN 62156
logging in
searching for NF ISO 1268
logging in
searching for NF ISO 8253
logg

>> refreshing...
>> target: 20532, found: 49452, missing: 5567
>> looking for: PR NF ISO 13373, NF EN 10220, PR NF EN 17106 (497 more)


connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
connection tested successfully!
logging in
logging in
logging in
logging in
searching for NF EN 4328
searching for NF EN 933
searching for NF EN ISO 1927
searching for PR NF ISO 13373
logging in
logging in
searching for NF EN 10220
searching for PR NF ISO 17179
logging in
searching for PR NF ISO 18095
logging in
searching for NF EN 50067
logging in
logging in
searching for NF R24
searching for NF EN 10253
logging in
searching for NF T23
logging in
logging in
searching for PR NF EN 17106
searching for PR NF EN 17088
logging in
logging in
searching for NF DTU 45.2
searching for PR NF C11
logging in
logging in
searching for PR NF U02
logging in
searching for NF G50
searching for PR NF C30
logging in
loggin

logging in
logging in
searching for NF EN ISO 2812
searching for NF EN ISO 14819
logging in
searching for NF EN ISO 712
logging in
searching for NF EN 3379
logging in
searching for NF G20
logging in
searching for NF EN 1176
logging in
searching for NF EN 30139
logging in
searching for NF EN 62416
logging in
searching for NF ISO 17512
logging in
searching for NF EN 50102
logging in
searching for NF EN 60358
logging in
searching for NF EN 12920
logging in
searching for NF U10
logging in
searching for NF EN ISO 13693
logging in
searching for PR NF EN ISO 16283
logging in
searching for PR NF Z71
logging in
logging in
searching for NF EN ISO 7536
searching for NF EN 4402
logging in
searching for NF ENV ISO 14253
logging in
searching for NF EN 3268
logging in
searching for PR NF EN 15004
logging in
searching for NF EN 23326
logging in
searching for NF EN 15224
logging in
searching for PR NF ISO 20507
logging in
logging in
searching for NF EN 16841
searching for NF ISO 2562
logging in
searchi