In [2]:
import concurrent.futures
import mongo_storage
import pandas as pd
import scrape

In [3]:
# Load the FMCSA census extract; the file is decoded as ISO-8859-1 rather than
# pandas' default UTF-8.
census_data = pd.read_csv(
    './data/FMCSA_CENSUS1_2020Aug.txt',
    encoding="ISO-8859-1",
)

In [4]:
# Every DOT number from the census frame, in row order, as a plain Python list.
dot_numbers = census_data["DOT_NUMBER"].tolist()

# How many entries lie beyond the index currently reported by mongo_storage.
remain = len(dot_numbers) - mongo_storage.getNextIndex()

In [5]:
def chunkify(lst, chunk_size=500):
    """Lazily split ``lst`` into consecutive slices of at most ``chunk_size`` items.

    Slices are produced in order. The last one is shorter when ``len(lst)`` is
    not a multiple of ``chunk_size``, and an empty ``lst`` produces nothing.

    Args:
        lst: Any sliceable sequence with a length.
        chunk_size: Maximum number of items per slice.

    Yields:
        ``lst[start:start + chunk_size]`` for each successive ``start``.
    """
    starts = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in starts)

In [6]:
def get_and_store(items, index):
    """Scrape the registration for each carrier id in ``items`` and store the batch.

    Every id is passed to ``scrape.get_carrier_registration``. If that call
    raises, a placeholder record
    ``{'carrier_id': ..., 'failed': True, 'error': ...}`` is stored instead, so
    one bad carrier does not abort the rest of the batch. Each record gets an
    ``'index'`` key equal to ``index`` plus its offset within ``items``, and
    all records are written with a single ``insert_many`` call.

    Args:
        items: Sequence of carrier identifiers to scrape.
        index: Value assigned to ``'index'`` on the first record; subsequent
            records count up from it.

    Returns:
        None.
    """
    if len(items) == 0:
        # Nothing to scrape, so do not open a client or issue an empty insert.
        # NOTE(review): assuming ``cli`` is a pymongo collection, insert_many
        # rejects an empty document list - confirm against mongo_storage.
        return

    records = []
    for offset, carrier_id in enumerate(items):
        try:
            record = scrape.get_carrier_registration(carrier_id)
        except Exception as exc:
            # Keep the error as text. A raw exception instance is not a
            # serialisable document value, and storing it would make the
            # insert of the whole batch fail.
            record = {'carrier_id': carrier_id, 'failed': True, 'error': repr(exc)}
        record['index'] = offset + index
        records.append(record)

    with mongo_storage.GetClient() as cli:
        cli.insert_many(records)


In [9]:
# Show the index reported by mongo_storage; exp() uses this value as the start
# of the slice of DOT numbers it processes.
mongo_storage.getNextIndex()

126551

In [8]:
def exp(values, quantity=5000):
    """Scrape and store the next ``quantity`` entries of ``values`` in parallel.

    The starting position is read from ``mongo_storage.getNextIndex()``. The
    slice ``values[start:start + quantity]`` is cut into chunks of 100, and
    each chunk is handed to ``get_and_store`` in a pool of 16 worker processes
    together with the absolute index of its first element.

    Worker exceptions do not abort the run. Once every chunk has finished, a
    single ``RuntimeWarning`` summarises the chunks that raised, so failures
    are no longer silently discarded.

    Args:
        values: Sequence of carrier identifiers (the full list, not a slice).
        quantity: Maximum number of entries to process in this call.

    Returns:
        None.
    """
    import warnings

    index = mongo_storage.getNextIndex()
    vals = values[index:index + quantity]

    submitted = []  # (absolute start index of the chunk, its future)
    with concurrent.futures.ProcessPoolExecutor(max_workers=16) as executor:
        for items in chunkify(vals, chunk_size=100):
            submitted.append((index, executor.submit(get_and_store, items, index)))
            index += len(items)

    # Leaving the ``with`` block waits for all workers, so every future is done
    # here and ``exception()`` returns immediately.
    failures = []
    for start, future in submitted:
        error = future.exception()
        if error is not None:
            failures.append((start, error))

    if failures:
        first_start, first_error = failures[0]
        warnings.warn(
            "{} of {} chunks failed; first failure in chunk starting at index {}: {!r}".format(
                len(failures), len(submitted), first_start, first_error
            ),
            RuntimeWarning,
        )

In [10]:
%%timeit -n1 -r1
# -n1 -r1 limits timeit to one loop and one repeat, so this batch of up to
# 10000 entries is executed exactly once while being timed.
exp(dot_numbers, 10000)

1min 31s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [13]:
%%timeit -n1 -r1
# -n1 -r1 limits timeit to one loop and one repeat, so this batch of up to
# 5000 entries is executed exactly once while being timed.
exp(dot_numbers, 5000)

47.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
# Process everything that is left. `remain` was computed in an earlier cell, so
# it may be larger than what is actually left by now; that is harmless because
# exp() slices values[index:index + quantity] and slicing stops at the end of
# the list.
exp(dot_numbers, remain)