In [247]:
import urllib3
import json
import re
import sys
from time import time
import signal


## Read/download a website

In [243]:
def download_site(url, pool=None, decode=True):
    """
    Issue a GET request for ``url`` and time how long the request takes.
    args:
        url (str): Address of the page to fetch.
        pool (urllib3.PoolManager): Connection pool used for the request.
            When it is None a fresh PoolManager is created for this call.
        decode (bool): If True, return the response body decoded as UTF-8
            text. If False, return the response object unchanged.
    return:
        a tuple ``(site, info)`` where ``site`` is either the decoded text or
        the response object, and ``info`` is a dict holding the elapsed
        request time under the key 'fetching time (s)'.
    """
    manager = urllib3.PoolManager() if pool is None else pool

    # Only the request itself is timed; decoding happens afterwards.
    started = time()
    response = manager.request('GET', url)
    info = {'fetching time (s)': time() - started}

    if not decode:
        return response, info
    return response.data.decode("utf-8"), info


def test_download_site():
    """
    Smoke test for download_site: fetch a known page without decoding and
    check that the response has status 200 and a non-empty body.
    Performs a real HTTP request.
    """
    target = 'http://www.brainjar.com/java/host/test.html'
    response, _timing = download_site(target, urllib3.PoolManager(), decode=False)
    assert response.status == 200
    assert len(response.data) > 0

# Run the download smoke test as soon as this cell executes (issues a real HTTP request).
test_download_site()

In [244]:
def match_site(url, regex_list, pool=None, unique=False):
    """
    Download a page and collect every match of each given regex in its text.
    args:
        url (str): The url to download.
        regex_list (list): A list of regex pattern strings.
        pool (urllib3.PoolManager): The pool that we use to fetch the url.
            If it is None, download_site creates one.
        unique (bool): If True, repeated matches of a pattern are dropped,
            keeping the first occurrence of each value in page order.
            If False, every match is kept.
    return:
        a dict in which the keys are the regex strings and the values are
            the lists returned by re.findall for that pattern on the page.
        another dict of extra information: the timing returned by
            download_site plus 'processing time (s)' for the regex pass.
    """
    content, info = download_site(url, pool=pool, decode=True)
    matched = {}
    t1 = time()
    for regex in regex_list:
        found = re.findall(regex, content)
        if unique:
            # dict.fromkeys de-duplicates while preserving first-seen order;
            # findall items are str or tuples of str, both hashable.
            found = list(dict.fromkeys(found))
        matched[regex] = found
    info['processing time (s)'] = time() - t1
    return matched, info

def test_match_site():
    """
    Smoke test for match_site: scrape a known page with two patterns and
    compare against the matches expected from that page.
    Performs a real HTTP request, so the expected values depend on the
    current content of the remote page.
    """
    pool = urllib3.PoolManager()
    url = 'https://webscraper.io/test-sites/tables'
    # Pass the pool through so the explicit-pool code path is the one exercised
    # (it was previously created but never used).
    matched, _info = match_site(url, [r'@\w{10,}', r'\d{6,}'], pool=pool)
    assert (matched == {'@\\w{10,}': ['@webscraper', '@webscraper'], '\\d{6,}': ['604046']})

# Run the matching smoke test as soon as this cell executes (issues a real HTTP request).
test_match_site()

In [274]:
def create_test_json(test_file):
    """
    Creating a sample json file for testing.

    The file contains a list of 100 identical entries, each a single-key
    dict mapping a url to the list of regex strings to search for on it.
    args:
        test_file (str): name (path) of the json file to write; an existing
            file is overwritten.
    """
    data = [{'https://webscraper.io/test-sites/tables': ['\\d{4,}', '@\\w+']}] * 100
    # Use a context manager so the file is flushed and closed deterministically.
    with open(test_file, 'w') as out_file:
        json.dump(data, out_file, indent=2)

    
def download_all(input_file, output_file, unique=False):
    """
    Scrape every url listed in a json file and write all results to a json file.

    The input file holds a list of dicts, each mapping a url to a list of
    regex strings. Every url is passed to match_site; its matches and timing
    information are merged into one dict and stored as ``{url: result}``.

    Pressing Ctrl-C (SIGINT) while this runs prints a notice and stops the
    loop before the next url is started; whatever has been collected so far
    is still written to ``output_file``. The same partial save happens if an
    exception is raised while scraping (the exception then propagates). The
    SIGINT handler that was active before the call is restored on exit.

    args:
        input_file (str): path of the json file describing what to scrape.
        output_file (str): path of the json file to write the results to.
        unique (bool): forwarded to match_site.
    """
    # Results are kept in a module-level name so they remain inspectable
    # (e.g. from the notebook) after the function returns or is interrupted.
    global all_matched
    all_matched = []
    interrupted = False

    def signal_handler(sig, frame):
        # Only record the request to stop; the loop checks the flag between
        # urls and the results are written once, in the finally clause below.
        nonlocal interrupted
        print('Intrupt by user!')
        interrupted = True

    with open(input_file, 'r') as in_file:
        samples = json.load(in_file)

    previous_handler = signal.signal(signal.SIGINT, signal_handler)
    try:
        pool = urllib3.PoolManager()
        tasks = (
            (url, regex_list)
            for sample in samples
            for url, regex_list in sample.items()
        )
        for url, regex_list in tasks:
            if interrupted:
                break
            matched, info = match_site(url, regex_list, pool=pool, unique=unique)
            matched.update(info)
            all_matched.append({url: matched})
    finally:
        # signal.signal returns None when the old handler was not installed
        # from Python; such a value cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGINT, previous_handler)
        with open(output_file, 'w') as out_file:
            json.dump(all_matched, out_file, indent=2)
    
    

# Write the sample input to test.json, then scrape every entry in it and
# store the collected matches in res.json.
create_test_json('test.json')
download_all('test.json', 'res.json')

Intrupt by user!
Intrupt by user!
