#!/usr/bin/env python
from concurrent import futures
import multiprocessing as mp
import os
import uuid

import requests
import urllib3


# Suppress InsecureRequestWarning: the checks below run with verify=False,
# so certificate warnings would only drown out real output.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Identify each run with a unique User-Agent; some hosts throttle or block
# default client signatures.
_URL_BOT_ID = 'Bot {id}'.format(id=uuid.uuid4())
URL_HEADERS = {'User-Agent': _URL_BOT_ID}
URL_TIMEOUT = 10.0

# Input/output files (written to the current working directory)
IN_PATH = os.path.join(os.getcwd(), 'urlin.txt')
OUT_PATH = os.path.join(os.getcwd(), 'urlout.txt')

# Collect repository URLs by shelling out to find/grep/sed
_URL_RE = r'https?://[=a-zA-Z0-9_/?&.-]+'  # proto://host+path+params
_FIND_URLS = "find . -type f | xargs grep -hEo '{regex}'".format(regex=_URL_RE)
_FILTER_URLS = "sed '/Binary/d' | sort | uniq > {urlin}".format(urlin=IN_PATH)
COMMAND = '{find} | {filter}'.format(find=_FIND_URLS, filter=_FILTER_URLS)
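# For reference, COMMAND assembles to a pipeline of roughly this shape
# (assuming the script is run from the repository root):
#   find . -type f | xargs grep -hEo 'https?://...' \
#       | sed '/Binary/d' | sort | uniq > ./urlin.txt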


def run_workers(work, data, worker_threads=mp.cpu_count()*4):
    """Map `work` over `data` on a thread pool, yielding results as they finish."""
    with futures.ThreadPoolExecutor(max_workers=worker_threads) as executor:
        future_to_result = {
            executor.submit(work, arg): arg for arg in data}
        for future in futures.as_completed(future_to_result):
            yield future.result()
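# Usage sketch (hypothetical URL): results stream back in completion order,
# not submission order:
#   for url, status in run_workers(get_url_status, ['https://example.com']):
#       print(url, status)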


def get_url_status(url):
    # Skip URLs that only resolve inside the local/dev environment.
    for local in ('localhost', '127.0.0.1', 'app_server'):
        if url.startswith('http://' + local):
            return (url, 0)
    # grep may capture trailing punctuation; trim it before requesting.
    clean_url = url.rstrip('?.')
    try:
        response = requests.get(
            clean_url, verify=False, timeout=URL_TIMEOUT,
            headers=URL_HEADERS)
        return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    except requests.exceptions.RequestException:
        # Connection failures, malformed URLs, redirect loops, etc.
        return (clean_url, -1)
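# Example returns (sentinels as defined above): a reachable URL yields its
# HTTP status, e.g. ('https://example.com', 200); local URLs short-circuit
# to 0, timeouts map to 504, and other request failures to -1.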


def bad_url(url_status):
    if url_status == -1:
        return True   # could not connect at all
    elif url_status in (401, 403):
        return False  # reachable, just auth-gated
    elif url_status == 503:
        return False  # likely transient; do not flag
    elif url_status >= 400:
        return True   # other client/server error
    return False
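# e.g. bad_url(404) -> True, bad_url(403) -> False (auth wall, not broken),
#      bad_url(503) -> False (transient), bad_url(-1) -> True (unreachable)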


def main():
    print('Extracting URLs...')
    os.system(COMMAND)
    with open(IN_PATH, 'r') as fr:
        urls = [line.strip('\n') for line in fr]
    with open(OUT_PATH, 'w') as fw:
        url_id = 1
        max_strlen = -1
        for url_path, url_status in run_workers(get_url_status, urls):
            # Overwrite a single status line; pad so shorter lines fully
            # erase longer previous ones.
            output = 'Currently checking: id={uid} host={uhost}'.format(
                uid=url_id, uhost=urllib3.util.parse_url(url_path).host)
            max_strlen = max(max_strlen, len(output))
            print(output.ljust(max_strlen), end='\r')
            if bad_url(url_status):
                fw.write('{}: {}\n'.format(url_path, url_status))
            url_id += 1
    print('\nDone.')


if __name__ == '__main__':
    main()
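# Invocation sketch (filename assumed): run from the repository root so
# `find .` scans the intended tree, e.g.  $ python check_urls.py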