Commit b56bb5e

committed
Add health check for urls
- Add url collection algorithm
- Optimize regex + config for clarity
- Handle exceptions in get_url_status
1 parent 4590557 commit b56bb5e

File tree

3 files changed: +93 -0 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -17,6 +17,10 @@ develop-eggs
 # Installer logs
 pip-log.txt
 
+# URL logs
+urlin.txt
+urlout.txt
+
 # Unit test / coverage reports
 .coverage
 .tox

check_urls.py

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
#!/usr/bin/env python
from concurrent import futures
import multiprocessing as mp
import os
import uuid

import requests
import urllib3


# Ignore security hazard since certs SHOULD be trusted (https)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Avoid rate limiting (tcp)
_URL_BOT_ID = 'Bot {id}'.format(id=str(uuid.uuid4()))
URL_HEADERS = {'User-Agent': _URL_BOT_ID}
URL_TIMEOUT = 10.0

# Sources of data (file)
IN_PATH = os.path.join(os.getcwd(), 'urlin.txt')
OUT_PATH = os.path.join(os.getcwd(), 'urlout.txt')

# Collect repository URLs (bash)
_URL_RE = 'https?:\/\/[=a-zA-Z0-9\_\/\?\&\.\-]+'  # proto://host+path+params
_FIND_URLS = "find . -type f | xargs grep -hEo '{regex}'".format(regex=_URL_RE)
_FILTER_URLS = "sed '/Binary/d' | sort | uniq > {urlin}".format(urlin=IN_PATH)
COMMAND = '{find} | {filter}'.format(find=_FIND_URLS, filter=_FILTER_URLS)


def run_workers(work, data, worker_threads=mp.cpu_count()*4):
    with futures.ThreadPoolExecutor(max_workers=worker_threads) as executor:
        future_to_result = {
            executor.submit(work, arg): arg for arg in data}
        for future in futures.as_completed(future_to_result):
            yield future.result()


def get_url_status(url):
    for local in ('localhost', '127.0.0.1', 'app_server'):
        if url.startswith('http://' + local):
            return (url, 0)
    clean_url = url.strip('?.')
    try:
        response = requests.get(
            clean_url, verify=False, timeout=URL_TIMEOUT,
            headers=URL_HEADERS)
        return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    except requests.exceptions.ConnectionError:
        return (clean_url, -1)


def bad_url(url_status):
    if url_status == -1:
        return True
    elif url_status == 401 or url_status == 403:
        return False
    elif url_status == 503:
        return False
    elif url_status >= 400:
        return True
    return False


def main():
    print('Extract urls...')
    os.system(COMMAND)
    with open(IN_PATH, 'r') as fr:
        urls = map(lambda l: l.strip('\n'), fr.readlines())
    with open(OUT_PATH, 'w') as fw:
        url_id = 1
        max_strlen = -1
        for url_path, url_status in run_workers(get_url_status, urls):
            output = 'Currently checking: id={uid} host={uhost}'.format(
                uid=url_id, uhost=urllib3.util.parse_url(url_path).host)
            if max_strlen < len(output):
                max_strlen = len(output)
            print(output.ljust(max_strlen), end='\r')
            if bad_url(url_status) is True:
                fw.write('{}: {}\n'.format(url_path, url_status))
            url_id += 1
    print('\nDone.')


if __name__ == '__main__':
    main()
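
For orientation, a minimal sketch of how the helpers in check_urls.py compose; the sample URLs below are invented for illustration and are not part of this commit:

# hypothetical_smoke_test.py -- assumes check_urls.py sits on the import path;
# the sample URLs are made up for demonstration only.
from check_urls import bad_url, get_url_status, run_workers

if __name__ == '__main__':
    sample_urls = [
        'https://example.com/',          # ordinary request, reports its HTTP status
        'http://localhost:8000/health',  # local host: short-circuited to status 0
    ]
    # run_workers fans the checks out over a thread pool and yields each
    # (url, status) tuple as it completes, so slow hosts do not block fast ones.
    for url, status in run_workers(get_url_status, sample_urls):
        verdict = 'BAD' if bad_url(status) else 'ok'
        print('{}: {} ({})'.format(url, status, verdict))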

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
 boto3==1.4.4
 Markdown==2.6.10
 Pelican==3.7.1
+requests==2.18.4
+urllib3==1.22
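
A usage note rather than part of the diff: with these pins installed, the checker is intended to be run from the root of the repository being scanned (it uses `find .` and writes its files into the current directory); the exact invocation below is an assumption, not taken from the commit:

pip install -r requirements.txt   # pulls in requests==2.18.4 and urllib3==1.22
python check_urls.py              # scans the working tree, writes bad links to urlout.txt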
