from concurrent import futures
import multiprocessing as mp
import os
+import json
import uuid

+from bs4 import BeautifulSoup
+from markdown import markdown
import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Avoid rate limiting (tcp)
-_URL_BOT_ID = 'Bot {id}'.format(id=str(uuid.uuid4()))
-URL_HEADERS = {'User-Agent': _URL_BOT_ID}
-URL_TIMEOUT = 10.0
-
-# Sources of data (file)
-IN_PATH = os.path.join(os.getcwd(), 'urlin.txt')
-OUT_PATH = os.path.join(os.getcwd(), 'urlout.txt')
-
-# Collect repository URLs (bash)
-_URL_RE = 'https?:\/\/[=a-zA-Z0-9\_\/\?\&\.\-]+'  # proto://host+path+params
-_FIND_URLS = "find . -type f | xargs grep -hEo '{regex}'".format(regex=_URL_RE)
-_FILTER_URLS = "sed '/Binary/d' | sort | uniq > {urlin}".format(urlin=IN_PATH)
-COMMAND = '{find} | {filter}'.format(find=_FIND_URLS, filter=_FILTER_URLS)
+URL_BOT_ID = f'Bot {str(uuid.uuid4())}'
+
+
+def extract_urls_from_html(content, all_urls):
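+    # Collect absolute links: every <a href> value that starts with 'http'.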
+    soup = BeautifulSoup(content, 'html.parser')
+    for a in soup.find_all('a', href=True):
+        url = a['href']
+        if url.startswith('http'):
+            all_urls.add(url)
+
+
+def extract_urls(discover_path):
+    exclude = ['.git', '.vscode']
+    all_urls = set()
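+    # Widest progress line printed so far; lets '\r' updates pad out cleanly.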
+    max_strlen = -1
+    for root, dirs, files in os.walk(discover_path, topdown=True):
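+        # Prune excluded directories in place so os.walk never descends into them.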
+        dirs[:] = [d for d in dirs if d not in exclude]
+        for file in files:
+            output = f'Currently checking: file={file}'
+            file_path = os.path.join(root, file)
+            if max_strlen < len(output):
+                max_strlen = len(output)
+            print(output.ljust(max_strlen), end='\r')
+            if file_path.endswith('.html'):
+                content = open(file_path)
+                extract_urls_from_html(content, all_urls)
+            elif file_path.endswith('.markdown'):
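+                # Render Markdown to HTML first, then reuse the same anchor scan.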
+                content = markdown(open(file_path).read())
+                extract_urls_from_html(content, all_urls)
+    return all_urls


def run_workers(work, data, worker_threads=mp.cpu_count()*4):
@@ -42,13 +62,15 @@ def get_url_status(url):
    clean_url = url.strip('?.')
    try:
        response = requests.get(
-            clean_url, verify=False, timeout=URL_TIMEOUT,
-            headers=URL_HEADERS)
+            clean_url, verify=False, timeout=10.0,
+            headers={'User-Agent': URL_BOT_ID})
        return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    except requests.exceptions.ConnectionError:
        return (clean_url, -1)
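+    # A redirect loop gets the same sentinel as a connection failure.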
+    except requests.exceptions.TooManyRedirects:
+        return (clean_url, -1)


def bad_url(url_status):
@@ -65,22 +87,20 @@ def bad_url(url_status):

def main():
    print('Extract urls...')
-    os.system(COMMAND)
-    with open(IN_PATH, 'r') as fr:
-        urls = map(lambda l: l.strip('\n'), fr.readlines())
-    with open(OUT_PATH, 'w') as fw:
-        url_id = 1
-        max_strlen = -1
-        for url_path, url_status in run_workers(get_url_status, urls):
-            output = 'Currently checking: id={uid} host={uhost}'.format(
-                uid=url_id, uhost=urllib3.util.parse_url(url_path).host)
-            if max_strlen < len(output):
-                max_strlen = len(output)
-            print(output.ljust(max_strlen), end='\r')
-            if bad_url(url_status) is True:
-                fw.write('{}: {}\n'.format(url_path, url_status))
-            url_id += 1
-    print('\nDone.')
+    all_urls = extract_urls(os.getcwd())
+    print('\nCheck urls...')
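+    # Map each URL that fails the check to the status code it returned.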
+    bad_urls = {}
+    url_id = 1
+    max_strlen = -1
+    for url_path, url_status in run_workers(get_url_status, all_urls):
+        output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
+        if max_strlen < len(output):
+            max_strlen = len(output)
+        print(output.ljust(max_strlen), end='\r')
+        if bad_url(url_status) is True:
+            bad_urls[url_path] = url_status
+        url_id += 1
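+    # Report all failures at once as pretty-printed JSON.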
+    print(f'\nBad urls: {json.dumps(bad_urls, indent=4)}')


if __name__ == '__main__':
    main()