from concurrent import futures
import multiprocessing as mp
import os
+import json
import uuid

+from bs4 import BeautifulSoup
+from markdown import markdown
import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Avoid rate limiting (tcp)
-_URL_BOT_ID = 'Bot {id}'.format(id=str(uuid.uuid4()))
-URL_HEADERS = {'User-Agent': _URL_BOT_ID}
-URL_TIMEOUT = 10.0
-
-# Sources of data (file)
-IN_PATH = os.path.join(os.getcwd(), 'urlin.txt')
-OUT_PATH = os.path.join(os.getcwd(), 'urlout.txt')
-
-# Collect repository URLs (bash)
-_URL_RE = 'https?:\/\/[=a-zA-Z0-9\_\/\?\&\.\-]+'  # proto://host+path+params
-_FIND_URLS = "find . -type f | xargs grep -hEo '{regex}'".format(regex=_URL_RE)
-_FILTER_URLS = "sed '/Binary/d' | sort | uniq > {urlin}".format(urlin=IN_PATH)
-COMMAND = '{find} | {filter}'.format(find=_FIND_URLS, filter=_FILTER_URLS)
+URL_BOT_ID = f'Bot {str(uuid.uuid4())}'
+
+
+def extract_urls_from_html(content, all_urls):
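+    # Collect absolute links: every <a href> value that starts with 'http'.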
+    soup = BeautifulSoup(content, 'html.parser')
+    for a in soup.find_all('a', href=True):
+        url = a['href']
+        if url.startswith('http'):
+            all_urls.add(url)
+
+
+def extract_urls(discover_path):
+    exclude = ['.git', '.vscode']
+    all_urls = set()
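+    # Widest progress line printed so far; lets '\r' updates pad out cleanly.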
+    max_strlen = -1
+    for root, dirs, files in os.walk(discover_path, topdown=True):
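+        # Prune excluded directories in place so os.walk never descends into them.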
+        dirs[:] = [d for d in dirs if d not in exclude]
+        for file in files:
+            output = f'Currently checking: file={file}'
+            file_path = os.path.join(root, file)
+            if max_strlen < len(output):
+                max_strlen = len(output)
+            print(output.ljust(max_strlen), end='\r')
+            if file_path.endswith('.html'):
+                content = open(file_path)
+                extract_urls_from_html(content, all_urls)
+            elif file_path.endswith('.markdown'):
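+                # Render Markdown to HTML first, then reuse the same anchor scan.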
+                content = markdown(open(file_path).read())
+                extract_urls_from_html(content, all_urls)
+    return all_urls


def run_workers(work, data, worker_threads=mp.cpu_count()*4):
@@ -42,13 +62,15 @@ def get_url_status(url):
    clean_url = url.strip('?.')
    try:
        response = requests.get(
-            clean_url, verify=False, timeout=URL_TIMEOUT,
-            headers=URL_HEADERS)
+            clean_url, verify=False, timeout=10.0,
+            headers={'User-Agent': URL_BOT_ID})
        return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    except requests.exceptions.ConnectionError:
        return (clean_url, -1)
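+    # A redirect loop gets the same sentinel as a connection failure.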
+    except requests.exceptions.TooManyRedirects:
+        return (clean_url, -1)


def bad_url(url_status):
@@ -65,22 +87,20 @@ def bad_url(url_status):

def main():
    print('Extract urls...')
-    os.system(COMMAND)
-    with open(IN_PATH, 'r') as fr:
-        urls = map(lambda l: l.strip('\n'), fr.readlines())
-    with open(OUT_PATH, 'w') as fw:
-        url_id = 1
-        max_strlen = -1
-        for url_path, url_status in run_workers(get_url_status, urls):
-            output = 'Currently checking: id={uid} host={uhost}'.format(
-                uid=url_id, uhost=urllib3.util.parse_url(url_path).host)
-            if max_strlen < len(output):
-                max_strlen = len(output)
-            print(output.ljust(max_strlen), end='\r')
-            if bad_url(url_status) is True:
-                fw.write('{}: {}\n'.format(url_path, url_status))
-            url_id += 1
-    print('\nDone.')
+    all_urls = extract_urls(os.getcwd())
+    print('\nCheck urls...')
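+    # Map each URL that fails the check to the status code it returned.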
+    bad_urls = {}
+    url_id = 1
+    max_strlen = -1
+    for url_path, url_status in run_workers(get_url_status, all_urls):
+        output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
+        if max_strlen < len(output):
+            max_strlen = len(output)
+        print(output.ljust(max_strlen), end='\r')
+        if bad_url(url_status) is True:
+            bad_urls[url_path] = url_status
+        url_id += 1
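+    # Report all failures at once as pretty-printed JSON.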
+    print(f'\nBad urls: {json.dumps(bad_urls, indent=4)}')


if __name__ == '__main__':
    main()