Cloning non-wordpress sites (#26)
* clone with relative links

* save files with query, add request timeout

* return get query as a file
afeena authored and glaslos committed Jul 18, 2016
1 parent 3fd8b5f commit 7bb381e
Showing 2 changed files with 44 additions and 10 deletions.
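The "add request timeout" item in the commit message lands in clone.py's get_body, shown in the first diff below. As a standalone illustration of that pattern only, here is a minimal sketch written against the same aiohttp 0.x-era API the diff uses (aiohttp.Timeout as a context manager inside an old-style @asyncio.coroutine, both removed from current Python/aiohttp); the function name fetch_with_timeout, the 10-second default, and the example URL are illustrative and not part of the project.

import asyncio

import aiohttp


@asyncio.coroutine
def fetch_with_timeout(url, timeout=10.0):
    # Give up on the whole connect/read cycle after `timeout` seconds instead of
    # hanging forever on a slow or unresponsive target.
    data = None
    try:
        with aiohttp.Timeout(timeout):
            with aiohttp.ClientSession() as session:
                response = yield from session.get(url)
                data = yield from response.read()
                response.release()
    except Exception as error:
        print(error)
    return data


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    body = loop.run_until_complete(fetch_with_timeout('http://example.com'))
    print(len(body) if body else 'no data received')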
39 changes: 32 additions & 7 deletions clone.py
@@ -49,30 +49,50 @@ def replace_links(self, data, domain, urls):
    soup = BeautifulSoup(data, 'html.parser')
    patt = '.*' + domain + '.*'

    # find all relative links
    for link in soup.findAll(True, attrs={'href': re.compile('^((?!http|\/\/|\.\.).)*$')}):
        if link['href'].startswith('/'):
            link['href'] = link['href'][1:]
        abs_link = 'http://' + domain + link['href']
        urls.append(abs_link)

    # find all absolute links
    for link in soup.findAll(True, attrs={'href': re.compile(patt)}):
        urls.append(link['href'])
        link['href'] = self.make_new_link(link['href'])

    # find all images and scripts
    for elem in soup.findAll(True, attrs={'src': re.compile('^((?!http|\/\/|\.\.).)*$')}):
        abs_link = 'http://' + domain + elem['src']
        urls.append(abs_link)

    # find all action elements
    for act_link in soup.findAll(True, attrs={'action': re.compile(patt)}):
        urls.append(act_link['action'])
        act_link['action'] = self.make_new_link(act_link['action'])
    urls = list(set(urls))
    return soup

@asyncio.coroutine
def get_body(self, root_url, urls, visited_urls):
    visited_urls.append(root_url)
    if not root_url.startswith("http"):
        root_url = 'http://' + root_url
    visited_urls.append(root_url)
    parsed_url = urlparse(root_url)
    if parsed_url.fragment:
        return
    domain = parsed_url.netloc
    if not domain.endswith('/'):
        domain += '/'
    file_name = self.make_new_link(root_url)

    file_path = ''
    patt = '/.*/.*\.'
    if re.match(patt, file_name):
        file_path, file_name = file_name.rsplit('/', 1)
        file_path += '/'
    if parsed_url.query:
        file_name += '?' + parsed_url.query
    print('path: ', file_path, 'name: ', file_name)
    if len(domain) < 4:
        sys.exit('invalid taget {}'.format(root_url))
@@ -85,13 +105,15 @@ def get_body(self, root_url, urls, visited_urls):

    data = None
    try:
        with aiohttp.ClientSession() as session:
            response = yield from session.get(root_url)
            data = yield from response.read()
            session.close()
        with aiohttp.Timeout(10.0):
            with aiohttp.ClientSession() as session:
                response = yield from session.get(root_url)
                data = yield from response.read()
    except Exception as e:
        print(e)
    else:
        response.release()
        session.close()
    if data is not None:
        if '.html' in file_name:
            soup = self.replace_links(data, domain, urls)
@@ -105,7 +127,10 @@ def get_body(self, root_url, urls, visited_urls):
            continue
        carved_url = os.path.normpath(os.path.join(domain, carved_url))
        if not carved_url.startswith('http'):
            carved_url = 'http://' + carved_url
            if carved_url.startswith('..') or carved_url.startswith('/'):
                carved_url = 'http://' + domain + carved_url
            else:
                carved_url = 'http://' + carved_url
        if carved_url not in visited_urls:
            urls.insert(0, carved_url)
    for url in urls:
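As an aside on the replace_links hunk above: the new first loop targets hrefs that are neither absolute (http...), protocol-relative (//...), nor parent-relative (..), and rebuilds them as absolute URLs on the cloned domain so they can be queued for fetching. Below is a minimal standalone sketch of that idea, using BeautifulSoup as the project does; the helper name collect_relative_links and its return shape are illustrative only, not the project's API.

import re

from bs4 import BeautifulSoup


def collect_relative_links(html, domain):
    # Matches hrefs that are neither absolute (http...), protocol-relative (//...),
    # nor parent-relative (..) -- i.e. plain relative links within the cloned site.
    relative = re.compile(r'^((?!http|\/\/|\.\.).)*$')
    soup = BeautifulSoup(html, 'html.parser')
    urls = []
    for link in soup.find_all(True, attrs={'href': relative}):
        # Drop any leading slash so the href can be appended to the domain,
        # which the cloner keeps with a trailing slash.
        href = link['href'].lstrip('/')
        urls.append('http://' + domain + href)
    return soup, list(set(urls))


# Example: a relative href such as "about.html" on domain "example.com/"
# becomes "http://example.com/about.html" in the returned URL list.
soup, urls = collect_relative_links('<a href="about.html">About</a>', 'example.com/')
print(urls)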
15 changes: 12 additions & 3 deletions snare.py
@@ -194,20 +194,29 @@ def handle_request(self, request, payload):
        content = payload_content.encode('utf-8')
    else:
        base_path = '/'.join(['/opt/snare/pages', self.run_args.page_dir])
        query = None
        if request.path == '/':
            parsed_url = self.run_args.index_page
        else:
            parsed_url = urlparse(unquote(request.path)).path
            parsed_url = urlparse(unquote(request.path))
            if parsed_url.query:
                query = '?' + parsed_url.query
            parsed_url = parsed_url.path
            if parsed_url.startswith('/'):
                parsed_url = parsed_url[1:]
        path = '/'.join(
            [base_path, parsed_url]
        )
        path = os.path.normpath(path)
        content_type = mimetypes.guess_type(path)[0]
        if content_type is None and '.php' in path:
            content_type = 'text/html'
        if query is not None:
            path = os.path.normpath(path + query)
        else:
            path = os.path.normpath(path)
        if os.path.isfile(path) and path.startswith(base_path):
            with open(path, 'rb') as fh:
                content = fh.read()
            content_type = mimetypes.guess_type(path)[0]
            if content_type:
                if 'text/html' in content_type:
                    content = yield from self.handle_html_content(content)
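The snare.py hunk above changes how an incoming request is mapped onto the cloned files: the query string, when present, becomes part of the looked-up file name (matching how clone.py now saves such pages), and the content type is guessed from the extension with a text/html fallback for .php. Below is a minimal sketch of that lookup under stated assumptions; the helper name resolve_cached_file and the 'example' page directory are illustrative, not the project's API.

import mimetypes
import os
from urllib.parse import unquote, urlparse


def resolve_cached_file(request_path, base_path='/opt/snare/pages/example'):
    # Split the decoded request into path and query; the cloner saves pages that
    # were fetched with a query string under "<name>?<query>" file names.
    parsed = urlparse(unquote(request_path))
    rel_path = parsed.path.lstrip('/')
    # Guess the type from the path alone (before the query is appended), and fall
    # back to text/html for .php, which mimetypes does not know about.
    content_type = mimetypes.guess_type(rel_path)[0]
    if content_type is None and '.php' in rel_path:
        content_type = 'text/html'
    file_name = rel_path + ('?' + parsed.query if parsed.query else '')
    path = os.path.normpath(os.path.join(base_path, file_name))
    # Only serve files that actually live under the page directory.
    if os.path.isfile(path) and path.startswith(base_path):
        return path, content_type
    return None, content_type


# Example: "/login.php?user=admin" maps to
# "/opt/snare/pages/example/login.php?user=admin" with content type text/html.
print(resolve_cached_file('/login.php?user=admin'))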
