diff --git a/.gitignore b/.gitignore
index 1491f9d..41d9268 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,7 +13,6 @@ __pycache__/
 # Distribution / packaging
 .Python
 env/
-bin/
 build/
 develop-eggs/
 dist/
diff --git a/bin/storytracker-archive b/bin/storytracker-archive
new file mode 100755
index 0000000..4a0e9e4
--- /dev/null
+++ b/bin/storytracker-archive
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+"""Command-line script that archives the HTML from one or more URLs."""
+import sys
+import optparse
+import storytracker
+
+
+p = optparse.OptionParser(
+    description="Archive the HTML from the provided URLs",
+    usage="storytracker-archive [URL]... [OPTIONS]",
+)
+
+p.add_option(
+    "--do-not-verify",
+    "-v",
+    action="store_false",
+    dest="verify",
+    default=True,
+    help="Skip verification that HTML is in the response's content-type header"
+)
+
+p.add_option(
+    "--do-not-minify",
+    "-m",
+    action="store_false",
+    dest="minify",
+    default=True,
+    help="Skip minification of HTML response"
+)
+
+p.add_option(
+    "--do-not-extend-urls",
+    "-e",
+    action="store_false",
+    dest="extend_urls",
+    default=True,
+    help="Do not extend relative urls discovered in the HTML response"
+)
+
+p.add_option(
+    "--do-not-compress",
+    "-c",
+    action="store_false",
+    dest="compress",
+    default=True,
+    help="Skip compression of the HTML response"
+)
+
+p.add_option(
+    "--output-dir",
+    "-d",
+    action="store",
+    type="string",
+    dest="output_dir",
+    default=None,
+    help="Provide a directory for the archived data to be stored"
+)
+
+kwargs, args = p.parse_args()
+
+# sys.stdout only accepts text on Python 3, so raw bytes (gzip data or
+# UTF-8 encoded HTML) go to its underlying binary buffer when one exists.
+binary_stdout = getattr(sys.stdout, "buffer", sys.stdout)
+
+for a in args:
+    # Every parsed option is forwarded to storytracker.archive as a keyword.
+    obj = storytracker.archive(a, **vars(kwargs))
+    if not obj:
+        continue
+    if kwargs.output_dir:
+        # An output directory was given: print the archive object's path.
+        sys.stdout.write(obj.archive_path)
+    elif kwargs.compress:
+        binary_stdout.write(obj.gzip)
+    else:
+        binary_stdout.write(obj.html.encode("utf-8"))
diff --git a/bin/storytracker-get b/bin/storytracker-get
new file mode 100755
index 0000000..3eb0613
--- /dev/null
+++ b/bin/storytracker-get
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+import sys
+import optparse
+import storytracker
+
+
+p = optparse.OptionParser(
+    description="Retrieves HTML from the provided URLs",
+    usage="storytracker-get [URL]... [OPTIONS]",
+)
+
+p.add_option(
+    "--do-not-verify",
+    "-v",
+    action="store_false",
+    dest="verify",
+    default=True,
+    help="Skip verification that HTML is in the response's content-type header"
+)
+
+kwargs, args = p.parse_args()
+# sys.stdout is text-only on Python 3; bytes go to its binary buffer.
+out = getattr(sys.stdout, "buffer", sys.stdout)
+for a in args:
+    out.write(storytracker.get(a, verify=kwargs.verify).encode("utf-8"))
diff --git a/bin/storytracker-links2csv b/bin/storytracker-links2csv
new file mode 100755
index 0000000..73b736b
--- /dev/null
+++ b/bin/storytracker-links2csv
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+"""Command-line script that prints hyperlinks from archives as CSV rows."""
+import sys
+import csv
+import six
+import optparse
+import storytracker
+
+
+p = optparse.OptionParser(
+    description="Extracts hyperlinks from archived files or streams and \
+outputs them as comma-delimited values",
+    usage="storytracker-links2csv [ARCHIVE PATH]...",
+)
+
+kwargs, args = p.parse_args()
+
+
+def prep(val):
+    """Return val as a CSV-ready cell, or None when val is falsy.
+
+    The csv module wants text cells on Python 3 and byte strings on
+    Python 2, so the UTF-8 encoding step only happens on Python 2.
+    """
+    if not val:
+        return None
+    text = six.text_type(val)
+    return text if six.PY3 else text.encode("utf-8")
+
+
+# Raw bytes must go to the binary layer of stdout on Python 3.
+binary_stdout = getattr(sys.stdout, "buffer", sys.stdout)
+
+for a in args:
+    obj = storytracker.open_archive_filepath(a)
+    # csv.writer needs a text buffer on Python 3 and a byte buffer on 2.
+    f = six.StringIO() if six.PY3 else six.BytesIO()
+    writer = csv.writer(f)
+    for h in obj.hyperlinks:
+        writer.writerow([prep(v) for v in (
+            obj.url,
+            obj.timestamp,
+            h.string,
+            h.domain,
+            h.href,
+        )])
+    data = f.getvalue()
+    binary_stdout.write(data.encode("utf-8") if six.PY3 else data)
+
+# Allow directory input?
+# Allow stream input?
diff --git a/test.py b/test.py
index 9bc3df7..fa21a9f 100644
--- a/test.py
+++ b/test.py
@@ -263,6 +263,14 @@ def test_archive_output_dir_html(self):
         self.assertTrue(os.path.exists(path2))
         os.remove(path2)
 
+    def test_links2csv_filepath(self):
+        """Run storytracker-links2csv on the path of a new archive."""
+        path = os.path.join(self.this_dir, "bin/storytracker-links2csv")
+        obj = storytracker.archive(self.url, output_dir=self.tmpdir)
+        cmd = "%s %s" % (path, obj.archive_path)
+        # NOTE(review): the results are not asserted on yet -- TODO
+        # confirm what Command.run returns and check code/out/err.
+        code, out, err = Command(cmd).run(timeout=3)
 
 if __name__ == '__main__':
     if six.PY3: