Skip to content
This repository has been archived by the owner on Dec 28, 2020. It is now read-only.

Commit

Permalink
First draft of a CLI that will strain links out to CSV
Browse files Browse the repository at this point in the history
  • Loading branch information
palewire committed Jul 21, 2014
1 parent 1bb44a8 commit 16d3742
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 1 deletion.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ __pycache__/
# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
Expand Down
68 changes: 68 additions & 0 deletions bin/storytracker-archive
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env python
import sys
import optparse
import storytracker


# Command-line interface for the archiving script.
p = optparse.OptionParser(
    usage="storytracker-archive [URL]... [OPTIONS]",
    description="Archive the HTML from the provided URLs",
)

# Every processing step is enabled by default; each of these flags turns one
# step off. Entries are (long flag, short flag, destination, help text) and
# are registered in this order so the generated --help listing is stable.
for long_flag, short_flag, dest, help_text in (
    (
        "--do-not-verify",
        "-v",
        "verify",
        "Skip verification that HTML is in the response's content-type header",
    ),
    (
        "--do-not-minify",
        "-m",
        "minify",
        "Skip minification of HTML response",
    ),
    (
        "--do-not-extend-urls",
        "-e",
        "extend_urls",
        "Do not extend relative urls discovered in the HTML response",
    ),
    (
        "--do-not-compress",
        "-c",
        "compress",
        "Skip compression of the HTML response",
    ),
):
    p.add_option(
        long_flag,
        short_flag,
        action="store_false",
        dest=dest,
        default=True,
        help=help_text,
    )

# Optional destination directory; left as None when the flag is not given.
p.add_option(
    "--output-dir",
    "-d",
    action="store",
    type="string",
    dest="output_dir",
    default=None,
    help="Provide a directory for the archived data to be stored",
)

# kwargs holds the parsed option values, args the positional URLs.
kwargs, args = p.parse_args()

# The gzip payload and the encoded HTML are byte strings. Python 3's
# sys.stdout is a text stream that rejects bytes, so write them through its
# underlying binary buffer; Python 2's sys.stdout has no ``buffer`` attribute
# and accepts byte strings directly.
raw_stdout = getattr(sys.stdout, "buffer", sys.stdout)

# Archive each URL given on the command line, forwarding every parsed option
# to storytracker.archive as a keyword argument.
for a in args:
    obj = storytracker.archive(a, **vars(kwargs))
    if not obj:
        # A falsy result produces no output for this URL.
        continue
    if kwargs.output_dir:
        # The archive was saved to disk, so report where it was written.
        sys.stdout.write(obj.archive_path)
    elif kwargs.compress:
        # presumably obj.gzip is the compressed archive as bytes -- confirm
        raw_stdout.write(obj.gzip)
    else:
        raw_stdout.write(obj.html.encode("utf-8"))
25 changes: 25 additions & 0 deletions bin/storytracker-get
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env python
import sys
import optparse
import storytracker


# Command-line interface for fetching HTML and echoing it to stdout.
p = optparse.OptionParser(
    description="Retrieves HTML from the provided URLs",
    usage="storytracker-get [URL]... [OPTIONS]",
)

# Verification is on by default; this flag switches it off.
p.add_option(
    "--do-not-verify",
    "-v",
    action="store_false",
    dest="verify",
    default=True,
    help="Skip verification that HTML is in the response's content-type header"
)

# kwargs holds the parsed option values, args the positional URLs.
kwargs, args = p.parse_args()

# The HTML is emitted as UTF-8 bytes. Python 3's sys.stdout is a text stream
# that rejects bytes, so write through its underlying binary buffer; Python
# 2's sys.stdout has no ``buffer`` attribute and accepts byte strings directly.
raw_stdout = getattr(sys.stdout, "buffer", sys.stdout)

for a in args:
    html = storytracker.get(a, verify=kwargs.verify)
    raw_stdout.write(html.encode("utf-8"))
38 changes: 38 additions & 0 deletions bin/storytracker-links2csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python
import sys
import csv
import six
import optparse
import storytracker


# Command-line interface: the script defines no options of its own and only
# takes positional archive paths.
p = optparse.OptionParser(
    usage="storytracker-links2csv [ARCHIVE PATH]...",
    description=(
        "Extracts hyperlinks from archived files or streams and "
        "outputs them as comma-delimited values"
    ),
)

# args holds the positional archive paths given on the command line.
kwargs, args = p.parse_args()

def prep(val):
    """
    Convert a value into a cell the csv module can write.

    Falsy values (None, empty strings, zero) become None, which csv.writer
    renders as an empty field. Anything else is coerced to text; on Python 2
    the text is then encoded as UTF-8 because that version's csv module only
    handles byte strings, while Python 3's csv module requires text.
    """
    if not val:
        return None
    text = six.text_type(val)
    if six.PY2:
        return text.encode("utf-8")
    return text


# Write one CSV row per hyperlink found in each archive path provided.
for a in args:
    obj = storytracker.open_archive_filepath(a)
    # The in-memory buffer must match what csv.writer emits on this
    # interpreter: byte strings on Python 2, text on Python 3.
    f = six.BytesIO() if six.PY2 else six.StringIO()
    writer = csv.writer(f)
    for h in obj.hyperlinks:
        writer.writerow([
            prep(cell) for cell in (
                obj.url,
                obj.timestamp,
                h.string,
                h.domain,
                h.href,
            )
        ])
    sys.stdout.write(f.getvalue())

# Allow directory input?
# Allow stream input?
5 changes: 5 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,11 @@ def test_archive_output_dir_html(self):
self.assertTrue(os.path.exists(path2))
os.remove(path2)

def test_links2csv_filepath(self):
    """
    Smoke-test the links2csv script against a freshly archived page.

    The command is only run with a three second timeout; no assertions
    are made about its exit code or output.
    """
    archive = storytracker.archive(self.url, output_dir=self.tmpdir)
    script = os.path.join(self.this_dir, "bin/storytracker-links2csv")
    # Command line: the script path followed by the archive file to read.
    cmd = "%s %s" % (script, archive.archive_path)
    code, out, err = Command(cmd).run(timeout=3)

if __name__ == '__main__':
if six.PY3:
Expand Down

0 comments on commit 16d3742

Please sign in to comment.