Skip to content
This repository has been archived by the owner on Dec 28, 2020. It is now read-only.

Commit

Permalink
Added index to Hyperlink. Fixes #31.
Browse files: browse the repository at this point in the history
  • Loading branch information
palewire committed Jul 21, 2014
1 parent eebf8e2 commit 00b5def
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 5 deletions.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@
'htmlmin==0.1.5',
'six==1.7.2',
'pytz>=2014.4',
'unicodecsv==0.9.4',
),
)
12 changes: 7 additions & 5 deletions storytracker/analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import six
import csv
import copy
import gzip
import unicodecsv
Expand Down Expand Up @@ -117,7 +116,7 @@ def get_hyperlinks(self, force=False):
# Loop through all <a> tags with href attributes
# and convert them to Hyperlink objects
link_list = []
for a in target.findAll("a", {"href": True}):
for i, a in enumerate(target.findAll("a", {"href": True})):
# Search out any images
images = []
for img in a.findAll("img", {"src": True}):
Expand All @@ -127,7 +126,7 @@ def get_hyperlinks(self, force=False):
except ValueError:
pass
# Create the Hyperlink object
hyperlink_obj = Hyperlink(a["href"], a.string, images)
hyperlink_obj = Hyperlink(a["href"], a.string, i, images)
# Add to the link list
link_list.append(hyperlink_obj)

Expand All @@ -138,7 +137,7 @@ def get_hyperlinks(self, force=False):

def write_hyperlinks_csv_to_file(self, file, encoding="utf-8"):
"""
Returns the provided file object with a ready-to-serve CSV list of
Returns the provided file object with a ready-to-serve CSV list of
all hyperlinks extracted from the HTML.
"""
writer = unicodecsv.writer(file, encoding=encoding)
Expand All @@ -148,6 +147,7 @@ def write_hyperlinks_csv_to_file(self, file, encoding="utf-8"):
"url_href",
"url_domain",
"url_string",
"url_index",
])
for h in self.hyperlinks:
row = map(six.text_type, [self.url, self.timestamp]) + h.__csv__()
Expand Down Expand Up @@ -228,9 +228,10 @@ class Hyperlink(UnicodeMixin):
"""
A hyperlink extracted from an archived URL.
"""
def __init__(self, href, string, images=[]):
def __init__(self, href, string, index, images=[]):
self.href = href
self.string = string
self.index = index
self.domain = urlparse(href).netloc
self.images = images

Expand Down Expand Up @@ -267,6 +268,7 @@ def __csv__(self):
self.href,
self.domain,
self.string or '',
self.index,
]
return map(six.text_type, row)

Expand Down
2 changes: 2 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def test_url_hyperlinks(self):
a.href
a.string
a.domain
a.index
if a.images:
for i in a.images:
self.assertTrue(isinstance(i, Image))
Expand All @@ -144,6 +145,7 @@ def test_url_hyperlinks(self):
a.__unicode__()
a.__str__()
a.__repr__()
a.__csv__()

def test_url_images(self):
obj = storytracker.archive(self.url)
Expand Down

0 comments on commit 00b5def

Please sign in to comment.