Skip to content
This repository has been archived by the owner on Dec 28, 2020. It is now read-only.

Commit

Permalink
Added images to CSV out. Fixes #34.
Browse files Browse the repository at this point in the history
  • Loading branch information
palewire committed Jul 28, 2014
1 parent b224403 commit 3404c81
Showing 1 changed file with 66 additions and 49 deletions.
115 changes: 66 additions & 49 deletions storytracker/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,32 +71,6 @@ def gzip(self):
f.write(self.html.encode("utf-8"))
return out.getvalue()

def write_gzip_to_directory(self, path):
    """
    Writes gzipped HTML data to a file in the provided directory path.

    The archive is named after `archive_filename` with a `.gz` extension.
    Its full path is stored on the object as `archive_path` and returned.

    Raises a ValueError if `path` is not an existing directory.
    """
    if not os.path.isdir(path):
        raise ValueError("Path must be a directory")
    self.archive_path = os.path.join(path, "%s.gz" % self.archive_filename)
    # GzipFile does not close a fileobj that was handed to it, so the
    # underlying file gets its own context manager to make sure the handle
    # is flushed and released once the compressed data has been written.
    with open(self.archive_path, 'wb') as fileobj:
        with gzip.GzipFile(fileobj=fileobj, mode="wb") as f:
            f.write(self.html.encode("utf-8"))
    return self.archive_path

def write_html_to_directory(self, path):
    """
    Saves the HTML, encoded as UTF-8, to an .html file inside the provided
    directory and returns the path of the file that was written.

    The path is also kept on the object as `archive_path`. A ValueError is
    raised when `path` is not an existing directory.
    """
    if not os.path.isdir(path):
        raise ValueError("Path must be a directory")
    filename = "%s.html" % self.archive_filename
    self.archive_path = os.path.join(path, filename)
    with open(self.archive_path, 'wb') as outfile:
        outfile.write(self.html.encode("utf-8"))
    return self.archive_path

def get_hyperlinks(self, force=False):
"""
Parses all of the hyperlinks from the HTML and returns a list of
Expand Down Expand Up @@ -138,29 +112,6 @@ def get_hyperlinks(self, force=False):
return link_list
hyperlinks = property(get_hyperlinks)

def write_hyperlinks_csv_to_file(self, file):
    """
    Fills the provided file object with a CSV listing of every hyperlink
    extracted from the HTML, then rewinds it and hands it back.

    The first row is a fixed header. Each following row starts with the
    archive's URL and timestamp and continues with the values returned by
    the hyperlink's `__csv__` method.
    """
    csv_writer = csv.writer(file)
    csv_writer.writerow([
        "archive_url",
        "archive_timestamp",
        "url_href",
        "url_domain",
        "url_string",
        "url_index",
    ])
    for link in self.hyperlinks:
        archive_cells = [
            six.text_type(value) for value in (self.url, self.timestamp)
        ]
        csv_writer.writerow(archive_cells + link.__csv__())
    # Rewind so the caller can read the CSV from the beginning
    file.seek(0)
    return file

def get_images(self, force=False):
"""
Parse the archived HTML for images and returns them as a list
Expand Down Expand Up @@ -194,6 +145,70 @@ def get_images(self, force=False):
return image_list
images = property(get_images)

def write_hyperlinks_csv_to_file(self, file):
    """
    Returns the provided file object with a ready-to-serve CSV list of
    all hyperlinks extracted from the HTML.

    Every row starts with the archive's URL and timestamp, followed by the
    values from the hyperlink's `__csv__` method. Rows can differ in length,
    so an `image_N_src` header is added for each column past the fixed
    headers, up to the width of the longest row.

    When there are no hyperlinks only the fixed header row is written.
    """
    # Create a CSV writer object out of the file
    writer = csv.writer(file)

    # Load up all the rows
    row_list = [
        list(map(six.text_type, [self.url, self.timestamp])) + h.__csv__()
        for h in self.hyperlinks
    ]

    # Create the headers, which will change depending on how many
    # images are found in the urls
    headers = [
        "archive_url",
        "archive_timestamp",
        "url_href",
        "url_domain",
        "url_string",
        "url_index",
    ]
    # Seed the comparison with the fixed header count so max() never sees
    # an empty sequence, which would raise a ValueError when the page
    # has no hyperlinks at all.
    longest_row = max([len(headers)] + [len(r) for r in row_list])
    for i in range(longest_row - len(headers)):
        headers.append("image_%s_src" % (i + 1))

    # Write it out to the file
    writer.writerow(headers)
    writer.writerows(row_list)

    # Reboot the file and pass it back out
    file.seek(0)
    return file

def write_gzip_to_directory(self, path):
    """
    Writes gzipped HTML data to a file in the provided directory path.

    The archive is named after `archive_filename` with a `.gz` extension.
    Its full path is stored on the object as `archive_path` and returned.

    Raises a ValueError if `path` is not an existing directory.
    """
    if not os.path.isdir(path):
        raise ValueError("Path must be a directory")
    self.archive_path = os.path.join(path, "%s.gz" % self.archive_filename)
    # GzipFile does not close a fileobj that was handed to it, so the
    # underlying file gets its own context manager to make sure the handle
    # is flushed and released once the compressed data has been written.
    with open(self.archive_path, 'wb') as fileobj:
        with gzip.GzipFile(fileobj=fileobj, mode="wb") as f:
            f.write(self.html.encode("utf-8"))
    return self.archive_path

def write_html_to_directory(self, path):
    """
    Saves the HTML, encoded as UTF-8, to an .html file inside the provided
    directory and returns the path of the file that was written.

    The path is also kept on the object as `archive_path`. A ValueError is
    raised when `path` is not an existing directory.
    """
    if not os.path.isdir(path):
        raise ValueError("Path must be a directory")
    filename = "%s.html" % self.archive_filename
    self.archive_path = os.path.join(path, filename)
    with open(self.archive_path, 'wb') as outfile:
        outfile.write(self.html.encode("utf-8"))
    return self.archive_path


class ArchivedURLSet(list):
"""
Expand Down Expand Up @@ -277,6 +292,8 @@ def __csv__(self):
self.string or '',
self.index,
]
for img in self.images:
row.append(img.src)
return list(map(six.text_type, row))


Expand Down

0 comments on commit 3404c81

Please sign in to comment.