# Peteris Krumins
# good coders code, great reuse
# Released under the GNU GPL license.
# Developed as a part of the hacker top program.
# NOTE: the link to the design write-up ("Read how it was designed") is missing from this copy.
import re
import sys
import time
import socket
import urllib2
import datetime
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup
# Version of this scraper module.
version = "1.0"

# Base URL of the site being scraped. It must NOT end with a slash: relative
# links found in the pages are joined onto it.
# NOTE(review): both URLs were empty strings in this copy of the file, which
# makes every request fail; restored to the Hacker News front page -- confirm.
hacker_url = 'https://news.ycombinator.com'

# Listing of the newest submissions (used when get_stories(new=True)).
hacker_url_new = 'https://news.ycombinator.com/newest'
class RedesignError(Exception):
    """
    An exception class thrown when it seems that Hacker News has redesigned
    (i.e. an element the scraper expects could not be found in the page).
    """
class SeriousError(Exception):
    """
    An exception class thrown when something unexpected happened
    (for example, a page could not be downloaded).
    """
class Story(dict):
    """
    Encapsulates the information about a single Hacker News story.

    After the object is constructed and filled in by the scraper it contains
    the following attributes (stored as plain instance attributes, not as
    dictionary keys):

     * position
     * id
     * title
     * url
     * user
     * score
     * human_time
     * unix_time
     * comments
     * comments_url
    """

    # Attributes rendered by __repr__, in output order.
    _repr_fields = ('id', 'title', 'url', 'user', 'score', 'human_time',
                    'unix_time', 'comments')

    def __repr__(self):
        """
        Returns a '{position, 'id', 'title', ...}' style summary.

        The position is shown as-is, every other field is converted with
        str() first. Attributes that have not been set yet are shown as None
        instead of raising AttributeError, so a half-built story can still be
        printed while debugging.
        """
        values = [getattr(self, 'position', None)]
        values.extend(str(getattr(self, name, None))
                      for name in self._repr_fields)
        inner = ', '.join([repr(x) for x in values])
        return ''.join(('{', inner, '}'))
def stories_per_page():
    """ Tells how many stories a single web page holds. """
    per_page = 30
    return per_page
def get_stories(pages=1, new=False):
    """
    Finds all stories across 'pages' pages and returns a list of Story objects
    representing stories.

    If new is True, gets new stories from hacker_url_new instead of the
    front page at hacker_url.

    Each returned story gets a 1-based 'position' attribute reflecting its
    place in the combined list. Exceptions raised by the page download and
    by the HTML extraction helpers are not caught here.
    """
    stories = []
    url = hacker_url_new if new else hacker_url

    for _ in range(pages):
        content = _get_page(url)
        # Collect this page's stories; without this the result stays empty.
        stories.extend(_extract_stories(content))
        url = _get_next_page(content)
        if not url:
            # No link to a following page: nothing more to fetch.
            break

    for pos, story in enumerate(stories):
        story.position = pos + 1

    return stories
def _extract_stories(content):
    """
    Given an HTML page, extracts all the stories and returns a list of Story
    objects representing stories.

    Every story is assembled from three parallel lists of tags found in the
    page: the <td class="title"> cells, the up-vote <a id="up_N"> links and
    the <td class="subtext"> cells. The lists are walked in lockstep.

    The returned stories have id, title, url, score, comments, user,
    unix_time, human_time and comments_url set; 'position' is not set here.

    Raises RedesignError when an element or a piece of text the parser
    relies on cannot be found.
    """
    stories = []
    soup = BeautifulSoup(content)

    def mk_tag_finder(name, klass, attrs):
        """ Builds a predicate matching <name class="klass"> tags that carry
        exactly 'attrs' attributes. """
        def td_finder(tag):
            if tag.name != name:
                return False
            if len(tag.attrs) != attrs:
                return False
            # The original author noted that "'class' not in tag" won't
            # work, so index the attribute and catch the KeyError instead.
            try:
                return tag['class'] == klass
            except KeyError:
                return False
        return td_finder

    title_tds = soup.findAll(mk_tag_finder('td', 'title', 1))
    vote_as = soup.findAll('a', id=re.compile(r'up_\d+'))
    subtext_tds = soup.findAll(mk_tag_finder('td', 'subtext', 1))

    # NOTE: a check that the three lists have equal length was commented out
    # in the original; zip() simply stops at the shortest list.

    # Patterns are loop-invariant, so compile them once.
    score_id_re = re.compile(r'score_(\d+)')
    user_href_re = re.compile(r'^user')
    item_href_re = re.compile(r'^item')
    posted_re = re.compile(r'\s+(.+)\s+ago')

    for title_td, vote_a, subtext_td in zip(title_tds, vote_as, subtext_tds):
        title_a = title_td.find('a')
        if not title_a:
            raise RedesignError("title <a> was not found")

        title = title_a.string.strip()
        url = title_a['href']
        if url.startswith('item'): # link to the story itself
            # Same joining rule as the one used for comments_url below.
            url = urljoin(hacker_url, url)

        m = re.search(r'up_(\d+)', vote_a['id'])
        if not m:
            raise RedesignError("title id did not contain story id")
        story_id = m.group(1)

        score_span = subtext_td.find('span', id=score_id_re)
        if not score_span:
            raise RedesignError("could not find <span> containing score")

        m = re.search(r'(\d+) point', score_span.string)
        if not m:
            raise RedesignError("unable to extract score")
        score = int(m.group(1))

        user_a = subtext_td.find('a', href=user_href_re)
        if not user_a:
            raise RedesignError("unable to find <a> containing username")
        user = user_a.string

        posted_text = subtext_td.find(text=posted_re)
        if not posted_text:
            raise RedesignError("could not find posted ago text")

        m = posted_re.search(posted_text)
        posted_ago = m.group(1)
        unix_time = _ago_to_unix(posted_ago)
        if not unix_time:
            raise RedesignError("unable to extract story date")
        human_time = time.ctime(unix_time)

        comment_a = subtext_td.find('a', href=item_href_re)
        if not comment_a:
            # No link to the item page: comment count is unknown.
            comments = -1
        elif comment_a.string == "discuss":
            comments = 0
        else:
            m = re.search(r'(\d+) comment', comment_a.string)
            if not m:
                raise RedesignError("could not extract comment count")
            comments = int(m.group(1))

        # The last link of the subtext cell is taken as the comments link.
        subtext_urls = subtext_td.findAll('a')
        comments_url = subtext_urls[-1]['href']
        if not comments_url:
            raise RedesignError("could not find last <a href> in subtext containing comment URL")
        comments_url = urljoin(hacker_url, comments_url)

        story = Story()
        story.id = story_id
        story.title = title.encode('utf8')
        story.url = url.encode('utf8')
        story.score = score
        story.comments = comments
        story.user = user.encode('utf8')
        story.unix_time = unix_time
        story.human_time = human_time.encode('utf8')
        story.comments_url = comments_url.encode('utf8')

        stories.append(story)

    return stories
def _ago_to_unix(ago):
m ='(\d+) (\w+)', ago, re.IGNORECASE)
if not m:
return 0
delta = int(
units =
if not units.endswith('s'): # singular
units += 's' # append 's' to make it plural
if units == "months":
units = "days"
delta *= 30 # lets take 30 days in a month
elif units == "years":
units = "days"
delta *= 365
dt = - datetime.timedelta(**{units: delta})
return int(time.mktime(dt.timetuple()))
def _get_page(url, timeout=10):
    """
    Gets and returns a web page at url with timeout 'timeout'.

    The timeout is applied through the process-wide default socket timeout,
    which is put back to its previous value before returning or raising.

    Raises SeriousError when the download fails with an HTTP, URL or
    socket level error.
    """
    # setdefaulttimeout() returns None, so the previous value has to be read
    # explicitly before it is replaced.
    old_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(timeout)

    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')

        response = urllib2.urlopen(request)
        try:
            content = response.read()
        finally:
            response.close()
    except (urllib2.HTTPError, urllib2.URLError, socket.error, socket.sslerror) as e:
        raise SeriousError(e)
    finally:
        # Do not leave the altered timeout behind for the rest of the process.
        socket.setdefaulttimeout(old_timeout)

    return content
def _get_next_page(content):
    """
    Given an HTML page, returns the absolute URL of the following page, or
    None when the page has no <a> whose text is exactly 'More'.
    """
    soup = BeautifulSoup(content)
    a = soup.find(lambda tag: tag.name == 'a' and tag.string == u'More')
    if a:
        # urljoin copes with hrefs both with and without a leading slash;
        # plain concatenation glued 'news?p=2' straight onto the host name.
        return urljoin(hacker_url, a['href'])
    return None
def print_stories_paragraph(stories):
    """
    Given a list of Stories, prints them out paragraph by paragraph.

    Each story is written to standard output as one 'name: value' line per
    attribute, followed by an empty line that separates it from the next
    story.
    """
    fields = ('position', 'id', 'title', 'url', 'score', 'comments',
              'user', 'unix_time', 'human_time')

    for story in stories:
        for name in fields:
            # Single-argument print(...) gives the same output whether print
            # is a statement or a function.
            print('%s: %s' % (name, getattr(story, name)))
        # NOTE(review): blank separator line added to match "paragraph by
        # paragraph"; it is not present in this copy of the file -- confirm.
        print('')
# Command line entry point: fetch the requested number of pages and print
# every story found, or report the failure on stderr and exit with status 1.
if __name__ == '__main__':
    from optparse import OptionParser

    # The description string was cut off after "(" in this copy of the file;
    # the dangling parenthesis is dropped.
    description = "A program by Peteris Krumins"
    usage = "%prog [options]"

    parser = OptionParser(description=description, usage=usage)
    parser.add_option("-p", action="store", type="int", dest="pages",
                      default=1, help="How many pages of stories to output. Default: 1.")
    parser.add_option("-n", action="store_true", dest="new",
                      help="Retrieve new stories. Default: nope.")
    options, args = parser.parse_args()

    try:
        stories = get_stories(options.pages, options.new)
    except RedesignError as e:
        sys.stderr.write("Hacker News have redesigned: %s!\n" % e)
        # Without a story list there is nothing to print.
        sys.exit(1)
    except SeriousError as e:
        sys.stderr.write("Serious error: %s!\n" % e)
        sys.exit(1)

    print_stories_paragraph(stories)