
Add BOFH-specific grabber

1 parent c6f3b71 commit fdfd5fc6c3fc5c536934554b036515b5c0d01695 @palfrey committed Feb 24, 2013
Showing with 88 additions and 6 deletions.
  1. +1 −0 .gitignore
  2. +82 −0 bofh.py
  3. +5 −6 common.py
.gitignore
@@ -6,3 +6,4 @@ dump.html
dump
*#*
.idea/
+BOFH-*
bofh.py
@@ -0,0 +1,82 @@
+from common import *
+from re import compile, DOTALL, MULTILINE
+from urlgrab import Cache
+from urlparse import urljoin
+
+linkPattern = compile("<h3><a href=\"(/[^\"]+)\">(.+?)</a></h3>")
+earlierPattern = compile("<a href='([^\']+)'>.+?Earlier Stories.+?</a>", DOTALL | MULTILINE)
+titlePattern = compile("<h2>(.+?)</h2>")
+subtitlePattern = compile("<p class=\"standfirst\">(.+?)</p>")
+contentPattern = compile("<strong class=\"trailer\">.+?</p>(.+?)(?:(?:<p>(?:(?:<i>)|(?:<small>)|(?:<font size=\"-2\">)|(?:<br>\n))?BOFH .+? Simon Travaglia)|(?:<ul class=\"noindent\">)|(?:<ul>.+?<li><a href=\"http://www.theregister.co.uk/content/30/index.html\">BOFH: The whole shebang</a></li>)|(?:</form>))", DOTALL| MULTILINE)
+adPattern = compile("(<div id=ad-mu1-spot>.+?</div>)", MULTILINE | DOTALL)
+episodePattern = compile("<strong class=\"trailer\">Episode \d+")
+
+url = "http://www.theregister.co.uk/data_centre/bofh/"
+pages = [url]
+cache = Cache()
+
+while True:
+ print url
+ data = cache.get(url).read()
+ links = linkPattern.findall(data)
+
+ if links == []:
+ break
+
+ pages.insert(0, url)
+
+ earlier = earlierPattern.findall(data)
+ url = urljoin(url, earlier[0])
+
+skipTitles = ["Salmon Days is Go!"]
+
+year = None
+
+newItems = False
+
+for mainPage in pages:
+ data = cache.get(mainPage).read()
+ links = linkPattern.findall(data)
+ links.reverse()
+ for l in links:
+ url = urljoin(mainPage, l[0])
+ newyear = url.split("/")[3]
+ if newyear != year:
+ if year != None:
+ if int(newyear) < int(year):
+ raise Exception, (year, newyear)
+ tocEnd(toc)
+ makeMobi(folder, "Simon Travaglia", newitems = newItems)
+ newItems = False
+ folder = "BOFH-%s"%newyear
+ toc = tocStart(folder)
+ year = newyear
+
+ data = cache.get(url, max_age = -1).read()
+ episode = episodePattern.findall(data)
+ if len(episode) == 0:
+ print "Skipping", url
+ continue
+ print url
+ title = titlePattern.findall(data)[0]
+ print title
+ if title in skipTitles:
+ print "skipping", title
+ continue
+ subtitle = subtitlePattern.findall(data)[0]
+ content = contentPattern.findall(data)[0]
+ ad = adPattern.findall(data)[0]
+ content = content.replace(ad, "")
+ content = content.decode('utf-8')
+ title = title.decode("utf-8")
+ subtitle = subtitle.decode("utf-8")
+ assert len(content)>0
+
+ if generatePage(url, title, subtitle + "<br />\n" + content, folder, toc):
+ newItems = True
+ #break
+ print links
+
+tocEnd(toc)
+makeMobi(folder, "Simon Travaglia")
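
For reference, a minimal, self-contained sketch (Python 2, to match the script above) of what the linkPattern regex in bofh.py extracts from an index page. The HTML fragment and URL path below are made up for illustration, not taken from The Register:

    from re import compile

    linkPattern = compile("<h3><a href=\"(/[^\"]+)\">(.+?)</a></h3>")

    # A hypothetical index-page fragment in the shape the regex expects.
    sample = '<h3><a href="/2013/01/18/bofh_episode_1/">BOFH: Episode 1</a></h3>'

    # findall returns (path, title) tuples; bofh.py joins each path against the
    # index URL with urljoin before fetching the episode page.
    print linkPattern.findall(sample)
    # -> [('/2013/01/18/bofh_episode_1/', 'BOFH: Episode 1')]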
+
common.py
@@ -1,6 +1,6 @@
from codecs import open
from os import mkdir, system
-from os.path import join, exists
+from os.path import join, exists, getsize
try:
import hashlib
@@ -15,10 +15,10 @@ def hexdigest_md5(data):
return md5.new(data).hexdigest()
def generatePage(page, title, content, folder, toc):
- fname = hexdigest_md5(page) + ".html"
+ fname = unicode(hexdigest_md5(page) + ".html")
fpath = join(folder, fname)
- toc.write("\t\t\t<a title=\"%s\" href=\"%s\" />\n" % (title, fname))
- if not exists(fpath):
+ toc.write(u"\t\t\t<a title=\"%s\" href=\"%s\" />\n" % (title, fname))
+ if not exists(fpath) or getsize(fpath) < 500:
open(fpath, "wb", "utf-8").write(u"""<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<style type="text/css" title="override_css">
@@ -40,7 +40,7 @@ def tocStart(folder):
if not exists(folder):
mkdir(folder)
toc = open(join(folder, "toc.html"), "wb", "utf-8")
- toc.write("""<html xmlns="http://www.w3.org/1999/xhtml">
+ toc.write(u"""<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
</head>
@@ -49,7 +49,6 @@ def tocStart(folder):
""" % folder)
return toc
-
def tocEnd(toc):
toc.write("""\t\t</div>
</body>
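
A note on the u"" prefixes added above, which is likely their motivation: with codecs.open(..., "wb", "utf-8"), writing a plain byte string makes Python 2 first decode it as ASCII, which fails on any non-ASCII content. A small sketch of that behaviour (the file name and sample text are illustrative only):

    from codecs import open

    f = open("demo.html", "wb", "utf-8")           # hypothetical output file
    f.write(u"<title>BOFH \u2013 demo</title>\n")  # unicode in, UTF-8 bytes out
    try:
        # Byte string holding UTF-8 bytes: implicitly decoded as ASCII first.
        f.write("<title>BOFH \xe2\x80\x93 demo</title>\n")
    except UnicodeDecodeError as e:
        print "byte strings fail:", e
    f.close()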
