Use HTML Tidy and URL-quoting of href attributes to make the output valid XHTML.

Change the doctype to XHTML 1.0 Transitional, because that is what the markup really is. Fix an issue where the "[...]" marker appended to truncated posts wasn't always rewritten into a proper "continue reading" link in the HTML output.
mhagander · Oct 22, 2008 · 7a42825 · 7a42825
1 parent da9733f
commit 7a42825
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 9 deletions.
diff --git a/generator.py b/generator.py
@@ -11,12 +11,22 @@
 import PyRSS2Gen
 import datetime
 import sys
+import tidy
+import urllib
 from HTMLParser import HTMLParser
 from planethtml import PlanetHtml
 
 class Generator:
 	def __init__(self,db):
 		self.db = db
+		self.tidyopts = dict(   drop_proprietary_attributes=1,
+					alt_text='',
+					hide_comments=1,
+					output_xhtml=1,
+					show_body_only=1,
+					clean=1,
+					)
+
 
 	def Generate(self):
 		rss = PyRSS2Gen.RSS2(
@@ -48,6 +58,10 @@ def Generate(self):
 		html.WriteFile("www/index.html")
 
 	def TruncateAndCleanDescription(self, txt, title):
+		# First apply Tidy
+		txt = str(tidy.parseString(txt, **self.tidyopts))
+
+		# Then truncate as necessary
 		ht = HtmlTruncator(1024, title)
 		ht.feed(txt)
 		out = ht.GetText()
@@ -78,10 +92,19 @@ def handle_startendtag(self, tag, attrs):
 		if self.skiprest: return
 		self.trunctxt += self.get_starttag_text()
 
+	def quoteurl(self, str):
+		p = str.split(":",2)
+		return p[0] + ":" + urllib.quote(p[1])
+
+	def cleanhref(self, attrs):
+		if attrs[0] == 'href':
+			return 'href', self.quoteurl(attrs[1])
+		return attrs
+
 	def handle_starttag(self, tag, attrs):
 		if self.skiprest: return
 		self.trunctxt += "<" + tag
-		self.trunctxt += (' '.join([(' %s="%s"' % (k,v)) for k,v in attrs]))
+		self.trunctxt += (' '.join([(' %s="%s"' % (k,v)) for k,v in map(self.cleanhref, attrs)]))
 		self.trunctxt += ">"
 		self.tagstack.append(tag)
 
@@ -102,18 +125,16 @@ def handle_data(self, data):
 		if self.len > self.maxlen:
 			# Passed max length, so truncate text as close to the limit as possible
 			self.trunctxt = self.trunctxt[0:len(self.trunctxt)-(self.len-self.maxlen)]
-			# Terminate at whitespace if possible, max 12 chars back
-			for i in range(len(self.trunctxt)-1, len(self.trunctxt)-12, -1):
-				if self.trunctxt[i].isspace():
-					self.trunctxt = self.trunctxt[0:i] + " [...]"
-					break
 
 			# Now append any tags that weren't properly closed
 			self.tagstack.reverse()
 			for tag in self.tagstack:
 				self.trunctxt += "</" + tag + ">"
 			self.skiprest = True
 
+			# Finally, append the continuation chars
+			self.trunctxt += "[...]"
+
 	def GetText(self):
 		if self.len > self.maxlen:
 			return self.trunctxt

diff --git a/planethtml.py b/planethtml.py
@@ -15,8 +15,8 @@ class PlanetHtml:
 	def __init__(self):
 		self.items = []
 		self.feeds = []
-		self.str = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+		self.str = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" dir="ltr">
  <head>
   <title>Planet PostgreSQL</title>
@@ -46,7 +46,7 @@ def BuildPosts(self):
 		lastdate = None
 		for post in self.items:
 			if post[6].endswith('[...]'):
-				txt = post[6][:len(post[6])-4] + """<a href="%s">continue reading...</a>]""" % (post[1])
+				txt = post[6][:len(post[6])-5] + """<p>[<a href="%s">continue reading...</a>]</p>""" % (post[1])
 			else:
 				txt = post[6]