Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Initial version of new planet code

  • Loading branch information...
commit f28b73a2e01ea82431b56d76d5c93102671c2cbf 0 parents
Magnus Hagander authored
70 aggregator.py
... ... @@ -0,0 +1,70 @@
  1 +#!/usr/bin/env python
  2 +"""PostgreSQL Planet Aggregator
  3 +
  4 +This file contains the functions to suck down RSS/Atom feeds
  5 +(using feedparser) and store the results in a PostgreSQL database.
  6 +
  7 +Copyright (C) 2008 PostgreSQL Global Development Group
  8 +"""
  9 +
  10 +import psycopg2
  11 +import feedparser
  12 +import datetime
  13 +import socket
  14 +
class Aggregator:
    """Fetch every registered feed and store posts not seen before.

    ``db`` is a DB-API connection (the script entry point passes a psycopg2
    connection).  The ``planet.feeds`` table is read and its ``lastget``
    column updated; new posts are inserted into ``planet.posts``.
    ``stored`` counts the posts inserted through this instance.
    """

    def __init__(self, db):
        self.db = db
        self.stored = 0
        # Default timeout (seconds) for sockets created from now on, so a
        # server that never answers cannot block the run indefinitely.
        socket.setdefaulttimeout(20)

    def Update(self):
        """Parse every feed listed in planet.feeds, then commit once."""
        feeds = self.db.cursor()
        feeds.execute('SELECT id,feedurl,name,lastget FROM planet.feeds')
        for feed in feeds.fetchall():
            self.ParseFeed(feed)
        self.db.commit()

    def ParseFeed(self, feedinfo):
        """Download one feed and store its entries.

        ``feedinfo`` is a row ``(id, feedurl, name, lastget)`` as selected
        by Update().  ``lastget`` is sent as the "modified" hint so an
        unchanged feed can answer 304 and be skipped.
        """
        parsestart = datetime.datetime.now()
        lastget = feedinfo[3]
        if lastget is None:
            # NOTE(review): assumes lastget may be NULL for a feed that was
            # never fetched (schema not visible here); fetch unconditionally.
            feed = feedparser.parse(feedinfo[1])
        else:
            feed = feedparser.parse(feedinfo[1], modified=lastget.timetuple())

        # feedparser only sets 'status' when an HTTP response was received;
        # on a timeout or connection failure the attribute does not exist.
        # Treat that as a failed fetch instead of aborting the whole run.
        status = getattr(feed, 'status', None)
        if status == 304:
            # not changed
            return
        if status != 200:
            # not ok!
            print("Feed %s status %s" % (feedinfo[1], status))
            return

        for entry in feed.entries:
            if 'summary' in entry:
                txt = entry.summary
            else:
                txt = entry.content[0].value
            # Without an explicit flag the guid is treated as a permalink.
            guidisperma = entry.get('guidislink', True)
            self.StoreEntry(feedinfo[0], entry.id, entry.date, entry.link,
                            guidisperma, entry.title, txt)

        # Record the time this fetch started, not the time it finished.
        self.db.cursor().execute(
            'UPDATE planet.feeds SET lastget=%(lg)s WHERE id=%(feed)s',
            {'lg': parsestart, 'feed': feedinfo[0]})

    def StoreEntry(self, feedid, guid, date, link, guidisperma, title, txt):
        """Insert one post unless (feed, guid) is already in planet.posts."""
        c = self.db.cursor()
        c.execute("SELECT id FROM planet.posts WHERE feed=%(feed)s AND guid=%(guid)s",
                  {'feed': feedid, 'guid': guid})
        if c.rowcount > 0:
            # Already stored on an earlier run.
            return
        print("Store entry %s from feed %s" % (guid, feedid))
        c.execute("INSERT INTO planet.posts (feed,guid,link,guidisperma,dat,title,txt) VALUES (%(feed)s,%(guid)s,%(link)s,%(guidisperma)s,%(date)s,%(title)s,%(txt)s)",
                  {'feed': feedid,
                   'guid': guid,
                   'link': link,
                   'guidisperma': guidisperma,
                   'date': date,
                   'title': title,
                   'txt': txt})
        self.stored += 1
  68 +
if __name__ == "__main__":
    # Script entry point: connect to the planet database and run one
    # complete aggregation pass.
    connection = psycopg2.connect('dbname=planetpg host=/tmp/')
    aggregator = Aggregator(connection)
    aggregator.Update()
43 discovery.py
... ... @@ -0,0 +1,43 @@
  1 +#!/usr/bin/env python
  2 +"""PostgreSQL Planet Aggregator
  3 +
  4 +This file contains the functions to suck down RSS/Atom feeds
  5 +(using feedparser), determining the actual blog URL (for the
  6 +HTML posts), and update the database with them.
  7 +
  8 +Copyright (C) 2008 PostgreSQL Global Development Group
  9 +"""
  10 +
  11 +import psycopg2
  12 +import feedparser
  13 +import datetime
  14 +import socket
  15 +
class Aggregator:
    """Fill in the blog URL of feeds that do not have one yet.

    ``db`` is a DB-API connection (the script entry point passes a psycopg2
    connection).  For every row of ``planet.feeds`` with an empty
    ``blogurl`` the feed is downloaded and the link it advertises is
    written back to that row.
    """

    def __init__(self, db):
        self.db = db
        self.stored = 0
        # Default timeout (seconds) for sockets created from now on, so a
        # server that never answers cannot block the run indefinitely.
        socket.setdefaulttimeout(20)

    def Update(self):
        """Run discovery for every feed lacking a blogurl, then commit."""
        feeds = self.db.cursor()
        # No parameters are passed here, so the literal '%' characters in
        # the LIKE pattern are sent to the server unchanged.
        feeds.execute("SELECT id,feedurl,name,blogurl FROM planet.feeds WHERE blogurl='' AND feedurl NOT LIKE '%planet%'")
        for feed in feeds.fetchall():
            self.DiscoverFeed(feed)
        self.db.commit()

    def DiscoverFeed(self, feedinfo):
        """Download one feed and store the site link it advertises.

        ``feedinfo`` is a row ``(id, feedurl, name, blogurl)`` as selected
        by Update().
        """
        feed = feedparser.parse(feedinfo[1])

        # feedparser only sets 'status' when an HTTP response was received;
        # a timeout or connection failure leaves it absent.
        status = getattr(feed, 'status', None)
        if status != 200:
            # not ok!
            print("Feed %s status %s" % (feedinfo[1], status))
            return

        # A feed without a <link> simply has nothing to store.
        link = feed.feed.get('link')
        if link:
            print("Setting feed for %s to %s" % (feedinfo[2], link))
            self._StoreBlogUrl(feedinfo[0], link)

    def _StoreBlogUrl(self, feedid, blogurl):
        """Write ``blogurl`` to the feed row ``feedid``.

        The URL comes from a remote document, so it is passed as a bound
        parameter rather than interpolated into the SQL text.
        """
        c = self.db.cursor()
        c.execute("UPDATE planet.feeds SET blogurl=%(url)s WHERE id=%(feed)s",
                  {'url': blogurl, 'feed': feedid})
  41 +
if __name__ == "__main__":
    # Script entry point: connect to the planet database and run one
    # blog-URL discovery pass.
    connection = psycopg2.connect('dbname=planetpg host=/tmp/')
    discoverer = Aggregator(connection)
    discoverer.Update()
2,858 feedparser.py
2,858 additions, 0 deletions not shown
124 generator.py
... ... @@ -0,0 +1,124 @@
  1 +#!/usr/bin/env python
  2 +"""PostgreSQL Planet Aggregator
  3 +
  4 +This file contains the functions to generate output RSS and
  5 +HTML data from what's currently in the database.
  6 +
  7 +Copyright (C) 2008 PostgreSQL Global Development Group
  8 +"""
  9 +
  10 +import psycopg2
  11 +import PyRSS2Gen
  12 +import datetime
  13 +import sys
  14 +from HTMLParser import HTMLParser
  15 +from planethtml import PlanetHtml
  16 +
class Generator:
    """Render the newest posts in the database as an RSS feed and an HTML page.

    ``db`` is a DB-API connection (the script entry point passes a psycopg2
    connection).
    """

    def __init__(self, db):
        self.db = db

    def Generate(self):
        """Write www/rss20.xml and www/index.html from the 30 newest posts."""
        rss = PyRSS2Gen.RSS2(
            title = 'Planet PostgreSQL',
            link = 'http://planet.postgresql.org',
            description = 'Planet PostgreSQL',
            generator = 'Planet PostgreSQL',
            lastBuildDate = datetime.datetime.utcnow())
        html = PlanetHtml()

        c = self.db.cursor()
        c.execute("SET TIMEZONE=GMT")
        c.execute("SELECT guid,link,dat,title,txt,name,blogurl,guidisperma FROM planet.posts INNER JOIN planet.feeds ON planet.feeds.id=planet.posts.feed ORDER BY dat DESC LIMIT 30")
        # Row layout: 0 guid, 1 link, 2 dat, 3 title, 4 txt, 5 feed name,
        # 6 blogurl, 7 guidisperma.
        for post in c.fetchall():
            desc = self.TruncateAndCleanDescription(post[4], post[3])
            rss.items.append(PyRSS2Gen.RSSItem(
                title=post[5] + ': ' + post[3],
                link=post[1],
                guid=PyRSS2Gen.Guid(post[0], post[7]),
                pubDate=post[2],
                description=desc))
            html.AddItem(post[0], post[1], post[2], post[3], post[5], post[6], desc)

        c.execute("SELECT name,blogurl,feedurl FROM planet.feeds ORDER BY name")
        for feed in c.fetchall():
            html.AddFeed(feed[0], feed[1], feed[2])

        # Close the RSS file explicitly so it is flushed and released even
        # when serialization fails part-way.
        rssfile = open("www/rss20.xml", "w")
        try:
            rss.write_xml(rssfile, encoding='utf-8')
        finally:
            rssfile.close()
        html.WriteFile("www/index.html")

    def TruncateAndCleanDescription(self, txt, title):
        """Return ``txt`` cut to at most 1024 text characters, without leading <br> tags."""
        ht = HtmlTruncator(1024, title)
        ht.feed(txt)
        return self._StripLeadingBreaks(ht.GetText())

    def _StripLeadingBreaks(self, out):
        """Remove initial <br /> tags from ``out`` and return the rest."""
        while out.startswith('<br'):
            end = out.find('>')
            if end < 0:
                # Unterminated tag: there is nothing to strip, and slicing
                # from find()+1 == 0 would never make progress.
                break
            out = out[end+1:]
        return out
  60 +
class HtmlTruncator(HTMLParser):
    """Cut an HTML fragment to ``maxlen`` characters of text, keeping it well-formed.

    Only text counts towards the limit (an entity or character reference
    counts as one character); markup is copied through.  When the limit is
    passed the text is cut, " [...]" is appended and every element still
    open is closed.  GetText() returns the original input untouched when
    the limit was never passed.
    """

    # Elements that never take an end tag; tracking them as "open" would
    # make the truncated output end with bogus </br>, </img>, ... tags.
    VOID_TAGS = ('area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
                 'link', 'meta', 'param', 'source', 'track', 'wbr')

    def __init__(self, maxlen, title = None):
        HTMLParser.__init__(self)
        # Newer HTMLParser versions decode references into handle_data() by
        # default, which would drop their escaping from the output; ask for
        # them to be reported separately.  Ignored by versions without it.
        self.convert_charrefs = False
        self.len = 0            # text characters seen so far
        self.maxlen = maxlen
        self.fulltxt = ''       # everything fed, unmodified
        self.trunctxt = ''      # output built so far
        self.tagstack = []      # currently open elements, outermost first
        self.skiprest = False   # True once the output has been finalized
        self.title = title

    def feed(self, txt):
        txt = txt.lstrip()
        self.fulltxt += txt
        HTMLParser.feed(self, txt)

    def handle_startendtag(self, tag, attrs):
        if self.skiprest: return
        self.trunctxt += self.get_starttag_text()

    def handle_starttag(self, tag, attrs):
        if self.skiprest: return
        # Copy the tag exactly as written so attribute escaping and
        # valueless attributes survive.
        self.trunctxt += self.get_starttag_text()
        if tag not in self.VOID_TAGS:
            self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if self.skiprest: return
        if tag not in self.tagstack:
            # Stray end tag with no matching open element: drop it.
            return
        # Close any elements left open inside the one being closed, so the
        # stack and the emitted markup stay in step.
        while self.tagstack:
            opened = self.tagstack.pop()
            self.trunctxt += "</" + opened + ">"
            if opened == tag:
                break

    def handle_entityref(self, ref):
        self._AddReference("&" + ref + ";")

    def handle_charref(self, ref):
        self._AddReference("&#" + ref + ";")

    def handle_data(self, data):
        self.len += len(data)
        if self.skiprest: return
        if self.len <= self.maxlen:
            self.trunctxt += data
            return
        # Passed max length: keep only the part of this chunk that fits.
        kept = data[:len(data) - (self.len - self.maxlen)]
        # Terminate at whitespace if possible, looking at most 11 characters
        # back and never outside this chunk (so markup is never cut).
        for i in range(len(kept) - 1, max(len(kept) - 12, -1), -1):
            if kept[i].isspace():
                kept = kept[:i]
                break
        self.trunctxt += kept
        self._Finish()

    def _AddReference(self, markup):
        """Account for one entity/character reference, written as ``markup``."""
        self.len += 1
        if self.skiprest: return
        if self.len > self.maxlen:
            # The reference itself does not fit; leave it out entirely.
            self._Finish()
            return
        self.trunctxt += markup

    def _Finish(self):
        """Append the truncation marker, close open elements, stop output."""
        self.trunctxt += " [...]"
        # Close innermost elements first.
        self.tagstack.reverse()
        for tag in self.tagstack:
            self.trunctxt += "</" + tag + ">"
        self.skiprest = True

    def GetText(self):
        if self.len > self.maxlen:
            return self.trunctxt
        else:
            return self.fulltxt
  122 +
if __name__ == "__main__":
    # Script entry point: connect to the planet database and regenerate
    # the RSS and HTML output.
    connection = psycopg2.connect('dbname=planetpg host=/tmp')
    generator = Generator(connection)
    generator.Generate()
9 planet_run.sh
... ... @@ -0,0 +1,9 @@
#!/bin/sh
# Runs one aggregation pass followed by one output generation pass,
# appending all output of both to planet.log.

PATH=$PATH:/usr/local/bin

# Stop if the working directory is unavailable; otherwise the scripts and
# the log would be looked up / created relative to the wrong directory.
cd /home/planetpg/planet || exit 1
date >> planet.log
python aggregator.py >> planet.log 2>&1
python generator.py >>planet.log 2>&1
echo Done `date` >> planet.log
107 planethtml.py
... ... @@ -0,0 +1,107 @@
  1 +#!/usr/bin/env python
  2 +"""PostgreSQL Planet Aggregator
  3 +
  4 +This file contains the functions to generate HTML format output.
  5 +It's a fairly ugly hack compared to using a real template
  6 +system, but...
  7 +
  8 +Copyright (C) 2008 PostgreSQL Global Development Group
  9 +"""
  10 +
  11 +import datetime
  12 +
class PlanetHtml:
    """Collects posts and feeds and renders them as one XHTML page.

    Usage: AddItem() / AddFeed() for every row, then WriteFile() once.
    The page is accumulated in ``self.str``; BuildPosts(), BuildRight() and
    WriteFile() each append to it, so they are meant to run once per
    instance.
    """

    def __init__(self):
        self.items = []   # (guid, link, dat, title, author, blogurl, txt)
        self.feeds = []   # (name, blogurl, feedurl)
        self.str = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" dir="ltr">
 <head>
  <title>Planet PostgreSQL</title>
  <meta http-equiv="Content-Type" content="text/xhtml; charset=utf-8" />
  <link rel="shortcut icon" href="/favicon.ico" />
  <link rel="alternate" type="application/rss+xml" title="Planet PostgreSQL" href="http://planet.postgresql.org/rss20.xml" />
  <style type="text/css" media="screen" title="Normal Text">@import url("css/planet.css");</style>
 </head>
 <body>
  <div align="center">
   <div id="planetHeader">
    <div class="fl"><img src="http://www.postgresql.org/layout/images/hdr_left.png" border="0" alt="PostgreSQL" /></div>
    <div class="fr"><img width="210" height="80" src="http://www.postgresql.org/layout/images/hdr_right.png" alt="The world's most advanced open source database" /></div>
    <div class="cb"></div>
   </div>
   <div id="planetMain">
"""

    def AddItem(self,guid,link,dat,title,author,blogurl,txt):
        """Queue one post; ``dat`` must offer date() and time()."""
        self.items.append((guid,link,dat,title,author,blogurl,txt))

    def AddFeed(self,name,blogurl,feedurl):
        """Queue one feed for the subscription list."""
        self.feeds.append((name,blogurl,feedurl))

    def BuildPosts(self):
        """Append the left-hand column (all queued posts) to the page."""
        import re
        # Truncation marker at the very end of the text, optionally followed
        # only by the closing tags of elements that were open at the cut.
        marker = re.compile(r'\[\.\.\.\]((?:</[^<>]+>)*)$')

        parts = ["""    <div id="planetLeft">"""]
        lastdate = None
        for (guid, link, dat, title, author, blogurl, txt) in self.items:
            found = marker.search(txt)
            if found:
                # Turn "[...]" into "[continue reading...]" linking to the
                # post, keeping any trailing closing tags after it.
                txt = (txt[:found.start()]
                       + """[<a href="%s">continue reading...</a>]""" % (link)
                       + found.group(1))

            postdate = dat.date()
            if lastdate is None or lastdate != postdate:
                # First post of a new day: emit a date heading.
                parts.append("""
     <div class="planetNewDate">%s</div>""" % (postdate))
                lastdate = postdate

            if blogurl:
                posterstr = """<a class="author" href="%s">%s</a>""" % (blogurl, author)
            else:
                posterstr = author

            parts.append("""
     <div class="planetPost">
      <div class="planetPostTitle"><a href="%s">%s</a></div>
      <div class="planetPostAuthor">
       <div class="ppa_top">&nbsp;</div>
       <p>Posted by %s on <span class="date">%s at %s</span></p>
       <div class="ppa_bottom">&nbsp;</div>
      </div>
      <div class="planetPostContent">%s</div>
      <div class="cb"></div>
     </div>""" % (link, title, posterstr, postdate, dat.time(), txt))

        parts.append("""    </div>""")
        self.str += ''.join(parts)

    def BuildRight(self):
        """Append the right-hand column (subscriptions and feed links) to the page."""
        parts = ["""    <div id="planetRight">
<div class="planetRightTitle">Subscriptions</div>
<ul>"""]
        for (name, blogurl, feedurl) in self.feeds:
            # Same test as BuildPosts: an empty or missing blog URL means
            # the name is shown as plain text instead of a link.
            if blogurl:
                label = """<a href="%s">%s</a>""" % (blogurl, name)
            else:
                label = name
            parts.append("<li>" + label + """
<a href="%s"><img border="0" src="http://www.postgresql.org/layout/images/ico_rss.png" /></a></li>""" % (feedurl))
        parts.append("""    </ul>
    <div class="planetRightTitle">Feeds</div>
    <ul>
     <li><a href="rss20.xml">Planet PostgreSQL</a> <a href="rss20.xml"><img border="0" src="http://www.postgresql.org/layout/images/ico_rss.png" /></a></li>
    </ul>
   </div>
""")
        self.str += ''.join(parts)

    def WriteFile(self,filename):
        """Render both columns, close the page and write it to ``filename``."""
        self.BuildPosts()
        self.BuildRight()
        self.str += """
   </div>
  </div>
 </div>
</body>
</html>
"""
        f = open(filename,"w")
        try:
            f.write(self.str)
        finally:
            # Release the handle even if the write fails.
            f.close()
132 www/css/planet.css
... ... @@ -0,0 +1,132 @@
/* Planet PostgreSQL stylesheet: fixed 800px layout with a post column
   (#planetLeft) and a subscription sidebar (#planetRight). */

body {
    font-family: verdana, sans-serif;
    color: #000000;
    background-color: #ffffff;
    margin: 0 0 0 0;
    padding: 0 0 0 0;
    font-size: 13px;
}

/* Page banner */
div#planetHeader {
    width: 800px;
    background: url(http://www.postgresql.org/layout/images/hdr_fill.png);
    padding: 0 0 0 0;
    height: 80px;
    margin: 5px 0 2px 0;
}

div#planetMain {
    width: 800px;
    border: none;
}

/* Post column */
div#planetLeft {
    width: 590px;
    float:left;
    padding-top: 10px;
}

/* Per-day headings are emitted in the markup but currently hidden. */
div.planetNewDate {
    display: none;
    color: #ec5800;
    font-size: 20px;
    text-align: left;
    margin-left: 40px;
}

div.planetPost {
    margin-bottom: 15px;
    padding: 5px 5px 15px 5px;
    text-align: left;
    border-bottom: thin solid #dadada;
}

div.planetPostTitle {
    font-size: 20px;
    /* Boldness is set with font-weight; "font-style: bold" is not a valid
       declaration and is ignored. */
    font-weight: bold;
    font-family: verdana, helvetica, arial, sans-serif;
    padding-bottom: 15px;
}

div.planetPostTitle a {
    text-decoration: none;
}

div.planetPostTitle a:hover {
    text-decoration: underline;
}

/* Top and bottom corner images of the author box */
div.ppa_top {
    margin: 0 10px 0 -10px;
    height: 27px;
    background: url(/img/tleft.png) no-repeat top left;
}

div.ppa_bottom {
    margin: 0 10px 0 -10px;
    height: 27px;
    background: url(/img/bleft.png) no-repeat top left;
}

/* Author box floated to the left of the post content */
div.planetPostAuthor {
/*  color: #ec5800; */
    width: 57px;
    color: #909090;
    text-align: center;
    font-size: .8em;
    margin: 0;
    padding: 0 10px 0 10px;
    float: left;
    background-color: #dadada;
    overflow: hidden;
}

div.planetPostAuthor a.author {
    font-style: italic;
}

div.planetPostAuthor span.date {
    font-style: italic;
}

/* Left margin leaves room for the floated author box. */
div.planetPostContent {
    text-align: left;
/*  float: right;
    width: 480px; */
    margin-left: 85px;
}

/* Images inside posts are centered and capped at 100x100. */
div.planetPostContent img {
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-height: 100px;
    max-width: 100px;
}

/* Sidebar */
div#planetRight {
    width: 190px;
    margin-top: 10px;
    float: right;
    text-align: left;
}

div#planetRight div.planetRightTitle {
    font-weight: bold;
    font-size: 20px;
}

div#planetRight ul {
    list-style: none;
    padding-left: 5px;
}

/* Float helpers */
div.fl { float: left; border: none; text-align: left; }
div.fr { float: right; }
div.cb { clear: both; }

/* Links */
a:link { color:#0085B0; text-decoration: underline; }
a:visited { color:#004E66; text-decoration: underline; }
a:active { color:#0085B0; text-decoration: underline; }
a:hover { color:#000000; text-decoration: underline; }
  132 +
BIN  www/img/bleft.png
BIN  www/img/tleft.png

0 comments on commit f28b73a

Please sign in to comment.
Something went wrong with that request. Please try again.