Initial version of new planet code
mhagander committed Oct 18, 2008
commit f28b73a (root commit, 0 parents)
Showing 9 changed files with 3,343 additions and 0 deletions.
70 changes: 70 additions & 0 deletions aggregator.py
@@ -0,0 +1,70 @@
#!/usr/bin/env python
"""PostgreSQL Planet Aggregator

This file contains the functions to suck down RSS/Atom feeds
(using feedparser) and store the results in a PostgreSQL database.

Copyright (C) 2008 PostgreSQL Global Development Group
"""

import psycopg2
import feedparser
import datetime
import socket

class Aggregator:
    def __init__(self, db):
        self.db = db
        self.stored = 0
        # Don't let one unresponsive feed hang the whole run
        socket.setdefaulttimeout(20)

    def Update(self):
        feeds = self.db.cursor()
        feeds.execute('SELECT id,feedurl,name,lastget FROM planet.feeds')
        for feed in feeds.fetchall():
            self.ParseFeed(feed)
        self.db.commit()

    def ParseFeed(self, feedinfo):
        #print "Loading feed %s" % (feedinfo[1])
        parsestart = datetime.datetime.now()
        # Pass the time of the last fetch so the server can answer
        # 304 Not Modified instead of resending the whole feed
        feed = feedparser.parse(feedinfo[1], modified=feedinfo[3].timetuple())

        if feed.status == 304:
            # not changed
            return
        if feed.status != 200:
            # not ok!
            print "Feed %s status %s" % (feedinfo[1], feed.status)
            return

        for entry in feed.entries:
            if entry.has_key('summary'):
                txt = entry.summary
            else:
                txt = entry.content[0].value
            if entry.has_key('guidislink'):
                guidisperma = entry.guidislink
            else:
                guidisperma = True
            self.StoreEntry(feedinfo[0], entry.id, entry.date, entry.link, guidisperma, entry.title, txt)
        # Remember when this fetch started, for the next conditional request
        self.db.cursor().execute('UPDATE planet.feeds SET lastget=%(lg)s WHERE id=%(feed)s', {'lg': parsestart, 'feed': feedinfo[0]})

    def StoreEntry(self, feedid, guid, date, link, guidisperma, title, txt):
        c = self.db.cursor()
        # Skip entries already stored for this feed
        c.execute("SELECT id FROM planet.posts WHERE feed=%(feed)s AND guid=%(guid)s", {'feed': feedid, 'guid': guid})
        if c.rowcount > 0:
            return
        print "Store entry %s from feed %s" % (guid, feedid)
        c.execute("INSERT INTO planet.posts (feed,guid,link,guidisperma,dat,title,txt) VALUES (%(feed)s,%(guid)s,%(link)s,%(guidisperma)s,%(date)s,%(title)s,%(txt)s)",
                  {'feed': feedid,
                   'guid': guid,
                   'link': link,
                   'guidisperma': guidisperma,
                   'date': date,
                   'title': title,
                   'txt': txt})
        self.stored += 1

if __name__ == "__main__":
    Aggregator(psycopg2.connect('dbname=planetpg host=/tmp/')).Update()
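
The queries above assume a planet schema with feeds and posts tables that
this commit does not create. Below is a minimal setup sketch: the column
lists follow directly from the SELECT and INSERT statements, but the types,
constraints, and defaults are assumptions. A non-NULL default on lastget
matters because ParseFeed calls timetuple() on it for every feed.

#!/usr/bin/env python
# Sketch of the schema the aggregator expects; types, constraints, and
# defaults are assumptions, not part of this commit.
import psycopg2

db = psycopg2.connect('dbname=planetpg host=/tmp/')
c = db.cursor()
c.execute("""
CREATE SCHEMA planet;

CREATE TABLE planet.feeds (
    id      serial PRIMARY KEY,
    feedurl text NOT NULL,
    name    text NOT NULL,
    blogurl text NOT NULL DEFAULT '',
    -- 'epoch' makes the first conditional fetch ask for everything
    lastget timestamptz NOT NULL DEFAULT 'epoch'
);

CREATE TABLE planet.posts (
    id          serial PRIMARY KEY,
    feed        integer NOT NULL REFERENCES planet.feeds (id),
    guid        text NOT NULL,
    link        text NOT NULL,
    guidisperma boolean NOT NULL,
    dat         timestamptz NOT NULL,
    title       text NOT NULL,
    txt         text NOT NULL
);
""")
db.commit()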
43 changes: 43 additions & 0 deletions discovery.py
@@ -0,0 +1,43 @@
#!/usr/bin/env python
"""PostgreSQL Planet Aggregator

This file contains the functions to suck down RSS/Atom feeds
(using feedparser), determine the actual blog URL (for the
HTML posts), and update the database with it.

Copyright (C) 2008 PostgreSQL Global Development Group
"""

import psycopg2
import feedparser
import datetime
import socket

class Aggregator:
    def __init__(self, db):
        self.db = db
        self.stored = 0
        socket.setdefaulttimeout(20)

    def Update(self):
        feeds = self.db.cursor()
        # Only feeds with no blog URL recorded yet; feed URLs containing
        # 'planet' are left alone
        feeds.execute("SELECT id,feedurl,name,blogurl FROM planet.feeds WHERE blogurl='' AND feedurl NOT LIKE '%planet%'")
        for feed in feeds.fetchall():
            self.DiscoverFeed(feed)
        self.db.commit()

    def DiscoverFeed(self, feedinfo):
        feed = feedparser.parse(feedinfo[1])

        if feed.status != 200:
            # not ok!
            print "Feed %s status %s" % (feedinfo[1], feed.status)
            return

        if feed.feed.link:
            print "Setting feed for %s to %s" % (feedinfo[2], feed.feed.link)
            c = self.db.cursor()
            # Parameterized rather than interpolated into the SQL string,
            # so quotes in a URL can't break (or inject into) the query
            c.execute("UPDATE planet.feeds SET blogurl=%(url)s WHERE id=%(id)s",
                      {'url': feed.feed.link, 'id': feedinfo[0]})

if __name__ == "__main__":
    Aggregator(psycopg2.connect('dbname=planetpg host=/tmp/')).Update()
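
Both scripts lean on feedparser's parsed-feed object: ParseFeed passes the
stored lastget timestamp as the modified argument so unchanged feeds come
back as HTTP 304, and DiscoverFeed reads the blog's HTML address from
feed.feed.link. A quick way to see those values for a candidate feed before
adding it to planet.feeds (the URL is a stand-in, not from this commit):

#!/usr/bin/env python
# Inspect what the aggregator would see for one feed (Python 2, to match
# the code above). The feed URL is a placeholder.
import feedparser

d = feedparser.parse('http://example.com/blog/index.rss')
print "HTTP status: %s" % d.status        # 200 expected on a fresh fetch
print "Blog link:   %s" % d.feed.link     # what discovery.py stores in blogurl
print "Entries:     %d" % len(d.entries)  # what aggregator.py would iterate over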
