Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100755 130 lines (112 sloc) 4.177 kb
f28b73a @mhagander Initial version of new planet code
authored
1 #!/usr/bin/env python
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
2 # vim: ai ts=4 sts=4 sw=4
f28b73a @mhagander Initial version of new planet code
authored
3 """PostgreSQL Planet Aggregator
4
5 This file contains the functions to suck down RSS/Atom feeds
6 (using feedparser) and store the results in a PostgreSQL database.
7
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
8 Copyright (C) 2008-2009 PostgreSQL Global Development Group
f28b73a @mhagander Initial version of new planet code
authored
9 """
10
11 import psycopg2
12 import feedparser
13 import datetime
14 import socket
ce807cd @mhagander Read database from a configfile, so beta can now easily have
authored
15 import ConfigParser
f28b73a @mhagander Initial version of new planet code
authored
16
17 class Aggregator:
18 def __init__(self, db):
19 self.db = db
20 self.stored = 0
281ccb7 @mhagander Support filtering feeds by author names, to pull only parts of a shared ...
authored
21 self.authorfilter = None
f28b73a @mhagander Initial version of new planet code
authored
22 socket.setdefaulttimeout(20)
23
24 def Update(self):
25 feeds = self.db.cursor()
281ccb7 @mhagander Support filtering feeds by author names, to pull only parts of a shared ...
authored
26 feeds.execute('SELECT id,feedurl,name,lastget,authorfilter FROM planet.feeds')
f28b73a @mhagander Initial version of new planet code
authored
27 for feed in feeds.fetchall():
cdd608f @mhagander Handle exceptions during parsing. Showed up when the domain of one of ou...
authored
28 try:
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
29 n = self.ParseFeed(feed)
30 if n > 0:
31 c = self.db.cursor()
32 c.execute("INSERT INTO planet.aggregatorlog (feed, success, info) VALUES (%(feed)s, 't', %(info)s)", {
33 'feed': feed[0],
34 'info': 'Fetched %s posts.' % n,
35 })
cdd608f @mhagander Handle exceptions during parsing. Showed up when the domain of one of ou...
authored
36 except Exception, e:
37 print "Exception when parsing feed '%s': %s" % (feed[1], e)
4e0b730 @mhagander Commit once for each feed. If an exception occurred while parsing it,
authored
38 self.db.rollback()
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
39 c = self.db.cursor()
40 c.execute("INSERT INTO planet.aggregatorlog (feed, success, info) VALUES (%(feed)s, 'f', %(info)s)", {
41 'feed': feed[0],
42 'info': 'Error: "%s"' % e,
43 })
4e0b730 @mhagander Commit once for each feed. If an exception occurred while parsing it,
authored
44 self.db.commit()
f28b73a @mhagander Initial version of new planet code
authored
45
46 def ParseFeed(self, feedinfo):
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
47 numadded = 0
f28b73a @mhagander Initial version of new planet code
authored
48 parsestart = datetime.datetime.now()
49 feed = feedparser.parse(feedinfo[1], modified=feedinfo[3].timetuple())
01557f1 @mhagander Better error checking - show actual exceptions when status field is
authored
50
51 if not hasattr(feed, 'status'):
52 # bozo_excpetion can seemingly be set when there is no error as well,
53 # so make sure we only check if we didn't get a status.
54 if hasattr(feed,'bozo_exception'):
55 raise Exception('Feed load error %s' % feed.bozo_exception)
56 raise Exception('Feed load error with not exception!')
f28b73a @mhagander Initial version of new planet code
authored
57
58 if feed.status == 304:
59 # not changed
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
60 return 0
f28b73a @mhagander Initial version of new planet code
authored
61 if feed.status != 200:
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
62 raise Exception('Feed returned status %s' % feed.status)
f28b73a @mhagander Initial version of new planet code
authored
63
281ccb7 @mhagander Support filtering feeds by author names, to pull only parts of a shared ...
authored
64 self.authorfilter = feedinfo[4]
65
f28b73a @mhagander Initial version of new planet code
authored
66 for entry in feed.entries:
281ccb7 @mhagander Support filtering feeds by author names, to pull only parts of a shared ...
authored
67 if not self.matches_filter(entry):
68 continue
69
149938c @mhagander Re-order the way we try to get the text from a blog post. This
authored
70 # Grab the entry. At least atom feeds from wordpress store what we
71 # want in entry.content[0].value and *also* has a summary that's
72 # much shorter. Other blog software store what we want in the summary
73 # attribute. So let's just try one after another until we hit something.
74 try:
f28b73a @mhagander Initial version of new planet code
authored
75 txt = entry.content[0].value
149938c @mhagander Re-order the way we try to get the text from a blog post. This
authored
76 except:
77 txt = ''
78 if txt == '' and entry.has_key('summary'):
79 txt = entry.summary
80 if txt == '':
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
81 # Not a critical error, we just ignore empty posts
149938c @mhagander Re-order the way we try to get the text from a blog post. This
authored
82 print "Failed to get text for entry at %s" % entry.link
83 continue
84
f28b73a @mhagander Initial version of new planet code
authored
85 if entry.has_key('guidislink'):
86 guidisperma = entry.guidislink
87 else:
88 guidisperma = True
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
89 if self.StoreEntry(feedinfo[0], entry.id, entry.date, entry.link, guidisperma, entry.title, txt) > 0:
90 numadded += 1
91 if numadded > 0:
92 self.db.cursor().execute("UPDATE planet.feeds SET lastget=COALESCE((SELECT max(dat) FROM planet.posts WHERE planet.posts.feed=planet.feeds.id),'2000-01-01') WHERE planet.feeds.id=%(feed)s", {'feed': feedinfo[0]})
93 return numadded
584b3f5 @mhagander Set lastget based on what the RSS feed contained, and not when we last c...
authored
94
281ccb7 @mhagander Support filtering feeds by author names, to pull only parts of a shared ...
authored
95 def matches_filter(self, entry):
96 # For now, we only match against self.authorfilter. In the future,
97 # there may be more filters.
98 if self.authorfilter:
99 # Match against an author filter
100
101 if entry.has_key('author_detail'):
102 return entry.author_detail.name == self.authorfilter
103 else:
104 return False
105
106 # No filters, always return true
107 return True
108
f28b73a @mhagander Initial version of new planet code
authored
109 def StoreEntry(self, feedid, guid, date, link, guidisperma, title, txt):
110 c = self.db.cursor()
111 c.execute("SELECT id FROM planet.posts WHERE feed=%(feed)s AND guid=%(guid)s", {'feed':feedid, 'guid':guid})
112 if c.rowcount > 0:
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
113 return 0
f28b73a @mhagander Initial version of new planet code
authored
114 print "Store entry %s from feed %s" % (guid, feedid)
115 c.execute("INSERT INTO planet.posts (feed,guid,link,guidisperma,dat,title,txt) VALUES (%(feed)s,%(guid)s,%(link)s,%(guidisperma)s,%(date)s,%(title)s,%(txt)s)",
116 {'feed': feedid,
117 'guid': guid,
118 'link': link,
119 'guidisperma': guidisperma,
120 'date': date,
121 'title': title,
122 'txt': txt})
123 self.stored += 1
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored
124 return 1
f28b73a @mhagander Initial version of new planet code
authored
125
if __name__=="__main__":
    # Script entry point: read the connection string from planet.ini and
    # run a single aggregation pass over every configured feed.
    config = ConfigParser.ConfigParser()
    config.read('planet.ini')
    conn = psycopg2.connect(config.get('planet', 'db'))
    Aggregator(conn).Update()
Something went wrong with that request. Please try again.