Skip to content
Newer
Older
100755 137 lines (118 sloc) 4.19 KB
f28b73a @mhagander Initial version of new planet code
authored Oct 18, 2008
1 #!/usr/bin/env python
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored Jan 31, 2009
2 # vim: ai ts=4 sts=4 sw=4
f28b73a @mhagander Initial version of new planet code
authored Oct 18, 2008
3 """PostgreSQL Planet Aggregator
4
5 This file contains the functions to suck down RSS/Atom feeds
6 (using feedparser) and store the results in a PostgreSQL database.
7
cf93080 @mhagander Make the aggregator write it's log to the database instead of just
authored Jan 31, 2009
8 Copyright (C) 2008-2009 PostgreSQL Global Development Group
f28b73a @mhagander Initial version of new planet code
authored Oct 18, 2008
9 """
10
11 import psycopg2
12 import feedparser
13 import datetime
14 import socket
ce807cd @mhagander Read database from a configfile, so beta can now easily have
authored Oct 24, 2008
15 import ConfigParser
f28b73a @mhagander Initial version of new planet code
authored Oct 18, 2008
16
class Aggregator:
    """Fetch every feed listed in planet.feeds and store new posts.

    The aggregator reads the feed list from the database, downloads each
    feed with feedparser, inserts previously unseen entries into
    planet.posts and writes one row per notable outcome to
    planet.aggregatorlog.
    """

    def __init__(self, db):
        """Set up the aggregator.

        db is an open DB-API connection (the script entry point passes a
        psycopg2 connection). Note that this also sets the process-wide
        default socket timeout to 20 seconds.
        """
        self.db = db
        # Number of posts inserted by this instance so far.
        self.stored = 0
        # Author filter of the feed currently being parsed (None = no filter).
        self.authorfilter = None
        socket.setdefaulttimeout(20)

    def Update(self):
        """Process all feeds, committing once per feed.

        A failure while parsing one feed is printed, rolled back and logged
        to planet.aggregatorlog as unsuccessful; the remaining feeds are
        still processed.
        """
        feeds = self.db.cursor()
        feeds.execute('SELECT id,feedurl,name,lastget,authorfilter FROM planet.feeds')
        for feed in feeds.fetchall():
            try:
                n = self.ParseFeed(feed)
                # Only log successful runs that actually stored something.
                if n > 0:
                    c = self.db.cursor()
                    c.execute("INSERT INTO planet.aggregatorlog (feed, success, info) VALUES (%(feed)s, 't', %(info)s)", {
                        'feed': feed[0],
                        'info': 'Fetched %s posts.' % n,
                    })
            except Exception as e:
                print("Exception when parsing feed '%s': %s" % (feed[1], e))
                # Discard whatever was half-stored for this feed before
                # writing the failure record.
                self.db.rollback()
                c = self.db.cursor()
                c.execute("INSERT INTO planet.aggregatorlog (feed, success, info) VALUES (%(feed)s, 'f', %(info)s)", {
                    'feed': feed[0],
                    'info': 'Error: "%s"' % e,
                })
            self.db.commit()

    def ParseFeed(self, feedinfo):
        """Download one feed and store its new entries.

        feedinfo is a row from planet.feeds in the column order
        (id, feedurl, name, lastget, authorfilter).

        Returns the number of posts that were newly stored; 0 if the server
        answered 304 (not modified).

        Raises Exception if the fetch produced no HTTP status at all or a
        status other than 200/304.
        """
        numadded = 0

        # Pass the last fetch time so the server can answer 304. A missing
        # lastget (NULL in the database) simply means an unconditional fetch.
        lastget = feedinfo[3]
        modified = lastget.timetuple() if lastget is not None else None
        feed = feedparser.parse(feedinfo[1], modified=modified)

        if not hasattr(feed, 'status'):
            # bozo_exception can seemingly be set when there is no error as
            # well, so make sure we only check if we didn't get a status.
            if hasattr(feed, 'bozo_exception'):
                raise Exception('Feed load error %s' % feed.bozo_exception)
            raise Exception('Feed load error with not exception!')

        if feed.status == 304:
            # not changed
            return 0
        if feed.status != 200:
            raise Exception('Feed returned status %s' % feed.status)

        # matches_filter() reads the filter from the instance.
        self.authorfilter = feedinfo[4]

        for entry in feed.entries:
            if not self.matches_filter(entry):
                continue

            txt = self._longest_text(entry)
            if txt == '':
                # Not a critical error, we just ignore empty posts
                print("Failed to get text for entry at %s" % entry.link)
                continue

            # Entries that don't say otherwise are assumed to use their
            # guid as the permalink.
            if 'guidislink' in entry:
                guidisperma = entry.guidislink
            else:
                guidisperma = True
            if self.StoreEntry(feedinfo[0], entry.id, entry.date, entry.link, guidisperma, entry.title, txt) > 0:
                numadded += 1

        if numadded > 0:
            # Set lastget to the newest post date stored for this feed
            # (what the feed contained, not the time we fetched it).
            self.db.cursor().execute("UPDATE planet.feeds SET lastget=COALESCE((SELECT max(dat) FROM planet.posts WHERE planet.posts.feed=planet.feeds.id),'2000-01-01') WHERE planet.feeds.id=%(feed)s", {'feed': feedinfo[0]})
        return numadded

    def _longest_text(self, entry):
        """Return the longest body text available for entry, or ''.

        At least atom feeds from wordpress store what we want in
        entry.content[0].value and *also* have a summary that's much
        shorter. We therefore collect all available texts and pick the
        longest. An entry with neither content nor summary yields ''.
        """
        txtalts = []
        try:
            txtalts.append(entry.content[0].value)
        except (AttributeError, KeyError, IndexError):
            # No usable content element; fall back to the summary, if any.
            pass
        if 'summary' in entry:
            txtalts.append(entry.summary)

        if not txtalts:
            # max() on an empty list would raise and abort the whole feed.
            return ''
        return max(txtalts, key=len)

    def matches_filter(self, entry):
        """Return True if entry passes the currently active filters.

        For now, we only match against self.authorfilter. In the future,
        there may be more filters.
        """
        if self.authorfilter:
            # Match against an author filter; prefer the structured author
            # name and fall back to the plain author string.
            if 'author_detail' in entry:
                return entry.author_detail.name == self.authorfilter
            elif 'author' in entry:
                return entry.author == self.authorfilter
            else:
                return False

        # No filters, always return true
        return True

    def StoreEntry(self, feedid, guid, date, link, guidisperma, title, txt):
        """Insert a post into planet.posts unless (feed, guid) already exists.

        Returns 1 if a row was inserted and 0 if the post was already
        stored. Does not commit; the caller owns the transaction.
        """
        c = self.db.cursor()
        c.execute("SELECT id FROM planet.posts WHERE feed=%(feed)s AND guid=%(guid)s", {'feed':feedid, 'guid':guid})
        if c.rowcount > 0:
            return 0
        print("Store entry %s from feed %s" % (guid, feedid))
        c.execute("INSERT INTO planet.posts (feed,guid,link,guidisperma,dat,title,txt) VALUES (%(feed)s,%(guid)s,%(link)s,%(guidisperma)s,%(date)s,%(title)s,%(txt)s)",
            {'feed': feedid,
             'guid': guid,
             'link': link,
             'guidisperma': guidisperma,
             'date': date,
             'title': title,
             'txt': txt})
        self.stored += 1
        return 1
f28b73a @mhagander Initial version of new planet code
authored Oct 18, 2008
132
if __name__ == "__main__":
    # Script entry point: read the connection string from the [planet]
    # section of planet.ini, connect, and run one pass over all feeds.
    config = ConfigParser.ConfigParser()
    config.read('planet.ini')
    connstr = config.get('planet', 'db')
    Aggregator(psycopg2.connect(connstr)).Update()
Something went wrong with that request. Please try again.