Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit f28b73a
Showing
9 changed files
with
3,343 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,70 @@ | |||
#!/usr/bin/env python | |||
"""PostgreSQL Planet Aggregator | |||
This file contains the functions to suck down RSS/Atom feeds | |||
(using feedparser) and store the results in a PostgreSQL database. | |||
Copyright (C) 2008 PostgreSQL Global Development Group | |||
""" | |||
|
|||
import psycopg2 | |||
import feedparser | |||
import datetime | |||
import socket | |||
|
|||
class Aggregator:
    """Fetch the feeds listed in planet.feeds and store unseen posts.

    ``db`` is the database connection given to the constructor; this class
    calls ``cursor()`` and ``commit()`` on it. ``stored`` counts the rows
    inserted into planet.posts by this instance.
    """

    def __init__(self, db):
        self.db = db
        self.stored = 0
        # Default timeout (seconds) for sockets created from here on;
        # presumably there so an unresponsive feed server cannot hang the run.
        socket.setdefaulttimeout(20)

    def Update(self):
        """Parse every feed in planet.feeds, then commit once at the end."""
        feeds = self.db.cursor()
        feeds.execute('SELECT id,feedurl,name,lastget FROM planet.feeds')
        for feed in feeds.fetchall():
            self.ParseFeed(feed)
        self.db.commit()

    @staticmethod
    def _ModifiedSince(lastget):
        """Return the ``modified`` argument for feedparser.parse().

        A feed that has never been fetched has no ``lastget`` value (NULL in
        the database, None here); in that case None is returned so the feed
        is requested unconditionally instead of crashing on ``.timetuple()``.
        """
        if lastget is None:
            return None
        return lastget.timetuple()

    def ParseFeed(self, feedinfo):
        """Fetch one feed and store each of its entries via StoreEntry().

        ``feedinfo`` is one row of the query in Update():
        (id, feedurl, name, lastget).
        """
        feedid = feedinfo[0]
        feedurl = feedinfo[1]

        # Taken before the fetch and written back as the new lastget below.
        parsestart = datetime.datetime.now()
        feed = feedparser.parse(feedurl,
                                modified=self._ModifiedSince(feedinfo[3]))

        # 'status' is absent when no HTTP response was received at all
        # (e.g. timeout or DNS failure); treat that like any other failure
        # instead of raising AttributeError and aborting the whole run.
        status = feed.get('status')
        if status == 304:
            # not changed
            return
        if status != 200:
            # not ok!
            print("Feed %s status %s" % (feedurl, status))
            return

        for entry in feed.entries:
            if 'summary' in entry:
                txt = entry.summary
            else:
                txt = entry.content[0].value
            if 'guidislink' in entry:
                guidisperma = entry.guidislink
            else:
                guidisperma = True
            self.StoreEntry(feedid, entry.id, entry.date, entry.link,
                            guidisperma, entry.title, txt)

        self.db.cursor().execute(
            'UPDATE planet.feeds SET lastget=%(lg)s WHERE id=%(feed)s',
            {'lg': parsestart, 'feed': feedid})

    def StoreEntry(self, feedid, guid, date, link, guidisperma, title, txt):
        """Insert one post unless a row with this (feed, guid) already exists.

        Increments ``self.stored`` for every row actually inserted.
        """
        c = self.db.cursor()
        c.execute("SELECT id FROM planet.posts WHERE feed=%(feed)s AND guid=%(guid)s",
                  {'feed': feedid, 'guid': guid})
        if c.rowcount > 0:
            # Already stored.
            return
        print("Store entry %s from feed %s" % (guid, feedid))
        c.execute("INSERT INTO planet.posts (feed,guid,link,guidisperma,dat,title,txt) VALUES (%(feed)s,%(guid)s,%(link)s,%(guidisperma)s,%(date)s,%(title)s,%(txt)s)",
                  {'feed': feedid,
                   'guid': guid,
                   'link': link,
                   'guidisperma': guidisperma,
                   'date': date,
                   'title': title,
                   'txt': txt})
        self.stored += 1
|
|||
if __name__ == "__main__":
    # Script entry point: open the database connection described by the
    # DSN below and run a single aggregation pass over all feeds.
    connection = psycopg2.connect('dbname=planetpg host=/tmp/')
    aggregator = Aggregator(connection)
    aggregator.Update()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,43 @@ | |||
#!/usr/bin/env python | |||
"""PostgreSQL Planet Aggregator | |||
This file contains the functions to fetch RSS/Atom feeds | |||
(using feedparser), determine the actual blog URL (for the | |||
HTML posts), and update the database with it. | |||
Copyright (C) 2008 PostgreSQL Global Development Group | |||
""" | |||
|
|||
import psycopg2 | |||
import feedparser | |||
import datetime | |||
import socket | |||
|
|||
class Aggregator:
    """Fill in planet.feeds.blogurl for feeds that do not have one yet.

    ``db`` is the database connection given to the constructor; this class
    calls ``cursor()`` and ``commit()`` on it.
    """

    def __init__(self, db):
        self.db = db
        self.stored = 0
        # Default timeout (seconds) for sockets created from here on;
        # presumably there so an unresponsive feed server cannot hang the run.
        socket.setdefaulttimeout(20)

    def Update(self):
        """Run DiscoverFeed() on every feed with an empty blogurl, then commit."""
        feeds = self.db.cursor()
        # Executed without a parameter argument on purpose: the literal '%'
        # characters in the LIKE pattern would need escaping as '%%' if
        # parameters were ever added to this call.
        feeds.execute("SELECT id,feedurl,name,blogurl FROM planet.feeds WHERE blogurl='' AND feedurl NOT LIKE '%planet%'")
        for feed in feeds.fetchall():
            self.DiscoverFeed(feed)
        self.db.commit()

    def DiscoverFeed(self, feedinfo):
        """Fetch one feed and record the site link it advertises, if any.

        ``feedinfo`` is one row of the query in Update():
        (id, feedurl, name, blogurl).
        """
        feed = feedparser.parse(feedinfo[1])

        # 'status' is absent when no HTTP response was received at all
        # (e.g. timeout or DNS failure); report it like any other failure
        # instead of raising AttributeError and aborting the whole run.
        status = feed.get('status')
        if status != 200:
            # not ok!
            print("Feed %s status %s" % (feedinfo[1], status))
            return

        # Feeds are not required to carry a link; skip those that do not.
        link = feed.feed.get('link')
        if link:
            print("Setting feed for %s to %s" % (feedinfo[2], link))
            self.StoreBlogUrl(feedinfo[0], link)

    def StoreBlogUrl(self, feedid, blogurl):
        """Set blogurl on the feed row with the given id.

        The URL comes from a remote feed and is therefore untrusted, so it is
        passed as a bound parameter rather than interpolated into the SQL.
        """
        c = self.db.cursor()
        c.execute("UPDATE planet.feeds SET blogurl=%(blogurl)s WHERE id=%(feed)s",
                  {'blogurl': blogurl, 'feed': feedid})
|
|||
if __name__ == "__main__":
    # Script entry point: open the database connection described by the
    # DSN below and run a single blog-URL discovery pass.
    connection = psycopg2.connect('dbname=planetpg host=/tmp/')
    aggregator = Aggregator(connection)
    aggregator.Update()
Oops, something went wrong.