import feedparser, pymongo, json, hashlib, bson, threading, time
from dateutil import parser # For easily parsing strings to Date
from BeautifulSoup import BeautifulSoup # For Parsing descriptions
import keyword_extractor
import shared
class RssFetcher(threading.Thread):
    """Background thread that periodically downloads a Google News RSS feed.

    Every cycle the feed at ``rss_link`` is parsed, each entry is turned into
    a story dict (title, links, summary, date, keywords) and the resulting
    list is published through ``shared.stories``.
    """

    def __init__(self, rss="", verbose=False, sleeptime=500):
        """Set up the fetcher.

        Args:
            rss: URL of the RSS feed to poll.
            verbose: When true, progress messages are printed to stdout.
            sleeptime: Number of seconds to pause between two fetches.
        """
        # The base-class initialiser must run, otherwise start() raises
        # RuntimeError("thread.__init__() not called").
        threading.Thread.__init__(self)
        self.extractor = keyword_extractor.KeywordExtractor()
        self.rss_link = rss
        self.verbose = verbose
        self.sleeptime = sleeptime

    def run(self):
        """Thread body: fetch the feed, sleep ``sleeptime`` seconds, repeat."""
        while True:
            self.getNews()
            # Pause between polls so the loop does not spin and hammer the
            # feed; the duration is self.sleeptime seconds.
            if self.verbose:
                print("[INFO] RSS Thread: Going to sleep for {0}.".format(
                    self.sleeptime))
            time.sleep(self.sleeptime)
            if self.verbose:
                print("[INFO] RSS Thread: Waking up.")
            # NOTE(review): shared.flag is presumably read by the tweet
            # thread; its exact meaning is defined outside this class --
            # confirm against the consumer.
            shared.flag = True

    @staticmethod
    def gNews_title_fix(title):
        """Drop the last hyphen of a Google News title and all text after it.

        A title that contains no hyphen is returned unchanged instead of
        raising ValueError.
        """
        last_dash = title.rfind('-')
        if last_dash == -1:
            return title
        return title[:last_dash]

    @staticmethod
    def gNews_get_link_main_story(link):
        """Extract the news URL carried in the ``&url=`` part of a Google News link.

        If the link has no ``&url=`` marker it is returned unchanged.
        """
        marker = "&url="
        position = link.find(marker)
        if position == -1:
            return link
        return link[position + len(marker):]

    @staticmethod
    def _gNews_font_cells(description):
        """Return the ``<font size="-1">`` tags of the first ``<div class="lh">``.

        Raises:
            IndexError: if the description contains no such ``div``.
        """
        first_block = BeautifulSoup(description).findAll('div', {'class': 'lh'})[0]
        return first_block.findAll('font', {'size': '-1'})

    @staticmethod
    def gNews_get_summary(description):
        """Return the first child of the second small-font tag of the description.

        Raises:
            IndexError: if the description HTML lacks the expected tags.
        """
        return RssFetcher._gNews_font_cells(description)[1].contents[0]

    @staticmethod
    def gNews_get_link(description):
        """Return the ``href`` of the anchor inside the last small-font tag.

        Raises:
            IndexError: if the description HTML lacks the expected tags.
        """
        return RssFetcher._gNews_font_cells(description)[-1].a['href']

    def getNews(self):
        """Download news stories and put them in the shared list"""
        if self.verbose:
            print("[INFO] RSS Thread: Fetching news feed from {0}.".format(
                self.rss_link))
        feed = feedparser.parse(self.rss_link)
        news_stories = []
        for entry in feed["items"]:
            if self.verbose:
                print("[INFO] RSS Thread: Parsing story {0}.".format(
                    entry["title"]))
            description = entry["description"]
            news_story = {}
            news_story["title"] = RssFetcher.gNews_title_fix(entry["title"])
            news_story["link_main_story"] = RssFetcher.gNews_get_link_main_story(entry["link"])
            news_story["link"] = RssFetcher.gNews_get_link(description)
            news_story["summary"] = RssFetcher.gNews_get_summary(description)
            news_story["date"] = parser.parse(entry["updated"])
            news_story["keywords"] = self.extractor.getKeywordsByURL(news_story["link"])
            # Collect the story; without this the published list stays empty.
            news_stories.append(news_story)
        if self.verbose:
            print("[INFO] RSS Thread: Putting a new set of stories into the shared list.")
        shared.stories = news_stories
if __name__ == "__main__":
    # Script entry point: build a fetcher with the default (empty) feed URL.
    # The thread object is only constructed here; it is not started.
    r = RssFetcher(rss="")