Skip to content

Commit

Permalink
Adds short summaries and main story links from Google News
Browse files Browse the repository at this point in the history
  • Loading branch information
mckk committed Nov 4, 2011
1 parent 99de8fb commit 5bcbd51
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 6 deletions.
11 changes: 9 additions & 2 deletions analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ def add_stories_to_mongo(self, stories):
if self.verbose:
print "[INFO] Analysis Thread: Story {0} is in db. Updating date.".format(story["title"])
in_db.update({"$set": {"date": story["date"]}})
if self.verbose:
print "[INFO] Analysis Thread: Story {0} is in db. Updating keywords.".format(story["title"])
in_db.update({"$set": {"keywords": story["keywords"]}})
if self.verbose:
print "[INFO] Analysis Thread: Story {0} is in db. Updating link.".format(story["title"])
in_db.update({"$set": {"link": story["link"]}})
if self.verbose:
print "[INFO] Analysis Thread: Story {0} is in db. Updating main story link.".format(story["title"])
in_db.update({"$set": {"link_main_story": story["link_main_story"]}})

def add_new_time_period_to_stories(self, stories, start, end):
"""Goes through tweets posted between start and end and assigns them to appropriate stories"""
Expand All @@ -60,8 +69,6 @@ def add_new_time_period_to_stories(self, stories, start, end):
print "[INFO] Analysis Thread: Loading tweets from {0} to {1}.".format(start,end)
tweets_in_time_period = self.tweet_collection.find({"created_at": {"$gte": start, "$lt": end}})
for tweet in tweets_in_time_period:
if self.verbose:
print " " + tweet["text"]
for story in stories:
for keyword in story["keywords"]:
keyword_words = keyword.split()
Expand Down
18 changes: 14 additions & 4 deletions rss_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from dateutil import parser # For easily parsing strings to Date

from BeautifulSoup import BeautifulSoup # For Parsing descriptions

import keyword_extractor

import shared
Expand Down Expand Up @@ -39,10 +41,18 @@ def gNews_title_fix(title):
return title[0:dashOccurence]

@staticmethod
def gNews_get_link(link):
def gNews_get_link_main_story(link):
    """Extract the real article URL from a Google News redirect URL.

    Google News RSS entry links embed the destination article URL after
    an "&url=" query parameter; everything after that marker is the
    target URL.

    Returns the input unchanged when the marker is absent. (Previously
    a missing marker made str.find return -1, so the slice silently
    dropped the first four characters of the link.)
    """
    marker = "&url="
    pos = link.find(marker)
    if pos == -1:
        # No redirect wrapper present -- treat the link as already direct.
        return link
    return link[pos + len(marker):]

@staticmethod
def gNews_get_summary(description):
    """Pull the short summary text out of a Google News item description.

    The description is an HTML fragment: a <div class="lh"> containing
    several <font size="-1"> elements; the second of those holds the
    summary snippet.
    NOTE(review): tightly coupled to Google News's markup -- raises
    IndexError if that structure ever changes. Confirm against a live
    feed before relying on it.
    """
    soup = BeautifulSoup(description)
    story_div = soup.findAll('div', {'class': 'lh'})[0]
    small_fonts = story_div.findAll('font', {'size': '-1'})
    return small_fonts[1].contents[0]

@staticmethod
def gNews_get_link(description):
    """Return the href of the anchor in the description's last font block.

    Navigates the Google News description HTML: the <div class="lh">
    wrapper, then the final <font size="-1"> element, whose <a> tag
    carries the link this function extracts.
    NOTE(review): assumes Google News's exact markup; an AttributeError
    or IndexError here means the feed format changed.
    """
    soup = BeautifulSoup(description)
    story_div = soup.findAll('div', {'class': 'lh'})[0]
    small_fonts = story_div.findAll('font', {'size': '-1'})
    return small_fonts[-1].a['href']

def getNews(self):
"""Download news stories and put them in the shared list"""
if self.verbose:
Expand All @@ -55,11 +65,11 @@ def getNews(self):
print "[INFO] RSS Thread: Parsing story {0}.".format(entry["title"])
news_story = {}
news_story["title"] = RssFetcher.gNews_title_fix(entry["title"])
news_story["link"] = RssFetcher.gNews_get_link(entry["link"])
news_story["link_main_story"] = RssFetcher.gNews_get_link_main_story(entry["link"])
news_story["link"] = RssFetcher.gNews_get_link(entry["description"])
news_story["summary"] = RssFetcher.gNews_get_summary(entry["description"])
news_story["date"] = parser.parse(entry["updated"])
news_story["keywords"] = self.extractor.getKeywordsByURL(news_story["link"])
if self.verbose:
print " Adding keywords: {0}.".format(news_story["keywords"])
news_stories.append(news_story)

if self.verbose:
Expand Down

0 comments on commit 5bcbd51

Please sign in to comment.