Skip to content

Commit

Permalink
Adds short summaries and main story links from Google News
Browse files Browse the repository at this point in the history
  • Loading branch information
mckk committed Nov 4, 2011
1 parent 99de8fb commit 5bcbd51
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 6 deletions.
11 changes: 9 additions & 2 deletions analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ def add_stories_to_mongo(self, stories):
if self.verbose:
print "[INFO] Analysis Thread: Story {0} is in db. Updating date.".format(story["title"])
in_db.update({"$set": {"date": story["date"]}})
if self.verbose:
print "[INFO] Analysis Thread: Story {0} is in db. Updating keywords.".format(story["title"])
in_db.update({"$set": {"keywords": story["keywords"]}})
if self.verbose:
print "[INFO] Analysis Thread: Story {0} is in db. Updating link.".format(story["title"])
in_db.update({"$set": {"link": story["link"]}})
if self.verbose:
print "[INFO] Analysis Thread: Story {0} is in db. Updating main story link.".format(story["title"])
in_db.update({"$set": {"link_main_story": story["link_main_story"]}})

def add_new_time_period_to_stories(self, stories, start, end):
"""Goes through tweets posted between start and end and assigns them to appropriate stories"""
Expand All @@ -60,8 +69,6 @@ def add_new_time_period_to_stories(self, stories, start, end):
print "[INFO] Analysis Thread: Loading tweets from {0} to {1}.".format(start,end)
tweets_in_time_period = self.tweet_collection.find({"created_at": {"$gte": start, "$lt": end}})
for tweet in tweets_in_time_period:
if self.verbose:
print " " + tweet["text"]
for story in stories:
for keyword in story["keywords"]:
keyword_words = keyword.split()
Expand Down
18 changes: 14 additions & 4 deletions rss_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from dateutil import parser # For easily parsing strings to Date

from BeautifulSoup import BeautifulSoup # For Parsing descriptions

import keyword_extractor

import shared
Expand Down Expand Up @@ -39,10 +41,18 @@ def gNews_title_fix(title):
return title[0:dashOccurence]

@staticmethod
def gNews_get_link(link):
def gNews_get_link_main_story(link):
    """Extract the real article URL from a Google News redirect URL.

    Google News RSS entry links embed the destination article URL after
    an "&url=" query parameter; everything after that marker is the
    target URL.

    Returns the input unchanged when the marker is absent. (Previously
    a missing marker made str.find return -1, so the slice silently
    dropped the first four characters of the link.)
    """
    marker = "&url="
    pos = link.find(marker)
    if pos == -1:
        # No redirect wrapper present -- treat the link as already direct.
        return link
    return link[pos + len(marker):]

@staticmethod
def gNews_get_summary(description):
    """Pull the short summary text out of a Google News item description.

    The description is an HTML fragment: a <div class="lh"> containing
    several <font size="-1"> elements; the second of those holds the
    summary snippet.
    NOTE(review): tightly coupled to Google News's markup -- raises
    IndexError if that structure ever changes. Confirm against a live
    feed before relying on it.
    """
    soup = BeautifulSoup(description)
    story_div = soup.findAll('div', {'class': 'lh'})[0]
    small_fonts = story_div.findAll('font', {'size': '-1'})
    return small_fonts[1].contents[0]

@staticmethod
def gNews_get_link(description):
    """Return the href of the anchor in the description's last font block.

    Navigates the Google News description HTML: the <div class="lh">
    wrapper, then the final <font size="-1"> element, whose <a> tag
    carries the link this function extracts.
    NOTE(review): assumes Google News's exact markup; an AttributeError
    or IndexError here means the feed format changed.
    """
    soup = BeautifulSoup(description)
    story_div = soup.findAll('div', {'class': 'lh'})[0]
    small_fonts = story_div.findAll('font', {'size': '-1'})
    return small_fonts[-1].a['href']

def getNews(self):
"""Download news stories and put them in the shared list"""
if self.verbose:
Expand All @@ -55,11 +65,11 @@ def getNews(self):
print "[INFO] RSS Thread: Parsing story {0}.".format(entry["title"])
news_story = {}
news_story["title"] = RssFetcher.gNews_title_fix(entry["title"])
news_story["link"] = RssFetcher.gNews_get_link(entry["link"])
news_story["link_main_story"] = RssFetcher.gNews_get_link_main_story(entry["link"])
news_story["link"] = RssFetcher.gNews_get_link(entry["description"])
news_story["summary"] = RssFetcher.gNews_get_summary(entry["description"])
news_story["date"] = parser.parse(entry["updated"])
news_story["keywords"] = self.extractor.getKeywordsByURL(news_story["link"])
if self.verbose:
print " Adding keywords: {0}.".format(news_story["keywords"])
news_stories.append(news_story)

if self.verbose:
Expand Down

0 comments on commit 5bcbd51

Please sign in to comment.