In [26]:
import feedparser
import re

# To learn more about feedparser, see https://www.pythonforbeginners.com/feedparser/using-feedparser-in-python

In [30]:
#Return title and dictionary of word counts for an RSS feed
def getwordcounts(url):
    """Parse a feed and count the words used in its entries.

    Args:
        url: Location of the feed, passed unchanged to ``feedparser.parse``.

    Returns:
        A ``(title, counts)`` tuple: the feed's title and a dict mapping each
        word returned by ``getwords`` to the number of times it occurs across
        all entry titles and summaries/descriptions.
    """
    d = feedparser.parse(url)
    wc = {}

    for e in d.entries:
        # Prefer the summary and fall back to the description. An entry that
        # carries neither contributes only its title instead of raising and
        # causing the caller to discard the whole feed.
        if 'summary' in e:
            summary = e.summary
        else:
            summary = e.get('description', '')

        # Same reasoning for a missing entry title.
        text = e.get('title', '') + ' ' + summary

        for word in getwords(text):
            wc[word] = wc.get(word, 0) + 1

    # NOTE(review): the feed title is read unguarded, so a result without one
    # still raises here; the calling loop reports that as a failed feed.
    return (d.feed.title, wc)


In [31]:
def getwords(html):
    """Split a chunk of HTML into a list of lowercase words.

    Tags (anything between ``<`` and ``>``) are removed first; the remaining
    text is split on every run of characters that are not ASCII letters.

    Args:
        html: Text that may contain HTML markup.

    Returns:
        The words in order of appearance, lowercased. Empty strings produced
        by leading or trailing separators are dropped.
    """
    # Remove the markup so tag and attribute names are not counted as words.
    txt = re.sub(r'<[^>]+>', '', html)

    # Only a leading '^' negates a character class; a '^' anywhere else in the
    # class is a literal caret and would be kept inside words, so the class
    # lists just the two letter ranges.
    words = re.split(r'[^A-Za-z]+', txt)

    return [word.lower() for word in words if word]

In [34]:

# apcount: for each word, the number of blogs in which it occurs more than once.
# wordcounts: blog title -> {word: number of occurrences in that blog}.
apcount = {}
wordcounts = {}

# One feed URL per line. Strip the line terminator (it would otherwise be
# passed along as part of the URL and echoed in the error messages) and skip
# blank lines; the with-block closes the file once it has been read.
with open('feedlist.txt') as feedfile:
    feedlist = [line.strip() for line in feedfile if line.strip()]

for feedurl in feedlist:
    try:
        title, wc = getwordcounts(feedurl)
    except Exception as exc:
        # Best effort: one broken feed must not stop the run. Catching
        # Exception (not a bare except) keeps Ctrl-C working, and the reason
        # is reported so failures can be diagnosed.
        print('Failed to parse feed %s (%s: %s)'
              % (feedurl, type(exc).__name__, exc))
        continue

    wordcounts[title] = wc
    for word, count in wc.items():
        # Every seen word gets an entry, but only blogs that use the word
        # more than once are counted.
        apcount.setdefault(word, 0)
        if count > 1:
            apcount[word] += 1

Failed to parse feed http://www.topix.com//rss/news/blogs

Failed to parse feed http://gofugyourself.typepad.com/go_fug_yourself/index.rdf

Failed to parse feed https://blog.zawodny.com/feed/

Failed to parse feed http://scobleizer.wordpress.com/feed/

Failed to parse feed http://www.456bereastreet.com/feed.xml

Failed to parse feed http://feeds.dailykos.com/dailykos/index.xml

Failed to parse feed http://www.huffingtonpost.com/raw_feed_index.rdf

Failed to parse feed http://www.hyperorg.com/blogger/index.rdf

Failed to parse feed http://xml.metafilter.com/rss.xml

Failed to parse feed http://www.neilgaiman.com/journal/feed/rss.xml

Failed to parse feed http://www.perezhilton.com/index.xml

Failed to parse feed http://www.plasticbag.org/index.rdf

Failed to parse feed http://feeds.feedburner.com/Spikedhumor

Failed to parse feed http://www.techeblog.com/elephant/?mode=atom

Failed to parse feed http://www.thesuperficial.com/feed

Failed to parse feed http://feeds.gawker.com/gizmodo/ful

In [42]:
# The next step is to generate the list of words that will actually be used in
# the counts for each blog. Since words like "the" will appear in almost all of
# them, and others like "flim-flam" might only appear in one, the total number
# of words is reduced by selecting only those whose blog fraction lies strictly
# between a minimum and a maximum.
#
# The bounds applied here are 20 percent (lower) and 60 percent (upper);
# 10 percent and 50 percent are another reasonable starting point.
MIN_BLOG_FRACTION = 0.2
MAX_BLOG_FRACTION = 0.6

wordlist = []
for w, bc in apcount.items():
    # The fraction is taken over every entry of feedlist, including feeds
    # that could not be parsed.
    frac = float(bc) / len(feedlist)
    if MIN_BLOG_FRACTION < frac < MAX_BLOG_FRACTION:
        wordlist.append(w)


In [43]:
# The final step is to use the list of words and the list of blogs to create a
# text file containing a big matrix of all the word counts for each of the
# blogs: a header row with 'Blog' followed by every word in wordlist, then one
# tab-separated row per blog with its title and its count for each word
# (0 when the blog never used the word).
#
# The with-block guarantees the file is flushed and closed, even if a write
# fails part-way through.
with open('blogdata.txt', 'w') as out:
    out.write('Blog')
    for word in wordlist:
        out.write('\t%s' % word)
    out.write('\n')

    for blog, wc in wordcounts.items():
        out.write(blog)
        for word in wordlist:
            out.write('\t%d' % wc.get(word, 0))
        out.write('\n')
    