In [175]:
from bs4 import BeautifulSoup
import urllib, csv, re
import pandas as pd
import numpy as np

import itertools
from nltk.corpus import stopwords, wordnet
english_stopwords = stopwords.words('english')

from nltk import PorterStemmer
stem = PorterStemmer()

from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [176]:
# Input: CSV-style text file; the first field of each row is a page URL to crawl.
urlFile = 'TextFile/storyURL.txt' # read input
# Output: term-document matrix written as CSV (one row per crawled document).
termdoc_File = 'TextFile/Term_Document.csv'
# Output: category label of each crawled document, written as CSV.
target_File = 'TextFile/Label_category.csv'


### Extract the contents of a story/blog page (title extraction is currently commented out)

In [177]:
def getContents(soup):
    """Return the readable text of a parsed page section.

    The section is serialised to a string, the markup of every unwanted
    element is cut out of that string, and the remainder is re-parsed so that
    only the desired text tags are read.

    Parameters
    ----------
    soup : bs4 element
        Section to extract text from. It is only searched and serialised.

    Returns
    -------
    str
        Text of every kept tag, each followed by a single space. Tags are
        grouped by tag name in the order of ``txtTags`` (not document order).
        ``''`` when no tag qualifies.
    """
    ignoreTags = ('a', 'script', 'noscript', 'em', 'iframe')  # unwanted tag section
    txtTags = ('p', 'h1', 'h2', 'h3', 'h4', 'th', 'li')       # desired tag section

    # Remove unwanted elements by deleting their serialised markup.
    markup = str(soup)
    for tg in ignoreTags:
        if tg == 'a':
            # Only script-driven links are dropped; ordinary links keep their text.
            unwanted = soup.find_all(tg, href='javascript:void(0);')
        else:
            unwanted = soup.find_all(tg)
        for node in unwanted:
            markup = markup.replace(str(node), "")

    newsec = BeautifulSoup(markup, 'lxml')
    pieces = []
    for tg in txtTags:
        if tg == 'p':
            # 'p' tags with no class attribute; the last one is dropped
            # (Twitter blurb according to the original author).
            kept = newsec.find_all(tg, {"class": None})[:-1]
        else:
            # Skip text that sits on buttons. The former filter was the regex
            # '[^(button)]', which is a negated character class (any class
            # containing a character other than ( b u t o n ) matched) rather
            # than a "class is not button" test. Every tag is now kept,
            # including ones without a class, unless a class mentions 'button'.
            kept = [node for node in newsec.find_all(tg)
                    if not _hasButtonClass(node)]
        pieces.extend(node.get_text() for node in kept)

    return ''.join(piece + ' ' for piece in pieces)


def _hasButtonClass(tag):
    """Return True when any class name of ``tag`` contains 'button'."""
    classes = tag.get('class') or []
    if not isinstance(classes, (list, tuple)):
        classes = [classes]  # some parsers expose class as a single string
    return any('button' in name.lower() for name in classes)

# def getTitle(soup):
#     head = soup.find('h1')
#     pageTitle = head.get_text()
#     return pageTitle

### parse Non-Printable Blog page 

In [178]:
def parseNonPrintBlog(soup): # this is for PageType = 'blog'
    """Return the text of a blog page, or '' when the post container is absent.

    Only the first element matched by the blog-post selector is read; its
    text is produced by getContents().
    """
    posts = soup.select("div[id=blog-content] div[id=blog-left-pane] div[class=blog-post-content]")
    if not posts:
        return ''
    # single page: only the first matching container is used
    return getContents(posts[0])

### parse Printable Story page

In [179]:
def parsePrintStory(soup): # this is for PageType = 'story'
    """Return the text of a printable story page.

    A slideshow story yields the concatenated text of all its slides. If
    there are no slides, the first single-page story container is used.
    '' is returned when neither layout is found.
    """
    # multi-page story rendered as a slideshow
    slides = soup.select("div[class=slideshow-text]")
    if slides:
        return ''.join(getContents(slide) for slide in slides)

    # single-page story
    containers = soup.select("div[class*=storypage] div[class^=bgWhite]")
    if containers:
        return getContents(containers[0])
    return ''

### Parse the page metadata to get the target label 'Category' ('Adkeyword' is not extracted by the code below)

In [180]:
def extractTarget(soup):
    """Return the normalised page category, or '' when none is present.

    The category is read from a ``PageManager.PageMetaData.Add('Category','...')``
    call found in the text of the document head. The captured value is
    stripped, lemmatized with ``wnl`` and stemmed with ``stem``.

    Parameters
    ----------
    soup : parsed document exposing a ``head`` attribute.

    Returns
    -------
    str
        The processed category. ``''`` (after printing 'No category found')
        when the document has no head or no category declaration.

    Errors raised by the lemmatizer or stemmer are no longer swallowed, so a
    tooling failure cannot be mistaken for a page without a category.
    """
    patternCategory = re.compile(
        r"PageManager\.PageMetaData\.Add\('Category','([\w, %\./\-]*)'\)", re.I)

    # A document without <head> used to raise AttributeError here.
    head = getattr(soup, 'head', None)
    parag = head.find(text=patternCategory) if head is not None else None
    m = patternCategory.search(parag) if parag is not None else None
    if not m:
        # Call form of print works in both Python 2 and Python 3.
        print('No category found')
        return ''

    category = wnl.lemmatize(m.group(1).strip())
    return stem.stem_word(category)


### Replace Contraction

In [181]:
# (regex, replacement) pairs applied in order; specific contractions come
# before the generic suffix rules so that "won't" is not split as "wo" + "n't".
# Every pattern is case-insensitive because the replacer runs on text that has
# not been lower-cased yet: without the flag "Won't" fell through to the
# generic n't rule and became "Wo not".
replacement_patterns = [
    (r"(?i)won't", 'will not'),
    (r"(?i)can't", 'cannot'),
    (r"(?i)I'm", 'I am'),
    (r"(?i)ain't", 'is not'),
    (r"(?i)(\w+)'ll", r'\g<1> will'),
    (r"(?i)(\w+)n't", r'\g<1> not'),
    (r"(?i)(\w+)'ve", r'\g<1> have'),
    (r"(?i)(\w+)'s", r'\g<1> is'),
    (r"(?i)(\w+)'re", r'\g<1> are'),
    (r"(?i)(\w+)'d", r'\g<1> would'),
]


class RegexpReplacer(object):
    """Expand contractions by applying a list of regex substitutions in order."""

    def __init__(self, patterns=replacement_patterns):
        """Compile ``patterns``, an iterable of (regex, replacement) pairs."""
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return ``text`` with every pattern substituted, in list order."""
        s = text
        for (pattern, repl) in self.patterns:
            s = pattern.sub(repl, s)
        return s


replaceContract = RegexpReplacer()

### Tokenizing

In [182]:
# Add bigram Tokens
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return the input words followed by their ``n`` best bigrams.

    Bigrams are ranked with ``score_fn`` by a BigramCollocationFinder built
    from ``words``; each bigram is appended as the item the finder returns.
    """
    finder = BigramCollocationFinder.from_words(words)
    best_bigrams = finder.nbest(score_fn, n)
    feats = list(words)
    feats.extend(best_bigrams)
    return feats

num_string = 'NUMVAR'  # placeholder token for a number-normalisation step that is not enabled
def stopwd_bigram_stem(content):
    """Turn raw text into a space-separated string of processed terms.

    Steps, in order: lower-case and tokenize, drop stop words, lemmatize,
    strip the dots from acronym-like tokens, append the best bigrams
    (one per three tokens), stem the single-word tokens.

    Parameters
    ----------
    content : str
        Raw document text.

    Returns
    -------
    str
        Terms joined by single spaces. A bigram is written as
        ``first__second``. ``''`` for empty input.
    """
    stop_set = set(english_stopwords)  # set lookup instead of a list scan per token
    tokens = word_tokenize(content.lower())

    # Remove stop words BEFORE lemmatizing. The default (noun) lemmatizer can
    # rewrite stop words into non-stop-words (e.g. 'was' -> 'wa'), which then
    # slipped past a filter applied only after lemmatization.
    tokens = [word for word in tokens if word not in stop_set]

    # Lemmatization
    tokens = [wnl.lemmatize(word.strip()) for word in tokens]

    # Second pass keeps the original guarantee that no lemmatized form which
    # is itself a stop word survives.
    tokens = [word for word in tokens if word not in stop_set]

    # remove '.' in Acronym e.g. U.S -> US
    tokens = [re.sub(r'\.', '', word) if re.search(r'([a-zA-Z]\.)+', word) else word
              for word in tokens]

    # Add bigram Tokens
    tokens = bigram_word_feats(tokens, n=len(tokens) // 3)

    docTerm = []
    for tk in tokens:
        if isinstance(tk, tuple):
            # Bigram: joined as-is. Previously the tuple was handed to
            # stem.stem_word(), which appears to return 2-item inputs
            # untouched, so bigram members were never stemmed; that output is
            # preserved here without relying on the stemmer accepting tuples.
            docTerm.append('__'.join(tk))
        else:
            # stemming
            docTerm.append(stem.stem_word(tk))

    return ' '.join(docTerm)

### Read story URL File

In [183]:
# Crawl every URL listed in urlFile and build two parallel lists:
#   docList[i]          - preprocessed text of document i
#   tagCategory_List[i] - category label of document i ('' when unknown)
docList = list()
tagCategory_List = list()
with open(urlFile, 'rb') as urlHandle:
    urlReader = csv.reader(urlHandle, delimiter = ',', quotechar = "'")
    for rnum, row in enumerate(urlReader):
        # Blank rows used to raise IndexError on row[0].
        if not row or not row[0].strip():
            continue

        url = row[0].strip()
        # use print_url to de-paginate multi-page stories
        print_url = re.sub('http://www.bankrate.com',
                           'http://www.bankrate.com/system/util/print.aspx?p=', url)

        # Reset per document. Without this a page that could not be fetched
        # inherited the previous document's label (or raised NameError on the
        # first row).
        content = ''
        tag_Category = ''

        response = urllib.urlopen(print_url)
        if response.code == 200:   # printed-version story
            html = response.read()
            bsObj = BeautifulSoup(html, 'lxml')
            content = parsePrintStory(bsObj.body)                # extract page contents
            # The category is read from the regular (non-print) page.
            bsoup = BeautifulSoup(urllib.urlopen(url).read(), 'lxml')
            tag_Category = extractTarget(bsoup) or ''            # extract page category
        else:                      # non-printed version blog
            requests = urllib.urlopen(url)
            if requests.code == 200:
                html = requests.read()
                bsObj = BeautifulSoup(html, 'lxml')
                content = parseNonPrintBlog(bsObj.body)          # extract page contents
                tag_Category = extractTarget(bsObj) or ''        # extract page category

        content = replaceContract.replace(content)  # replace contraction, e.g. I'll->I will
        content_ps = stopwd_bigram_stem(content)    # preprocessing stopword, bigram, stemming

        docList.append(content_ps)
        tagCategory_List.append(tag_Category)




No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found
No category found


### Term - Document matrix

In [184]:
# TF-IDF weighted term-document matrix. Terms present in more than 95% of the
# documents or in fewer than 2 documents are dropped. (CountVectorizer with the
# same arguments was the raw-count alternative tried earlier.)
docTovect = TfidfVectorizer(max_df=0.95, min_df=2, strip_accents='ascii')

docTovect_fit = docTovect.fit(docList)
term_document = docTovect_fit.transform(docList)

# vocabulary_ maps term -> column index. Passing the dict itself as `columns`
# labelled the columns in dict iteration order, which does not follow the
# matrix column order. Order the terms by their column index instead.
feature_names = sorted(docTovect.vocabulary_, key=docTovect.vocabulary_.get)
term_document_df = pd.DataFrame(term_document.toarray(), columns=feature_names)
tag_Category_df = pd.Series(np.asarray(tagCategory_List))

In [187]:
# Save the term-document matrix and the labels as CSV. The `with` block closes
# both files even when writing fails part-way.
with open(termdoc_File, 'wb') as Handle1, open(target_File, 'wb') as Handle2:
    term_document_df.to_csv(Handle1, index = False)
    tag_Category_df.to_csv(Handle2, index = False)

In [186]:
# Debugging scratch cell. The commented-out lines replay the tag-stripping
# step of getContents() on the last parsed page (bsObj) for manual inspection.
# soup = bsObj.body
# ignoreTags = ('a','script','noscript','em', 'iframe') # unwanted tag section
# newsec = ''
# subtmp = str(soup)
# for tg in ignoreTags:
#     if tg == 'a': sec_del = soup.find_all(tg, href='javascript:void(0);')
#     else: sec_del = soup.find_all(tg)
#     for sd in sec_del:
#         subtmp = subtmp.replace(str(sd), "")

# # print soup
# print subtmp
# # sec_del = soup.find_all('a', href='javascript:void(0);')
# # print subtmp.replace(str(sec_del[0]), "")
# Show the last URL handled by the crawl loop (relies on `url` still being
# defined from that cell).
url

'http://www.bankrate.com/finance/savings/tips/2014.aspx'