In [1]:
# preliminaries

from collections import *
from dataframe import *

import os
import re
import nltk

from nltk.corpus import stopwords

In [2]:
# Language lookup table: maps a lowercase English contraction to its
# expanded form. Insertion order is significant for anything that builds a
# regex alternation from the keys, so keep new entries appended at the end.
knowledge = {
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "couldn't": "could not",
    "mustn't": "must not",
    "can't": "can not",
    "won't": "will not",
    "wouldn't": "would not",
    "i'm": "i am",
    "it's": "it is",
    "let's": "let us",
}


In [3]:
# base
class TSBase(object):
    """
    Base class that turns a set of documents into a normalised corpus
    (the "world") plus word lists and term-frequency tables.

    Relies on the module-level ``knowledge`` contraction table and on the
    ``nltk`` / ``stopwords`` / ``re`` imports made at the top of the notebook.
    """

    def build_sentence_db(self, document_set):
        """
        Build the document table, sentence table, normalised world text,
        token lists and term frequencies.

        document_set is set or list of (review_id, review_text) tuple.
        Any iterable works: it is materialised once into a list.

        After the call the following accessors are populated:
        documents(), sentences(), world(), world_words(),
        world_filtered_words(), world_tf(), world_filtered_tf().
        """
        # Internal document mapping: the list index is the internal document id.
        idm = list(document_set)
        # Internal sentence mapping: (internal document id, sentence); the list
        # index is the internal sentence id.
        ism = [
            (doc_id, sentence)
            for doc_id, doc in enumerate(idm)
            for sentence in nltk.sent_tokenize(doc[1])
        ]
        # Keep both tables instead of discarding them after construction.
        self._documents = idm
        self._sentences = ism

        # World = all document texts joined, whitespace-collapsed, lowercased.
        self._world = " ".join([text[1] for text in idm])
        self._world = " ".join(self._world.split())
        self._world = self._world.lower()

        # Expand contractions using the global static lookup table, which must
        # be present. Keys are escaped so they are always matched literally,
        # and an empty table is skipped (an empty alternation would match the
        # empty string and fail the lookup).
        if knowledge:
            replace_contraction = re.compile(
                r'\b(' + '|'.join(re.escape(key) for key in knowledge) + r')\b'
            )
            self._world = replace_contraction.sub(
                lambda match: knowledge[match.group()], self._world
            )

        # Caveat: the original tokenizer pattern also includes the alternative
        #     | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
        # which is left out here, so punctuation never becomes a token.
        #
        # NOTE(review): alternatives are tried left to right, and the text is
        # already lowercased at this point, so:
        #   * the abbreviation branch ([A-Z]\.) cannot match;
        #   * the currency/percentage branch is only reached for tokens that
        #     start with '$' (e.g. "82%" is consumed as "82" by the word branch);
        #   * the final apostrophe branch is shadowed by the hyphen-word branch,
        #     so "hotel's" is split into "hotel" and "s".
        # The pattern is left untouched because changing it would change the
        # resulting term frequencies -- confirm the intended tokenisation.
        pattern = r'''(?x)          # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.\.\.              # ellipsis
          | \w+(?:'\w+)*        # words that have ' in between
        '''
        self._world_words = nltk.regexp_tokenize(self.world(), pattern)

        # Load the stop-word list once and use a set for membership tests,
        # rather than re-reading the corpus list for every single token.
        english_stopwords = set(stopwords.words('english'))
        self._world_filtered_words = [
            word for word in self._world_words if word not in english_stopwords
        ]

        # world term frequency
        self.build_world_tf()

    def documents(self):
        """Return the list of input documents; the index is the internal document id."""
        return self._documents

    def sentences(self):
        """Return the list of (internal document id, sentence) tuples."""
        return self._sentences

    def world(self):
        """Return the normalised (whitespace-collapsed, lowercased, contraction-expanded) corpus text."""
        return self._world

    def world_words(self):
        """Return every token of the world text, in order of appearance."""
        return self._world_words

    def world_filtered_words(self):
        """Return the world tokens with English stop words removed."""
        return self._world_filtered_words

    def world_tf(self):
        """Return the term-frequency Counter over all world tokens."""
        return self._world_tf

    def world_filtered_tf(self):
        """Return the term-frequency Counter over the stop-word-filtered tokens."""
        return self._world_filtered_tf

    def build_world_tf(self):
        """Compute term frequencies for both the full and the filtered token lists."""
        from collections import Counter
        self._world_tf = Counter(self.world_words())
        self._world_filtered_tf = Counter(self.world_filtered_words())
    
    
    

# test for base
# Smoke test: load up to six documents from disk, build the TSBase tables and
# print the stop-word-filtered term frequencies.
DOC_PREFIX = 'dataset/text/documents'

# Check the directory that is actually read, not just its top-level parent.
if not os.path.isdir(DOC_PREFIX):
    print('fak, fix it')
else:
    import codecs

    # os.listdir() returns entries in arbitrary order; sort first so the same
    # six documents are selected on every run and on every machine.
    txts = sorted(os.listdir(DOC_PREFIX))[:6]
    docs = deque()
    for t in txts:
        with codecs.open(os.path.join(DOC_PREFIX, t), 'r', 'utf-8') as f:
            raw = f.read()
        # The document id is the file name without its extension.
        doc_id = os.path.splitext(t)[0]
        docs.append((doc_id, raw))

    tsbase = TSBase()
    tsbase.build_sentence_db(docs)
    print(tsbase.world_filtered_tf())


Counter({'stayed': 4, 'hotel': 4, 'bar': 4, 'location': 4, 'river': 4, 'one': 3, 'food': 3, 'breakfast': 3, 'staff': 3, 'could': 3, 'nice': 3, 'great': 3, 'clean': 3, 'amazing': 2, 'good': 2, 'last': 2, 'christmas': 2, 'experience': 2, 'time': 2, 'place': 2, 'want': 2, 'five': 2, 'minute': 2, 'walk': 2, 'lee': 2, 'excellent': 2, 'city': 2, 'rooms': 2, 'price': 2, 'room': 2, 'perfect': 2, 'dublin': 2, 'night': 1, 'sisters': 1, 'dinner': 1, 'included': 1, 'massage': 1, 'spa': 1, 'heaven': 1, 'def': 1, 'recommend': 1, 'stay': 1, 'back': 1, 'enough': 1, 'help': 1, 'us': 1, 'spotless': 1, 'blessed': 1, 'weather': 1, 'sit': 1, 'outside': 1, 'lovely': 1, 'gardens': 1, 'girlfriend': 1, 'got': 1, 'engaged': 1, 'shelbourne': 1, '2012': 1, 'visited': 1, 'suites': 1, 'life': 1, 'everything': 1, 'class': 1, 'relaxed': 1, 'pint': 1, 'glass': 1, 'wine': 1, 'head': 1, 'horse': 1, 'shoe': 1, 'whilst': 1, 'front': 1, 'little': 1, 'market': 1, 'sophisticated': 1, 'watch': 1, 'celebs': 1, 'also': 1, 'beau

In [4]:

# centrality of a sentence := \sum_{w in sentence} qualified(w), where qualified(w) = 1 if w is in the pseudo-document, else 0

In [5]:


# Sanity check of the verbose tokenizer regex on a sample sentence.
# Unlike the pattern inside TSBase.build_sentence_db, this variant ends with
# the punctuation alternative instead of the apostrophe-word alternative, and
# the sample text is not lowercased first, so the abbreviation branch can match.
# The value of the last expression is what the notebook displays for this cell:
# ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']
text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
nltk.regexp_tokenize(text, pattern)


['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']