In [1]:
# preliminaries

from collections import *
from dataframe import *

import os
import re
import nltk

from nltk.corpus import stopwords
import numpy as np

In [2]:
# language lookup table
# Maps a lower-case English contraction to its expanded form.
# Read by TSBase.build_internal when normalizing the corpus text.
knowledge = {
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "couldn't": "could not",
    "mustn't": "must not",
    "can't": "can not",
    "won't": "will not",
    "wouldn't": "would not",
    "i'm": "i am",
    "it's": "it is",
    "let's": "let us",
}


In [3]:
# base
class TSBase(object):
    """
    Holds a document collection together with its tokenized "world".

    The "world" is the text of every document joined into one lower-cased,
    whitespace-normalized string with contractions expanded. From it the class
    derives the token list, a stop-word-filtered token list, their sorted
    vocabularies, term-frequency counters and a documents x vocabulary matrix.

    Relies on the notebook-level names ``nltk``, ``stopwords``, ``np``, ``re``
    and the ``knowledge`` contraction table.
    """

    def build_internal(self, document_set):
        """
        Build all internal structures from ``document_set`` and persist them.

        document_set is a set or list of (review_id, review_text) tuples.

        Side effects: writes ``dataset/world_words.pkl`` and
        ``dataset/world_words_document_matrix.pkl`` (the ``dataset`` directory
        must already exist).
        """
        import pickle

        # internal document mapping: the list index is implicitly the internal id;
        # the same holds for _ism, which stores (internal doc id, sentence) pairs
        self._idm = list(document_set)
        self._ism = [
            (doc_id, sentence)
            for doc_id, doc in enumerate(self._idm)
            for sentence in nltk.sent_tokenize(doc[1])
        ]

        # join all texts, collapse every whitespace run to one space, lower-case
        joined = " ".join(doc[1] for doc in self._idm)
        self._world = " ".join(joined.split()).lower()

        # expand contractions using the global static lookup table; keys are
        # escaped so they are always matched literally, and an empty table is
        # skipped because its pattern would match the empty string
        if knowledge:
            replace_contraction = re.compile(
                r'\b(' + '|'.join(re.escape(key) for key in knowledge) + r')\b'
            )
            self._world = replace_contraction.sub(
                lambda match: knowledge[match.group()], self._world
            )

        # caveat: the original NLTK example pattern also includes
        #   | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
        # which is left out here, so punctuation is not emitted as tokens.
        # NOTE(review): alternatives are tried left to right, so the plain
        # \w+ alternative appears to shadow the apostrophe one, and the
        # upper-case abbreviation alternative cannot match lower-cased text.
        # Left unchanged to keep tokenization identical - confirm intent.
        pattern = r'''(?x)          # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.\.\.              # ellipsis
          | \w+(?:'\w+)*        # words that have ' in between
        '''
        self._world_words = nltk.regexp_tokenize(self.world(), pattern)

        # read the stop-word list once; a set gives constant-time membership tests
        english_stopwords = set(stopwords.words('english'))
        self._world_filtered_words = [
            word for word in self._world_words if word not in english_stopwords
        ]

        # drop vocabularies cached by an earlier build so they are recomputed
        self._world_words_set = None
        self._world_filtered_words_set = None

        # world term frequency
        self.build_world_tf()
        self.build_idf()

        # persistence
        with open(r'dataset/world_words.pkl', 'wb') as words_file:
            pickle.dump(self._world_words, words_file)

        with open(r'dataset/world_words_document_matrix.pkl', 'wb') as matrix_file:
            pickle.dump(self._world_words_document_matrix, matrix_file)

    def load_internal(self):
        """
        Reload the token list written by ``build_internal``.

        Only ``dataset/world_words.pkl`` is read; the filtered tokens, the
        counters and the matrix are not restored.
        """
        import pickle

        # NOTE: unpickling can execute arbitrary code - only load files that
        # this notebook wrote itself.
        with open(r'dataset/world_words.pkl', 'rb') as words_file:
            self._world_words = pickle.load(words_file)

        # the cached vocabulary may no longer match the reloaded tokens
        self._world_words_set = None

    def world(self):
        """Return the normalized text of the whole collection."""
        return self._world

    def world_words(self):
        """Return the list of all tokens, duplicates included."""
        return self._world_words

    def world_filtered_words(self):
        """Return the token list with English stop words removed."""
        return self._world_filtered_words

    def world_words_set(self):
        """Return the sorted list of distinct tokens, computed once and cached."""
        cached = getattr(self, '_world_words_set', None)
        if cached:
            return cached
        self._world_words_set = sorted(set(self.world_words()))
        return self._world_words_set

    def world_filtered_words_set(self):
        """Return the sorted list of distinct non-stop-word tokens, cached."""
        cached = getattr(self, '_world_filtered_words_set', None)
        if cached:
            return cached
        self._world_filtered_words_set = sorted(set(self.world_filtered_words()))
        return self._world_filtered_words_set

    def world_tf(self):
        """Return the Counter of occurrences for every token."""
        return self._world_tf

    def world_filtered_tf(self):
        """Return the Counter of occurrences for every non-stop-word token."""
        return self._world_filtered_tf

    def build_world_tf(self):
        """
        Build the world's word hash: one Counter for all tokens and one for
        the stop-word-filtered tokens.
        """
        from collections import Counter
        self._world_tf = Counter(self.world_words())
        self._world_filtered_tf = Counter(self.world_filtered_words())

    def build_idf(self):
        """
        Allocate and return the documents x vocabulary matrix.

        The matrix has one row per document and one column per distinct token
        and is zero-filled; no frequencies are written into it here.
        """
        self._world_words_document_matrix = np.zeros(
            (len(self._idm), len(self.world_words_set()))
        )
        return self._world_words_document_matrix



In [4]:
# test for base
if not os.path.exists('dataset'):
    print('fak, fix it')
else:
    DOC_PREFIX = 'dataset/text/documents/raw'
    # os.listdir returns names in arbitrary order, so sort before slicing to
    # get the same six-document sample on every run. Drop the slice to process
    # every document (caution: slow, should use parallelism to speed up).
    txts = sorted(os.listdir(DOC_PREFIX))[:6]
    docs = deque()
    for t in txts:
        with open(os.path.join(DOC_PREFIX, t), 'r') as f:
            raw = f.read()
        # the document id is the file name without its extension
        doc_id = os.path.splitext(t)[0]
        docs.append((doc_id, raw))
    # number of documents that were read
    counter = len(docs)

    tsbase = TSBase()
    tsbase.build_internal(docs)
    wtf = tsbase.world_tf()
    # size of the vocabulary (distinct tokens) across the sampled documents
    print(len(wtf.keys()))

198


In [5]:

# centrality(sentence) := \sum_{w \in sentence} qualified(w), where qualified(w) = 1 if w is in the pseudo-document, else 0

In [6]:


# Verbose-mode tokenizer pattern; its last alternative emits punctuation
# characters as separate tokens.
pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

# Sample sentence containing an abbreviation, a hyphenated word, a currency
# amount and an ellipsis.
text = 'That U.S.A. poster-print costs $12.40...'

tokens = nltk.regexp_tokenize(text, pattern)
tokens


['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']