In [None]:
# Progress marker: announces the start of the sanitize stage in the run log.
print("BEGIN sanitize")

In [None]:
# Process requirement: a non-empty database must already be loaded before this
# notebook runs; every later cell indexes into it.
assert len(database) > 0

# IMPORTS

In [None]:
#import os.path                    # os.path.exists('mydirectory/myfile.txt')
#import pickle                     # load,save files
import csv                        # load,save files

import time                       # performance/completion feedback
import re                         # regex for extracting alpha-numeric terms (not whitespace or punctuation)
from collections import Counter   # 
import gensim                     # ngramming
import nltk                       # parts-of-speech tagging, filter out prepositions (IN) and adverbs (RB*)
                                  # and keep only: Left(POS,2) = NN (noun), left(POS,2) = JJ (adjective),
                                  # left(POS,2) = VB (verb)

In [None]:
# Fetch the NLTK resources used by this notebook: the tagger model backs the
# nltk.pos_tag() calls in the parts-of-speech filter, the stopwords corpus
# feeds the token filter.
# NOTE(review): the tagger download uses NLTK's default location while the
# stopwords go to USERARG_NLTKDATADIR -- confirm the asymmetry is intentional.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords', download_dir=USERARG_NLTKDATADIR)
from nltk.corpus import stopwords
# The English stopword list is addressed by an explicit path below the download
# directory. NOTE(review): presumably this sidesteps NLTK's data search path -- verify.
stop_words = stopwords.words(USERARG_NLTKDATADIR + '/corpora/stopwords/english')
# Additional words to drop alongside the standard stopwords.
stop_words.extend(['based', 'paper', 'using', 'proposed'])

# Suppress DeprecationWarning messages from here on.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# DECLARATIONS & CONSTANTS

In [None]:
module_words = [] # working word list derived from the documents; intermediate version of 'words'
words        = [] # placeholder for the final word/count collection of the document set.
                  # It is only populated (as a Counter) at the tail end of this script,
                  # so that this critical piece of data stays separate from the
                  # intermediate/mutating word lists. Starts out as an empty list.

iTime = 0     # start timestamp of the current step; used to report execution time

# HELPER FUNCTIONS

In [None]:
def sanitize_filename():
    """Build the cache-file stem for the current run configuration.

    The stem is 'data/' + USERARG_DATABASEFILE followed by one '-tag=value'
    segment per user argument that influences the sanitized output, so runs
    with different settings map to different file names.

    Returns:
        str: the file path without extension.
    """
    stem = 'data/' + USERARG_DATABASEFILE
    tagged_settings = (
        ('-recs=', USERARG_RECORD_LIMITER),
        ('-wrds=', USERARG_FILTER_WORDS),
        ('-posf=', USERARG_FILTER_POS),
        ('-ngmn=', USERARG_NGRAM_MINCOUNT),
        ('-ngtr=', USERARG_NGRAM_THRESHOLD),
    )
    return stem + ''.join(tag + str(value) for tag, value in tagged_settings)

In [None]:
# TODO THESE METHODS ARE USED IN OTHER IPYs. BUNDLE THEM IN A CLASS AND REUSE

def flat_list_of_wordlists():
    """Return every word of every document as one flat list.

    Words appear in document order, and within a document in word-list order;
    duplicates are kept.
    """
    flattened = []
    for wordlist in get_list_of_wordlists():
        flattened.extend(wordlist)
    return flattened


def get_list_of_wordlists():
    """Collect the per-document word-lists from the database.

    Returns:
        list: one entry per database record, each being that record's
        FIELD_BzNDX_ABSTRACT_WORDS value. The outer list is new, but the
        entries are the very objects stored in the records (not copies).
    """
    print("\tCompiling list of word-lists from database[FIELD_BzNDX_ABSTRACT_WORDS]...")
    # No timer is started here: a local assignment would not touch the
    # module-level iTime the calling cells use for their timing reports.
    return [record[FIELD_BzNDX_ABSTRACT_WORDS] for record in database]


In [None]:
def filter_db_docwordlist_using_words(WORDS, MSG):
    """Prune every record's word-list down to the words contained in WORDS.

    database[i][FIELD_BzNDX_ABSTRACT_WORDS] is replaced, for every record, by a
    new list holding only the words found in WORDS (order and duplicates are
    preserved). Progress, the resulting corpus size and the elapsed time are
    printed.

    Args:
        WORDS: collection of (hashable) words to retain.
        MSG: text appended to the progress message.

    Returns:
        None
    """
    print("\tFiltering abstract-word-list using words-list",MSG)
    iTime = time.time()

    # A set makes each membership test O(1); callers may pass long lists that
    # contain every occurrence of every word.
    allowed = frozenset(WORDS)

    _before = 0
    _count = 0
    for record in database:
        current = record[FIELD_BzNDX_ABSTRACT_WORDS]
        _before += len(current)
        tokens = [word for word in current if word in allowed]
        record[FIELD_BzNDX_ABSTRACT_WORDS] = tokens
        _count += len(tokens)

    # Compare corpus-wide totals: checking only the first record would miss
    # reductions in later records and fail outright on an empty database.
    if _count < _before:
        print("\t\tConfirmed reduction in document word-list")
    else:
        print("\t\tWARN: Unable to confirm reduction in document word-list")
    print("\t\t",_count,"words currently in corpus")

    print("\tCompleted in", round(time.time() - iTime,3),"seconds")
    return None

In [None]:
def ngrammer(LIST_OF_WORDLISTS):
    """Run one phrase-detection pass over a list of word-lists.

    A gensim Phrases model is trained on LIST_OF_WORDLISTS using
    USERARG_NGRAM_MINCOUNT and USERARG_NGRAM_THRESHOLD, then applied to each
    word-list in turn.

    Args:
        LIST_OF_WORDLISTS: list of word-lists, one per document.

    Returns:
        list: shaped like db[FIELD_BzNDX_ABSTRACT_WORDS] across all documents,
        but with the detected ngrams included.
    """
    print("\tBuilding N-gram models...")
    phrase_model = gensim.models.Phrases(
        LIST_OF_WORDLISTS,
        min_count=USERARG_NGRAM_MINCOUNT,
        threshold=USERARG_NGRAM_THRESHOLD,
    )

    if USERARG_VERBOSE == 1:
        print("\tExample of a ngram:", phrase_model)

    # Apply the trained model document by document.
    ngrammed_wordlists = []
    for wordlist in LIST_OF_WORDLISTS:
        ngrammed_wordlists.append(phrase_model[wordlist])
    return ngrammed_wordlists


# SANITIZE: PRUNE NONWORDS AND NON-{NN,JJ,VB}

In [None]:
print("\tExtracting wordlists from docs and storing in database[FIELD_BzNDX_ABSTRACT_WORDS]...")
iTime = time.time()

# CONSTANTS --- DO NOT FILTER THE WORD EMPTY_DOCUMENT_VALUE
COMMON_WORDS = ["are","was","has","can","been","the","thus","therefore","be","to","of","and","a","in","that","have","I","it","for","not","on","with","he","as","you","do","at","this","but","his","by","from","they","we","say","her","she","or","an","will","my","one","all","would","there","their","what","so","up","out","if","about","who","get","which","go","me","when","make","can","like","time","no","just","him","know","take","person","into","year","your","good","some","could","them","see","other","than","then","now","look","only","come","its","over","think","also","back","after","use","two","how","our","work","first","well","way","even","new","want","because","any","these","give","day","most","us"]  # ~11% of corpus. REF: https://www.englishclub.com/vocabulary/common-words-100.htm

# One combined set gives O(1) membership tests in the per-token filter below.
_rejected_words = set(COMMON_WORDS).union(stop_words)
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
# split() breaks on 1 char, but this regex splits on all punctuation or whitespace.
_word_pattern = re.compile(r"[\w']+")

# -- extract list of words --
module_words = [] # needed here as well so this cell is re-runnable
tokens = []
for j in range(len(database)):
    tokens = _word_pattern.findall(database[j][FIELD_BzNDX_ABSTRACT].lower())

    # rather than multiple passes, clean, filter short, common and non-words now
    tokens = [x for x in tokens if x.isalpha() and len(x) > 2 and x not in _rejected_words]

    # save word list to database.records to prevent multi passes:
    # append the field on the first run, overwrite it on a re-run
    if len(database[j]) == FIELD_BzNDX_ABSTRACT_WORDS:
        database[j].append(tokens)
    else:
        database[j][FIELD_BzNDX_ABSTRACT_WORDS] = tokens

    # compile the corpus-wide word list (tokens are already longer than 2 chars)
    module_words.extend(tokens)

# Release the cell's temporaries. The `re` module and `stop_words` are left
# intact on purpose: both are created in earlier cells, and clearing them
# would make this cell fail when it is run a second time.
tokens = None
COMMON_WORDS = None
_rejected_words = None
_word_pattern = None

print("\tCompleted in", round(time.time() - iTime,3),"seconds")

# FWD: words to POS-filter

In [None]:
# [OPTIONAL]
# Apply the parts-of-speech filter to the words list and then prune the
# word-lists in database[FIELD_BzNDX_ABSTRACT_WORDS].
# The POS filter needs to be applied before the frequency filter so that
# high-frequency words of unwanted types cannot push keywords out.

if USERARG_FILTER_POS==1:
    iTime = time.time()
    print("\tApplying parts-of-speech filter...")

    # Every word is tagged on its own (as a one-word sentence), so its tag
    # depends on the word alone. Tag each distinct word once and reuse the
    # verdict for all of its occurrences instead of re-tagging duplicates.
    _keep_word = {}
    for _word in set(module_words):
        pos_tag = nltk.pos_tag([_word])[0][1]
        # keep nouns (NN*), adjectives (JJ*) and verbs (VB*); KEYWORDS are
        # retained whatever their tag
        _keep_word[_word] = pos_tag[0:2] in ("NN","JJ","VB") or _word in KEYWORDS
    module_words[:] = [x for x in module_words if _keep_word[x]]

    # Only membership matters for the database filter, so hand it the distinct
    # words rather than the full list with repeats.
    filter_db_docwordlist_using_words(set(module_words), "after PoS filtering...")

    print("\t",len(module_words), "words in database")
    print("\tCompleted in", round(time.time() - iTime,3),"seconds")
    _keep_word = None
    _word = None

words = None # obsolete at this pt, will be recreated from abstract-words later
pos_tag = None

# FWD: None

# AGGREGATE: WORDS TO OFT-REPEATED PHRASES (N-GRAMS)

In [None]:
print("\tNOTE: N-grams are underscore delimited words that tend to appear together")
print("\tNOTE: Repeating this process N-times yields an N-gram")
iTime = time.time()

# Each pass must work on the output of the previous one; re-reading the
# unchanged database every time would merely rebuild the same bigrams N times.
# Starting from the current word-lists also keeps the cell valid when
# USERARG_NGRAM_N is 0 (the word-lists are then stored back unchanged).
ngrams = get_list_of_wordlists()
for i in range(USERARG_NGRAM_N):
    ngrams = ngrammer(ngrams)

print("\tReplacing database[FIELD_BzNDX_ABSTRACT_WORDS] with ngrammed word-list...")
for i in range(len(database)):
    database[i][FIELD_BzNDX_ABSTRACT_WORDS] = ngrams[i]

# The n-gram statistics are reported by the next cell, from the updated database.
ngrams = None

print("\tCompleted in", round(time.time() - iTime,3),"seconds")

# FWD: None

In [None]:
print("\tCompiling updated words list...")
iTime = time.time()

# Rebuild the flat word list from the per-document word-lists in the database.
module_words = flat_list_of_wordlists()

print("\tNgram report list...")
# An n-gram is recognised by the underscore joining its parts.
_ngrams = []
for w in module_words:
    if "_" in w:
        _ngrams.append(w)
print("\t", len(set(_ngrams)), "distinct ngrams found a total of ",len(_ngrams),"times")
print("\tNGrams: ", _ngrams)
_ngrams = None
w = None

print("\tCompleted in", round(time.time() - iTime,3),"seconds")

# FWD: module_words (list), used by frequency-filter

# PRUNE: FILTER BY FREQUENCY

In [None]:
print("\tAnalyzing word-frequency...")
iTime = time.time()

# The Counter maps every distinct word to its number of occurrences.
word_counts = Counter(module_words)
print("\t",len(word_counts), "distinct words")

# Counter.most_common(n) returns a list of (word, count) tuples rather than a
# Counter, so the counts are stripped to leave the retained vocabulary. The
# final 'words' Counter is rebuilt from the database in a later cell.
if USERARG_FILTER_WORDS!=0:
    print("\tApplying word-frequency filter...")
    print("\t",USERARG_FILTER_WORDS, "most_common words retained")
    module_words = [w[0] for w in word_counts.most_common(USERARG_FILTER_WORDS)]
    #-- CRITICAL STEP --
    # now filter the word-lists in the database using the frequency-filtered,
    # distinct vocabulary. It is passed as a set so that every membership test
    # is O(1) instead of a scan through the list.
    filter_db_docwordlist_using_words(set(module_words), "after filtering on frequency...")

module_words = None
word_counts = None
w = None
print("\tCompleted in", round(time.time() - iTime,3),"seconds")

# FWD: None

# SAVE 'WORDS' AND SANITIZED-DB

CRITICAL: 
TO ALLOW ALL DATABASES TO BE CACHED AND USED/SHARED, SAVE ALL
PREPROCESSED RESULTS TO FILES: {database, words}

In [None]:
print("\tConverting words-dictionary into Counter object...")
iTime = time.time()

# Last step: counting straight from the filtered database guarantees that the
# dictionary used for featurization holds only words that survived sanitizing.
words = Counter()
words.update(flat_list_of_wordlists())
assert len(words) > 0

print("\tCompleted in", round(time.time() - iTime,3),"seconds")

In [None]:
# word/count dictionary
#print("\tINFO\TYPE: words",type(words)," example:",list(words.items())[:1]) # <key,count>
#print("\tINFO\API: words: e = list(words.elements()) or {k,v} = list(words.{keys,values}())")
#print("\tINFO\TYPE: database",type(database), "dimensions:",len(database),"x",len(database[0]))
#if USERARG_VERBOSE==1:
#    print("\tExample database record:", database[0])

_TMP_FILE = sanitize_filename()

# newline='' is required for files handed to csv.writer: the writer emits its
# own line terminators, and without it some platforms add an extra '\r'.
# All (word, count) pairs are written as the fields of one single row.
with open(_TMP_FILE + '.words', mode='w', encoding="utf-8", newline='') as fp:
    csv_writer = csv.writer(fp, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(words.items())
print('\tINFO: words dictionary cached as ' + _TMP_FILE + '.words')

# using "with" will scope the file-handle to this block (auto-closes).
# the database is a list of lists (records) and records are field-arrays.
# The csv.writer writes one record per line.
with open(_TMP_FILE + '.sandb', mode='w', encoding="utf-8", newline='') as fp:
    csv_writer = csv.writer(fp, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for record in database:
        csv_writer.writerow(record)
print('\tINFO: sanitized database cached as ' + _TMP_FILE + '.sandb')

In [None]:
# Progress marker: announces the end of the sanitize stage in the run log.
print("END sanitize")