# PRE-PROCESSING TEXT FOR NATURAL LANGUAGE GENERATION

## GETTING AND LOADING A CORPUS FROM GUTENBERG

https://nlpforhackers.io/corpora/
http://lucumr.pocoo.org/2015/11/18/pythons-hidden-re-gems/

The goal of this step is to develop an initial list of each character and their spoken lines, or a cleanish list of the lines within the text. (Dictionaries are Hash value arbitrary, so may not be ordered the same. Lists are used instead.)

In [38]:
# Select and Read a file into "f" using a list and stripping out all Project Gutenberg headers and footers

from pathlib import Path
import re
data_folder = Path("data/murray_kempton/")
file_to_open = data_folder / "On_Frank_Sinatra.txt"
f = open(file_to_open, 'r')
first_document = list(f)
#print(first_document)

# Determine whether a Project Gutenberg Text
first_header_index = 0
second_header_index = 0
footer_index = 0
if any("GUTENBERG" in s for s in first_document):
    for first_header_index in range( len(first_document) ):
        if ( ( first_document[first_header_index].find('*END*THE SMALL PRINT!') ) != -1 ) :
            break
        else:
            for first_header_index in range( len(first_document) ):
                if ( ( first_document[first_header_index].find('START OF THIS PROJECT GUTENBERG') ) != -1 ) :
                    break        
        
    second_document = list(first_document[first_header_index + 1 :])

    for second_header_index in range( len(second_document) ):
        if ( ( second_document[second_header_index].find('www.gutenberg.org') ) != -1 ) :
            break            
    for footer_index in range( len(first_document) ):
        if ( ( first_document[footer_index].find('End of Project') ) != -1 ) :
            break
        else:
            for footer_index in range( len(first_document) ):
                if ( ( first_document[footer_index].find('End of the Project') ) != -1 ) :
                    break    
        
    print(first_header_index)            
    print(second_header_index)
    print(footer_index)     
    
    script = list()
    if (second_header_index < (first_header_index + 100)):
        script = list(first_document[first_header_index +1 + second_header_index +1 : footer_index-1])
    else:
        script = list(first_document[first_header_index +1 : footer_index-1])
else:
    script = first_document

#print(script)

## IDENTIFYING SPEAKERS AND PARTS FROM SCRIPTS

In [39]:
#Compile a list of speakers
r = "([A-Z]+[a-z]+[ ]+[A-Za-z_]+\\n+)|([A-Z0-9][A-Z0-9]+\\n+)"
rg = re.compile(r, re.MULTILINE)
speakers = []
for line in script:
    mtch = rg.match(line)
    if mtch:
        speakers.append(mtch.group())
#Remove all line returns
for l in range(len(speakers)):
    speakers[l] = speakers[l].replace('\n', '') 
    
print(speakers)

['Murray Kempton', 'ADVERTISING', 'Whitney Balliett']


# 11/17 START HERE

In [40]:
#Omit speakers Section Numbers and line numbers from the list of lines
s = re.compile("(\b[A-Z{3}\.]+\b)|([0-9]*\.+\\*n+\\n)")
spoken = list(filter(lambda i: not s.search(i), script))

    ##\\n
    ##.*(\'.*\')
    ##[0-9]*\.+\\*n
print(spoken)


['On Frank Sinatra (1915–1998)\n', 'Murray Kempton and Whitney Balliett JUNE 25, 1998 ISSUE\n', '1.\n', '\n', 'Murray Kempton\n', '\n', 'The following was written in December 1996, when Frank Sinatra’s retirement was announced.\n', '\n', 'Frank Sinatra ever did the fullest duty to his art, and now he is leaving us with the duty to sum him up. My betters have already done that. One day I was dealing with Ella Fitzgerald, and the subject of Sinatra came up and her intruder-mistrusting voice suddenly softened and she said, “Frank. Just this little guy telling this story. That’s all you have to be.”\n', '\n', 'In 1956, Nelson Riddle thought to employ the Hollywood String Quartet as backup for Sinatra’s “Close to You” album. The HSQ found Sinatra as demanding as Schoenberg had been six years before, when it recorded “Verklärte Nacht” and so gratified its composer that he felt himself fully defined and registered his satisfaction by writing the liner notes. Sinatra asked not a whit less than

## IDENTIFYING SENTENCES AND PHRASES

During this step, we concatenate lines in batches to allow the identification of sentences with regular expressions. Then we identify phrases with stop words.

In [41]:
#Concatenate lines into list entries for future sentence splitting

newLines = []
singleLine = ''
singleLines = []

#Remove all line returns
for j in range(0, len(spoken)):
    spoken[j] = spoken[j].replace('\n', '')
    spoken[j] = spoken[j].replace('1.', '')
    
#Split 5 lines at a time into new list
for k in range( 0, len(spoken), 3):
    newLines = []
    for line in range( 0, 3 ):
        try:
            newLines.append(' '+spoken[line+k])
        except:
            #print("Index Error at", k, line)
            break
    #Join 5-line groups into one line and append to a list
    singleLine = ''.join(newLines)
    singleLines.append(singleLine)

#print(singleLines)

## CREATING A LIST OF SENTENCES FROM A TEXT

In [None]:
#Create list of sentences
sentences = []
for m in range(0, len(singleLines)):
    mtch = re.findall("\A[A-Z]*[^\.!?]*[\.!?]", singleLines[m], re.M|re.I)
    if mtch:
        sentences.append(mtch)
print(sentences)

## CREATING A LIST OF PHRASES

In [None]:
#Clean the stopword list
stoplist = []
clean_line = []
data_folder = Path("data/")
file_to_open = data_folder / "snowball_stop.txt"
f = open(file_to_open, 'r')
full_stop = list(f)

for n in range( 0, len(full_stop), 1 ):
    clean_line = full_stop[n].split('|')
    stoplist.append(clean_line[0])

for p in range(len(stoplist)):
    stoplist[p] = stoplist[p].replace('\n', '')
    
#print(stoplist)

In [None]:
# Create list of phrases using stopwords
phrases = []
candidate_phrases = []

for q in range(len(sentences)):
    for r in sentences[q]:
        words = re.split("\\s+", r)
        previous_stop = False
 
        # Examine each word to determine if it is a phrase boundary marker or part of a phrase or alone
        for w in words:
 
            if w in stoplist and not previous_stop:
                # phrase boundary encountered, so put a hard indicator
                candidate_phrases.append(";")
                previous_stop = True
            elif w not in stoplist and len(w) > 3:
                # keep adding words to list until a phrase boundary is detected
                candidate_phrases.append(w.strip())
                previous_stop = False
 
    # Create a list of candidate phrases without boundary demarcation
    phrases = re.split(";+", ' '.join(candidate_phrases))

# Clean up phrases    
re2 = re.compile('[^\.!?,"(){}\*:]*[\.!?,"(){}\*:]')
for s in range(len(phrases)):
    phrases[s] = re.sub(re2, '', phrases[s])
    phrases[s] = phrases[s].strip(' ')
    phrases[s] = phrases[s].replace(' ', '_')
    phrases[s] = phrases[s].replace('__', '_')
    phrases[s] = phrases[s].strip('_')

for s in range(len(phrases)):
    try:
        phrases.remove('')
        phrases.remove(' ')
        phrases.remove('/n')
    except:
        pass
    
for t in range(50):
    print(phrases[t])

#print(phrases)
# Probably we'll want to remove stop words from the list of phrases at the end of this.

## TERM FREQUENCY–INVERSE DOCUMENT FREQUENCY (TF-IDF)
![image.png](attachment:image.png)
The quintessential early Natural Language Processing tool, the TF-IDF analysis for context and sentiment evaluation is useful only over a large corpus. It must be understood that the corpus is not just a sample to be evaluated, but instead is the entire population that sets a 'benchmark' for evaluation, if you will. 

Here we establish a Term Frequency (TF) count of word frequencies, just as we showed a phrase frequency count in the last step in this notebook.

## CREATING A WORD LIST and then a DICTIONARY

In [None]:
#Establish wordList
wordList = []
for u in range(len(sentences)):
    for v in sentences[u]:
        words = re.split("\\s+", v)
        wordList.extend(words)
        
#for w in range(50):
    #print(wordList[w])

In [None]:
#Establish wordDict
wordDict = {}
for w in range(len(wordList)):
    newWord = wordList[w]
    newWord = newWord.lower()
    newWord = newWord.replace('.', '')
    wordDict[w] = newWord
#print(wordDict)

## PERFORM WORD COUNT ON THE DICTIONARY

In [None]:

countDict = {}
for x in range(len(wordDict)):
    term = wordDict[x]
    count = 1
    for y in range(len(wordDict)):
        try:
            if wordDict[y].find(term) > 0:
                count += 1
        except:
            pass
        countDict[term] = count

for k, v in countDict.items():
    print(k, v)

In [None]:
# Computes ratio of word's appearances to total words
bow = wordList
bowCount = len(bow) #BOW = Bag of Words
tfDict = {}
for term, count in countDict.items():
    tfDict[term] = count/float(bowCount)

num = dict(sorted(tfDict.items(), key=lambda x: x[1], reverse = True))
#for (k, v in num.items()):
    #print(k, v)

## PERFORM A PHRASE FREQUENCY COUNT

Now we can identify common phrases by performing a frequency count on each phrase.  Moreover, if the corpus is large enough, commonly used phrases will be evident with higher counts across many texts.  For this reason the phrase list along with counts, will be stored in a file.

In [None]:
# Phrase frequency count
from operator import itemgetter
wordfreq = []
for u in range(len(phrases)):
    utterance = phrases[u]
    uttcnt = 0
    uttcnt = phrases.count(utterance)
    if uttcnt > 1:
        wordfreq.append(uttcnt)
    
zipped = list(zip(phrases, wordfreq))
sortzip = sorted(zipped, key=itemgetter(1), reverse=True)
phraseFreqDict = dict(sortzip)


In [None]:
"""
If a sentence is usually 'subject' then 'verb' and then 'noun', we could assume that a good sentence or 
phrase (which could or also could not be a sentence), would have that structure. Simply, the common research topics 
of 'attention' 'sentiment' and 'semantics' match these structural terms in functionality, at least somewhat. 
We use a trifecta of unit vectors initially therefore to represent each of these terms, and apply them in a relatively 
intuitive fashion. Semantics are often considered ordinal and therefore a vector in 3-space could be (1,1,0). Taking a 
cartesian approach to the 3-space, Sentiment could be described as (1,0,1) when flat and Attention has been described 
as nearly orthogonal to sentiment and could therefore be (0,1,1).
"""

import xxhash, numpy
from keras.preprocessing.text import one_hot

phraseDictH = {}
# Estimate the size of the vocabulary
vocabSize = len(phrases)
docSize = len(spoken)
phraseFreqs = phraseFreqDict.values()
phraseVocab = phraseFreqDict.keys() 

# Return a dictionary whose keys are phrase tokens, the values are the indices       
#START HERE
for key,val in phraseFreqDict.items():
    newPhrase = phraseFreqDict[val]
    print(newPhrase)
    # First Hash the Phrases
    hashedPhrase = xxhash.xxh32(newPhrase, seed=60155748).hexdigest()
    # Second, Pad the Hashes if they are NOT in hex (this is worth trying since hex is not ideal for text)
    # Third, Vectorize the Padded or hex Hashes, using an embedding vector set from attention, semantics, 
    # or sentiment
    phraseDictH[z] = hashedPhrase
    
# Last, One-Hot the Vectors
hashSize = len(hashedPhrase)

#print(sortzip[ v(0) ])

array = numpy.zeros( (docSize, hashSize, vocabSize), dtype=float, order='F')
for i in range(26): #This is the Z-axis    
    for j in enumerate(phraseDictH):
        array[i, j, phraseDictH[phraseDictH[z]]] = 1 #Here, the third param must be the IDF?

result = encoder_input_data
#print(result)


In [None]:
## BYTE-LEVEL ENCODING OF THE DICTIONARY

In [None]:
dict = sorted(countDict.items(), key=lambda x: x[1], reverse = True)
length = len(dict)
#print(length)
# Use a 0-255 integer byte level encoding



## Testing the encoding
The TF-IDF and One-Hot encoder, should produce nearly the same output.  One-Hot produces a Gaussian Intensity for a phrase amongst the list of phrases, while the Inverse Document Frequency should do similarly.

## TO BE CONTINUED...
At this point, this notebook is finished. If you are really truly interested in using this code, please refer to Part 2 for a continuation of this process, wherein the code in this notebook is converted into objects and a multiple document corpus is compiled.