# 01: Quoth The Raven NLG - Data Loading and Preprocessing

---

In this notebook, load in the text files for the complete works of Edgar Allan Poe and prepare the text for training natural language generating models.

* Section A: Loading and Cleaning Text Files
* Section B: Generating Word-Level Tokens of the Text Data
* Section C: Generating Sequences of Tokens and Saving to a File for Modeling and Language Generation

### Imports:

In [51]:
import os
import re
from unidecode import unidecode
from keras.preprocessing.text import text_to_word_sequence, hashing_trick

### <a name="load"></a>Section A: Loading and Cleaning the Text Files 

In the data folder, you'll notice a few different items:
* Original text files

In [71]:
def load_corpus(path, file_encoding=None):
    
    # make a list of all the text files in the specified directory
    text_files = [file for file in os.listdir(path)]
    
    print(f'The following {len(text_files)} have been loaded:')
    for _ in range(len(text_files)):
        print(text_files[_])
    
    # create variable to hold the text from our combined documents
    loaded_text = ''
    
    # open and append the file contents to our com
    for file in text_files:
        loaded_text += open(path + file, encoding=file_encoding).read()
        loaded_text += ' '
    
    # gets rid of extra spaces due to project gutenberg formatting
    loaded_text = ' '.join(loaded_text.split())
    
    # converting to ASCII to get rid of smart quotes and some special characters
    loaded_text = unidecode(loaded_text)
    
    print()
    print(f'The length of the combined documents (in characters) is: {len(loaded_text)}')
    
    return loaded_text

In [72]:
raw_text = load_corpus('./data/Poe_NLG/02_Poe_author_text_only/Prose/', 'utf-8')

The following 5 have been loaded:
CompletePoeVol3-trimmed.txt
CompletePoeVol4-trimmed.txt
CompletePoeVol1-trimmed.txt
CompletePoeVol5-prose-trimmed.txt
CompletePoeVol2-trimmed.txt

The length of the combined documents (in characters) is: 2296101


In [52]:
text_files = [file for file in os.listdir('./data/Poe_NLG/02_Poe_author_text_only/Prose/')]
print(text_files)
raw_text = ''

for file in text_files:
    raw_text += open(f'./data/Poe_NLG/02_Poe_author_text_only/Prose/{file}', encoding='utf-8').read()
    raw_text += ' '
raw_text = ' '.join(raw_text.split())
raw_text = unidecode(raw_text)
len(raw_text)

['CompletePoeVol3-trimmed.txt', 'CompletePoeVol4-trimmed.txt', 'CompletePoeVol1-trimmed.txt', 'CompletePoeVol5-prose-trimmed.txt', 'CompletePoeVol2-trimmed.txt']


2296101

In [53]:
chars = sorted(list(set(raw_text)))
print(chars)

[' ', '!', '"', '$', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '~']


In [54]:
raw_text_v2 = re.sub('[^A-Za-z!?.,"\':;-]+',' ',raw_text)

In [55]:
raw_text_v2 = raw_text_v2.replace('--', '&')

In [56]:
punctuation_as_word = ['!', '?', '.', ',', '"', "'", ':', ';', '&', '-']
for punc in punctuation_as_word:
    raw_text_v2 = raw_text_v2.replace(punc, f' {punc} ')

raw_text_v2 =  raw_text_v2.replace(' & ', ' -- ')


In [58]:
raw_text_v2 = re.sub('\s\s+', ' ', raw_text_v2)

In [60]:
tokens = raw_text_v2.split()

In [61]:
tokens[:10]

['Upon', 'my', 'return', 'to', 'the', 'United', 'States', 'a', 'few', 'months']

In [40]:
len(tokens)

399322

In [42]:
len(set(tokens))

43940

In [62]:
# create sequences of tokens to use with Keras Tokenizer class for use in modeling
seq_input_len = 3
seq_tot_len = seq_input_len + 1
sequences = []
for i in range(seq_tot_len, len(tokens)):
    token_seq = tokens[i-seq_tot_len:i]
    sequence = ' '.join(token_seq)
    sequences.append(sequence)

print(f'Total Sequences created: {len(sequences)}')

Total Sequences created: 480066


In [63]:
# save cleaned and prepped sequences
def save_sequences_to_file(sequences, filename):
    sequence_lines = '\n'.join(sequences)
    file = open(f'./data/Poe_NLG/03_Text_objects_for_models/{filename}', 'w')
    file.write(sequence_lines)
    file.close()

save_sequences_to_file(sequences, f'cleaned_poe_tot_seq_len_{seq_tot_len}')

TypeError: 'NoneType' object is not callable