# Exploring Gibbon - Process Text

In [1]:
# imports
import os
import json
import spacy

ModuleNotFoundError: No module named 'spacy'

## Preliminary: Working with files

In [None]:
# write a short passage out to a brand-new text file
text = "In the second century of the Christian era, the empire of Rome comprehended the fairest part of the earth, and the most civilised portion of mankind. The frontiers of that extensive monarchy were guarded by ancient renown and disciplined valour. The gentle, but powerful, influence of laws and manners had gradually cemented the union of the provinces."
file_name = 'new_file.txt'
# mode 'w' creates the file (or overwrites an existing one)
with open(file_name, mode='w', encoding='utf-8') as out_file:
    out_file.write(text)

In [None]:
# read the whole file back into a single string and display it
with open(file_name, mode='r', encoding='utf-8') as in_file:
    opened_text = in_file.read()
print(opened_text)

## Preliminary: List Comprehension

In [None]:
# Our first NLP routine: drop stop-words from a passage
text = "In the second century of the Christian era, the empire of Rome comprehended the fairest part of the earth, and the most civilised portion of mankind. The frontiers of that extensive monarchy were guarded by ancient renown and disciplined valour. The gentle, but powerful, influence of laws and manners had gradually cemented the union of the provinces."
# Words we want to filter out (compared in lower case)
stop_words = ["in", "the", "of", "and", "that", "by", "but"]
# Split the passage on single spaces to get a list of tokens
tokens = text.split(" ")
# Collect every token that is not a stop-word, in original order
tokens_no_stops = []
for token in tokens:
    is_stop_word = token.lower() in stop_words
    if is_stop_word:
        # skip stop-words entirely
        continue
    tokens_no_stops.append(token)
print(tokens_no_stops)

In [None]:
# The for loop can be re-written as a "list comprehension"
text_2 = "In the second century of the Christian era, the empire of Rome comprehended the fairest part of the earth, and the most civilised portion of mankind. The frontiers of that extensive monarchy were guarded by ancient renown and disciplined valour. The gentle, but powerful, influence of laws and manners had gradually cemented the union of the provinces."
# Convert the string into a list
tokens_2 = text_2.split(" ")
# Identify stop-words
stop_words = ["in", "the", "of", "and", "that", "by", "but"]
# List comprehension: keep every token of tokens_2 (this cell's own list)
# whose lower-cased form is not a stop-word
tokens_no_stops_2 = [token for token in tokens_2 if token.lower() not in stop_words]
print(tokens_no_stops_2)

## Preliminary: os.listdir

In [None]:
# os.listdir returns the names of the entries in a directory; print one per line
text_path = "../text/gibbon_decline_and_fall/"
chapter_files = os.listdir(text_path)
for file_name in chapter_files:
    print(file_name)

## Process text for analysis using spaCy

Before doing NLP work, most texts will need to be preprocessed in different ways. You may need to **tokenize** the text, remove stopwords, or **lemmatize** the text. What you do in pre-processing depends entirely on what your project is.

Check out [spaCy 101](https://spacy.io/usage/spacy-101)

In [None]:
# Load spaCy's "en_core_web_sm" pipeline; `nlp` is what the cells below call on text
nlp = spacy.load("en_core_web_sm")

### Simple example 

In [None]:
# A one-sentence sample to run through the pipeline
sample = "IN the second century of the Christian era, the Empire of Rome comprehended the fairest part of the earth, and the most civilised portion of mankind."

In [None]:
# Run the loaded pipeline over the sample; the result is iterated token by token below
doc = nlp(sample)

In [None]:
# For every token show: text, lemma, part of speech, alphabetic flag, stop-word flag
for tok in doc:
    print(tok.text, tok.lemma_, tok.pos_, tok.is_alpha, tok.is_stop)

### Memory issues with spaCy
We will be using spaCy to process Gibbon's chapters, but some of them can be quite long (over 3 million characters). This means that spaCy can run into memory problems. As a solution, I will disable a few of spaCy's functions and raise its default allowed length. For an alternative solution, see below.

In [None]:
# Measure every chapter file and report the largest character count,
# which tells us what value nlp.max_length needs to be
text_path = "../text/gibbon_decline_and_fall/"
longest = 0
for file_name in os.listdir(text_path):
    chapter_file = text_path + file_name
    with open(chapter_file, mode='r', encoding='utf-8') as chapter:
        raw_text = chapter.read()
    # keep the running maximum
    longest = max(longest, len(raw_text))
print(longest)

In [None]:
# Switch off the 'ner' and 'parser' components; the processing cells below
# only read lemma and part-of-speech attributes from tokens
nlp.disable_pipes('ner', 'parser')
# Raise the maximum text length spaCy will accept.
# NOTE(review): 3045039 is presumably the value printed by the "longest chapter" cell above — confirm
nlp.max_length = 3045039

### Pre-process Gibbon

For our immediate purposes we want to convert the raw text of Gibbon (which is in the form of `strings`) to a list of **lemmas**.

In [None]:
def get_noun_and_verb_lemmas(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and collect lemmas.

    Only tokens whose ``pos_`` is ``'NOUN'`` or ``'VERB'`` contribute; their
    ``lemma_`` values are returned as a list, in document order.
    """
    wanted_pos = ('NOUN', 'VERB')
    return [tok.lemma_ for tok in nlp(text) if tok.pos_ in wanted_pos]

In [None]:
# Takes about 3 minutes
# Build a dict mapping each chapter key to that chapter's noun/verb lemmas
text_path = "../text/gibbon_decline_and_fall/"
gibbon_lemmas = {}
for file_name in os.listdir(text_path):
    # the chapter key is the slice [23:29] of the file name
    chapter_name = file_name[23:29]
    with open(text_path + file_name, mode='r', encoding='utf-8') as chapter_file:
        raw_text = chapter_file.read()
    gibbon_lemmas[chapter_name] = get_noun_and_verb_lemmas(raw_text)
        

In [None]:
# Sanity check: number of chapters, their keys, and the first 25 lemmas of 'chap01'
chapter_count = len(gibbon_lemmas)
print(chapter_count)
print(gibbon_lemmas.keys())
first_lemmas = gibbon_lemmas['chap01'][:25]
print(first_lemmas)

In [None]:
# Save the chapter -> lemmas dict as JSON so it can be reloaded later
file_path = '../data/'
file_name = 'gibbon_lemmas.json'
output_path = file_path + file_name
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(gibbon_lemmas, json_file)

### Alternative solution to the memory issue

In [None]:
# Attempt 2: lemmatise over-long chapters one line at a time
text_path = "../text/gibbon_decline_and_fall/"
gibbon_lemmas = {}
for file_name in os.listdir(text_path):
    chapter_name = file_name[23:29]
    with open(text_path + file_name, mode='r', encoding='utf-8') as chapter_file:
        raw_text = chapter_file.read()
    if len(raw_text) >= 1000000:
        # Chapters of 1,000,000 characters or more are not sent to spaCy in
        # one piece; each line is processed separately and the results joined
        print(f"Long chapter: {chapter_name}")
        lemmas = []
        for text_line in raw_text.split('\n'):
            lemmas.extend(get_noun_and_verb_lemmas(text_line))
    else:
        # Short enough to process in a single call
        lemmas = get_noun_and_verb_lemmas(raw_text)
    gibbon_lemmas[chapter_name] = lemmas