# Exploring Gibbon

In [1]:
# imports
import json
import os

import pandas as pd
import scattertext as st
import spacy
from IPython.core.display import HTML
from nltk.corpus import PlaintextCorpusReader
from sklearn.feature_extraction.text import TfidfVectorizer

## Pre-process text for analysis using SpaCy

Before doing NLP work, most texts will need to be preprocessed in different ways. You may need to **tokenize** the text, remove stopwords, or **lemmatize** the text. What you do in pre-processing depends entirely on what your project is. 

In [2]:
# Load the small English pipeline with the NER and parser components disabled.
nlp = spacy.load("en_core_web_sm", disable = ['ner', 'parser'])
# Raise the pipeline's maximum accepted text length (in characters).
# NOTE(review): 3045039 is presumably the size of the longest chapter file as
# reported by the length-check cell further down — confirm.
nlp.max_length = 3045039

### Sample 

In [None]:
# A one-sentence input for trying out the pipeline before running it on whole chapters.
sample = "IN the second century of the Christian era, the Empire of Rome comprehended the fairest part of the earth, and the most civilised portion of mankind."

In [None]:
# Run the sample sentence through the spaCy pipeline; the result is iterated token by token below.
doc = nlp(sample)

In [None]:
# One printed row per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, word shape, is-alphabetic flag, is-stopword flag.
for tok in doc:
    fields = (
        tok.text, tok.lemma_, tok.pos_, tok.tag_,
        tok.dep_, tok.shape_, tok.is_alpha, tok.is_stop,
    )
    print(*fields)

### Pre-process Gibbon

For our immediate purposes we want to convert the raw text of Gibbon (which is in the form of `strings`) to a list of **lemmas**.

In [3]:
def get_noun_and_verb_lemmas(text):
    """Lemmatize a string and keep only its nouns and verbs.

    The text is processed with the module-level pipeline ``nlp``. Tokens whose
    ``pos_`` is ``'NOUN'`` or ``'VERB'`` are kept, and their ``lemma_`` values
    are returned as a list in document order.
    """
    wanted_pos = ('NOUN', 'VERB')
    return [tok.lemma_ for tok in nlp(text) if tok.pos_ in wanted_pos]

In [None]:
text_path = "../text/gibbon_decline_and_fall/"
# Track the largest character count seen across all files in the corpus directory.
longest = 0
for file_name in os.listdir(text_path):
    with open(os.path.join(text_path, file_name), mode='r', encoding='utf-8') as f:
        longest = max(longest, len(f.read()))
print(longest)

In [4]:
# Takes about 3 minutes
text_path = "../text/gibbon_decline_and_fall/"
gibbon_lemmas = {}
# os.listdir() returns names in arbitrary order. Sorting makes the dict (and
# the chapter lists derived from it) follow chapter order, which later cells
# rely on when they slice the chapters by position.
for file_name in sorted(os.listdir(text_path)):
    # Characters 23-28 of the file name serve as the chapter key, e.g. 'chap01'.
    chapter_name = file_name[23:29]
    with open(os.path.join(text_path, file_name), encoding='utf-8', mode='r') as f:
        raw_text = f.read()
    lemmas = get_noun_and_verb_lemmas(raw_text)
    gibbon_lemmas[chapter_name] = lemmas
        

In [None]:
# Attempt 2: same as above, but very long chapters are lemmatized line by line
text_path = "../text/gibbon_decline_and_fall/"
gibbon_lemmas = {}
# Sorted because os.listdir() returns names in arbitrary order and later cells
# slice the chapters by position.
for file_name in sorted(os.listdir(text_path)):
    # Characters 23-28 of the file name serve as the chapter key, e.g. 'chap01'.
    chapter_name = file_name[23:29]
    with open(os.path.join(text_path, file_name), encoding='utf-8', mode='r') as f:
        raw_text = f.read()
    if len(raw_text) < 1000000:  # SpaCy will throw a memory error if a text is more than 1,000,000 characters
        lemmas = get_noun_and_verb_lemmas(raw_text)
    else:
        print(f"Long chapter: {chapter_name}")
        # Process each line separately and concatenate the results in order.
        lemmas = []
        for text_line in raw_text.split('\n'):
            lemmas.extend(get_noun_and_verb_lemmas(text_line))
    gibbon_lemmas[chapter_name] = lemmas

In [5]:
# Sanity check: print how many chapters were processed, then their keys
print(len(gibbon_lemmas))
#print(gibbon_lemmas['chap02'])
print(gibbon_lemmas.keys())

71
dict_keys(['chap01', 'chap02', 'chap03', 'chap04', 'chap05', 'chap06', 'chap07', 'chap08', 'chap09', 'chap10', 'chap11', 'chap12', 'chap13', 'chap14', 'chap15', 'chap16', 'chap17', 'chap18', 'chap19', 'chap20', 'chap21', 'chap22', 'chap23', 'chap24', 'chap25', 'chap26', 'chap27', 'chap28', 'chap29', 'chap30', 'chap31', 'chap32', 'chap33', 'chap34', 'chap35', 'chap36', 'chap37', 'chap38', 'chap39', 'chap40', 'chap41', 'chap42', 'chap43', 'chap44', 'chap45', 'chap46', 'chap47', 'chap48', 'chap49', 'chap50', 'chap51', 'chap52', 'chap53', 'chap54', 'chap55', 'chap56', 'chap57', 'chap58', 'chap59', 'chap60', 'chap61', 'chap62', 'chap63', 'chap64', 'chap65', 'chap66', 'chap67', 'chap68', 'chap69', 'chap70', 'chap71'])


In [7]:
# Write the chapter -> lemma-list mapping to a JSON file in the current working directory.
file_name = 'gibbon_lemmas.json'
with open(file_name, encoding='utf-8', mode='w') as f:
    json.dump(gibbon_lemmas, f)

## Find the most important words by chapter in Gibbon
For this part we are going to use a library called [scikit-learn](https://scikit-learn.org/stable/). This library is primarily for machine learning, but many of its features are useful for DH work.
Advanced Reading: https://towardsdatascience.com/tf-idf-explained-and-python-sklearn-implementation-b020c5e83275

In [None]:
# The vectorizer used below takes one string per document rather than a list,
# so each chapter's lemma list is joined with spaces. Names and strings are
# kept in two parallel lists that share the dict's order.
gibbon_chap_names = list(gibbon_lemmas)
gibbon_chap_strings = [' '.join(chap_lemmas) for chap_lemmas in gibbon_lemmas.values()]

In [None]:
# Transform the corpus into a chapter-by-term matrix of TF-IDF weights
# (use_idf=True, so these are weighted scores rather than raw word counts).
# max_df=.65 drops terms that occur in more than 65% of the chapters;
# norm=None leaves each chapter's row un-normalized.
vectorizer = TfidfVectorizer(max_df=.65, min_df=1, stop_words=None, 
                             use_idf= True, norm=None)
transformed_chaps = vectorizer.fit_transform(gibbon_chap_strings)
# Dense copy of the sparse result so each chapter's row can be zipped with the vocabulary.
transformed_chaps_as_array = transformed_chaps.toarray()

In [None]:
# Map each chapter name to its ten highest-scoring (term, tf-idf) pairs.
gibbon_key_vocab_by_chap = {}
# The vocabulary is the same for every chapter, so fetch it once rather than
# on every iteration. get_feature_names() was removed in scikit-learn 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
for chap, chap_name in zip(transformed_chaps_as_array, gibbon_chap_names):
    # Pair every vocabulary term with this chapter's score, highest score first.
    tf_idf_tuples = zip(feature_names, chap)
    sorted_tf_idf_tuples = sorted(tf_idf_tuples, key=lambda x: x[1], reverse=True)
    gibbon_key_vocab_by_chap[chap_name] = sorted_tf_idf_tuples[:10]  # only getting the top ten

In [None]:
# Print the five highest-scoring terms of each chapter as "chapNN => a, b, c, d, e".
# Slicing (instead of indexing v[0]..v[4]) avoids an IndexError when a chapter
# has fewer than five entries.
for k, v in gibbon_key_vocab_by_chap.items():
    top_terms = ', '.join(term for term, _score in v[:5])
    print(k + ' => ' + top_terms)

In [None]:
# explore vocabulary: show the stored top-ten (term, score) pairs for one chapter
gibbon_key_vocab_by_chap['chap01']  # <-- you can investigate other chapters

## Conditional frequency distribution in Gibbon

### Natural Language Toolkit
The **Natural Language Toolkit** (NLTK) is a library used for natural language processing (NLP). If you want to learn more, I highly recommend working through the [NLTK Book](https://www.nltk.org/book/). This resource is a great introduction to NLP specifically and Python more generally.

A **conditional frequency distribution** (cfd) is a collection of word counts for a given condition, i.e. category. Here the categories are the separate chapters in Gibbon. We can chart which words are used most frequently in each chapter. This will tell us something about the nature of each chapter.

In [None]:
import nltk
import matplotlib.pyplot as plt

In [None]:
# conditional frequency distribution: one condition per target prefix,
# counting in which chapter each matching lemma occurs
targets = ['doctrine', 'apostle', 'presbyter', 'daemon', 'immortality']  # <-- insert token(s) to explore (lowercase)
cfd = nltk.ConditionalFreqDist(
    (target, chap_name)
    for chap_name, chap_lemmas in gibbon_lemmas.items()
    for lemma in chap_lemmas
    for target in targets
    if lemma.lower().startswith(target)
)
# display plot
plt.figure(figsize=(20, 8))  # this expands the plot to make it more readable
cfd.plot()

### Activity
Based on the key vocabulary by chapter above, explore the use of different terms in the conditional frequency distribution. 
* What questions about the text does this raise for you?
* What hypotheses about the text can you form?

## ScatterText
[ScatterText](https://github.com/JasonKessler/scattertext) is a Python library used to visually compare texts according to two categories.

**Technical note**: Due to the large corpora we will be comparing, I have made adjustments to [spaCy](https://spacy.io/) to reduce processing time.

In [None]:
# Re-create the pipeline with the NER and parser components disabled.
# NOTE(review): this replaces the `nlp` object configured at the top of the
# notebook, so the raised `max_length` set there no longer applies to it —
# confirm that is intended.
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser'])

### Compare Chapters in Gibbon

In [None]:
# First sixteen chapter strings (remember: the final number of a slice is exclusive).
vol_1 = gibbon_chap_strings[:16]

# Every chapter string from index 57 to the end.
vol_6 = gibbon_chap_strings[57:]


**Note**: ScatterText requires data in a [Pandas](https://pandas.pydata.org/docs/#) dataframe.

In [None]:
# 'text' is a list of lemmas, so each lemma becomes its own row and the scalar
# 'chapter' label is repeated on every row.
chap15_df = pd.DataFrame(data={'chapter': 'chapter 15', 'text': gibbon_lemmas['chap15']})
chap27_df = pd.DataFrame(data={'chapter': 'chapter 27', 'text': gibbon_lemmas['chap27']})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (each frame keeps its own index).
df = pd.concat([chap15_df, chap27_df])

In [None]:
# sanity check: preview the first rows of the combined frame
df.head()

In [None]:
# Build a scattertext corpus from the frame, using 'chapter' as the category and 'text' as the document.
# No `nlp` argument is passed here, unlike the author comparison further down.
corpus = st.CorpusFromPandas(df, category_col='chapter', text_col='text').build()

In [None]:
# Generate the interactive explorer as an HTML string, with 'chapter 15' as the
# focus category and 'chapter 27' as the label for the other side.
html = st.produce_scattertext_explorer(corpus, category='chapter 15',
                                       category_name='chapter 15',
                                       not_category_name='chapter 27',
                                       width_in_pixels=900)
# Render the explorer inline as the cell's output.
HTML(html)

In [None]:
# Save the interactive HTML to a file in the current working directory
with open('./scattertext.html', encoding='utf8', mode='w') as f:
    f.write(html)
# NOTE(review): the disabled line below looks like a leftover; `f` is the file
# object opened above — confirm whether a download step is still wanted.
#f.download('./scattertext.html')

## Compare Gibbon and Hume

### Prepare Hume text

In [None]:
# Read every .txt file in the Hume directory as one corpus. The pattern is a
# raw string so that `\.` reaches the regex engine as an escaped dot instead of
# being an invalid Python string escape.
hume_corpus = PlaintextCorpusReader('./18th-century-historians/hume/hume-history-of-england/', r'.*\.txt')
print(hume_corpus.fileids()[:5])  # Just the first 5

In [None]:
# This will take about 3 minutes
# Build one list of noun/verb lemmas per Hume file.
hume_docs = []
for fileid in hume_corpus.fileids():
    raw_text = hume_corpus.raw(fileid)
    # `pre_process` is not defined anywhere in the visible notebook; use the same
    # lemmatizer that was applied to Gibbon so the two corpora are prepared alike.
    lemmas = get_noun_and_verb_lemmas(raw_text)
    hume_docs.append(lemmas)

In [None]:
# Join each document's lemma list into a single space-separated string.
hume_doc_strings = [' '.join(doc_lemmas) for doc_lemmas in hume_docs]

In [None]:
# One row per document, labelled by author. The Gibbon strings live in
# `gibbon_chap_strings` (built earlier, one string per chapter); the name
# `gibbon_doc_strings` used here before is never defined in the notebook.
gibbon_df = pd.DataFrame(data={'author': 'Gibbon', 'text': gibbon_chap_strings})
hume_df = pd.DataFrame(data={'author': 'Hume', 'text': hume_doc_strings})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (each frame keeps its own index).
author_df = pd.concat([gibbon_df, hume_df])

In [None]:
# about 3 min
# Build a scattertext corpus with 'author' as the category and 'text' as the
# document, passing the spaCy pipeline `nlp` to process each document.
author_corpus = st.CorpusFromPandas(author_df,
                                    category_col='author',
                                    text_col='text',
                                    nlp=nlp,
                                    ).build()

In [None]:
# Generate the interactive explorer as an HTML string, with 'Gibbon' as the
# focus category and 'Hume' as the label for the other side.
# Note: this reuses the name `html`, replacing the chapter-comparison HTML.
html = st.produce_scattertext_explorer(author_corpus, category='Gibbon',
                                       category_name='Gibbon',
                                       not_category_name='Hume',
                                       width_in_pixels=900)
# Render the explorer inline as the cell's output.
HTML(html)