In [1]:
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import time
import os
import numpy as np
import nltk
import spacy
import string
from unidecode import unidecode
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import builtins
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import random
from wordcloud import WordCloud
from gensim.models import TfidfModel, Phrases, phrases, CoherenceModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import gensim.models.ldamodel as LDAModel
from gensim.test.utils import datapath
from gensim.corpora import MmCorpus
pyLDAvis.enable_notebook()
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Scraper
I scraped articles at random from the Stanford Encyclopedia of Philosophy (SEP) and saved the body of each article to a text file.

In [2]:
"""
NUM_DOCS = 3000
directory = 'pages//'
docs = []

def clean_filename(filename):
    filename = unidecode(filename)
    bad_chars = ['‘', '\'']
    for char in bad_chars:
        if char in filename:
            print(filename)
            filename = filename.replace(char, '')
    return filename

while len(os.listdir('pages//')) < NUM_DOCS:
    # load a page
    url = 'https://plato.stanford.edu/cgi-bin/encyclopedia/random'
    page = requests.get(url)
    soup = bs(page.content, 'html.parser')
    
    # Locate the title and clean it up
    # Every title contains the name of the site, so that is removed
    # Forward slashes are replaced with dashes
    title = ' '.join(soup.title.text.strip('\n').split()[:-4]).replace('/', '-')
    title = clean_filename(title)
    # Save the body of the document
    if title not in docs:
        docs.append(title)
        body = soup.find('div', id='main-text')
        # A minority of pages did not have anything in the body
        if body:
            paragraphs = '\n'.join([paragraph.text.strip('\n') for paragraph in body.find_all('p')])
            with open(f'{directory+title}.txt', 'w', encoding='utf8') as f:
                for paragraph in paragraphs:
                    f.write(paragraph)
    count = len(os.listdir('pages//'))
    if not count % 500:
        print(count)
    
    
    # sleep to be nice
    time.sleep(3)"""

"\nNUM_DOCS = 3000\ndirectory = 'pages//'\ndocs = []\n\ndef clean_filename(filename):\n    filename = unidecode(filename)\n    bad_chars = ['‘', ''']\n    for char in bad_chars:\n        if char in filename:\n            print(filename)\n            filename = filename.replace(char, '')\n    return filename\n\nwhile len(os.listdir('pages//')) < NUM_DOCS:\n    # load a page\n    url = 'https://plato.stanford.edu/cgi-bin/encyclopedia/random'\n    page = requests.get(url)\n    soup = bs(page.content, 'html.parser')\n    \n    # Locate the title and clean it up\n    # Every title contains the name of the site, so that is removed\n    # Forward slashes are replaced with dashes\n    title = ' '.join(soup.title.text.strip('\n').split()[:-4]).replace('/', '-')\n    title = clean_filename(title)\n    # Save the body of the document\n    if title not in docs:\n        docs.append(title)\n        body = soup.find('div', id='main-text')\n        # A minority of pages did not have anything in the b

# Read in the Documents

In [3]:
# Collect the path of every file in the pages directory whose extension is txt
doc_dir = 'pages//'
doc_locations = [
    doc_dir + name
    for name in os.listdir(doc_dir)
    if name.split('.')[-1] == 'txt'
]

In [4]:
# Load every document into memory as a single string.
# f.read() keeps the file's own line breaks; joining f.readlines() with '\n'
# would double every newline, because each line already ends with one.
raw_docs = []
for loc in doc_locations:
    with open(loc, encoding='utf8') as f:
        raw_docs.append(f.read())
print(f'There are {len(raw_docs)} documents')

There are 1730 documents


# Tokenization
Documents are split into tokens, with each word being a token.

In [5]:
# Split each raw document into word-level tokens with NLTK's word tokenizer
tokenizer = nltk.tokenize.word_tokenize
tokenized_docs = []
for raw_doc in tqdm(raw_docs):
    tokenized_docs.append(tokenizer(raw_doc))

100%|██████████████████████████████████████████████████████████████████████████████| 1730/1730 [02:19<00:00, 12.39it/s]


# Lemmatization
A lemma is the "base" form of a word given the word's part of speech. For example, the lemma of "running" is "run," the lemma of "cares" is "care" and so on. Lemmatization reduces words to their lemmas. Since lemmatization is part of speech dependent, each word must first be tagged with its part of speech.

I use lemmatization here because I am only interested in the meaning of words, regardless of their form. For example, the differences between "cares," "caring," "cared" and so on are meaningless for this application. An added bonus is that lemmatization reduces the number of unique words in the text.

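Note that when tagging parts of speech there is a catch-all category for every tag that is not a noun, verb, adverb or adjective. This covers tokens that would not be lemmatized anyway, like numbers, conjunctions and determiners ("all," "every," etc.). In the code below, tokens in this catch-all category are dropped from the document rather than carried through unlemmatized.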

In [None]:
# Will hold one list of lemmas per document
lemmatized_docs = []
# WordNet lemmatizer instance used for every token
lemmatizer = WordNetLemmatizer()

def tagger(untagged_doc):
    """Tag a tokenized document with parts of speech the lemmatizer can use.

    Args:
        untagged_doc: a sequence of tokens.

    Returns:
        A list of (token, PoS) tuples. Tags from nltk.pos_tag that begin with
        N, V, R or J are replaced by the matching wordnet constant (noun,
        verb, adverb, adjective); every other tag is passed through unchanged.
    """
    # First letter of the nltk.pos_tag tag -> constant understood by the lemmatizer
    wordnet_tags = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ,
    }
    return [
        (token, wordnet_tags.get(raw_tag[:1], raw_tag))
        for token, raw_tag in nltk.pos_tag(untagged_doc)
    ]
        
    
pos_to_lemmatize = [wordnet.NOUN, wordnet.VERB, wordnet.ADV, wordnet.ADJ]
lemmatized_docs = []

# Select a subset of documents - helps with memory problems.
# The subset is drawn with a dedicated, seeded generator so that it is the
# same on every run (later cells cache results on disk, and a subset that
# changed between runs would no longer match those caches). sample() also
# leaves tokenized_docs in its original order, so re-running this cell is
# idempotent.
num_docs = 875
subset_seed = 123
subset_size = min(num_docs, len(tokenized_docs))
doc_subset = random.Random(subset_seed).sample(tokenized_docs, subset_size)
print(f'Selected {len(doc_subset)} documents')

# Tag and lemmatize each document
for tokenized_doc in tqdm(doc_subset):
    # POS tagging
    tagged_doc = tagger(tokenized_doc)
    # Lemmatization: only nouns, verbs, adverbs and adjectives are kept;
    # tokens carrying any other tag are dropped from the document.
    lemmatized_doc = [
        lemmatizer.lemmatize(tok, pos)
        for tok, pos in tagged_doc
        if pos in pos_to_lemmatize
    ]
    lemmatized_docs.append(lemmatized_doc)

print(f'Lemmatized {len(lemmatized_docs)} documents')

Selected 875 documents


 23%|██████████████████▌                                                             | 203/875 [01:47<05:55,  1.89it/s]

# Remove Stopwords, Punctuation, LaTeX, and Short Words.
The guiding principle has been that I only want to keep meaningful words. As such, I remove the following categories:
* Stopwords: words that are commonly used to the point of meaninglessness (ex. "the," "is")
* Punctuation: Not words and not meaningful
* LaTeX: Not words and not meaningful
* Short words: Unlikely to be meaningful. Most should be caught when removing stopwords, but there may be, for example, variables used in the text (ex. "If X is true").

In [None]:
def not_latek(tok):
    """Return True when `tok` contains no punctuation other than hyphens.

    SEP has a lot of equations, etc.; this check is used to filter out tokens
    that look like fragments of them.
    """
    # Hyphens stay legal so that tokens such as eighteenth-century survive
    forbidden = set(string.punctuation) - {'-'}
    return forbidden.isdisjoint(tok)


In [None]:
clean_docs = []
# Tokens must be strictly longer than this many characters to be kept
min_tok_length = 3
stopwords = nltk.corpus.stopwords.words('english')
stopwords += string.punctuation
# Set lookup is constant time; scanning the list for every token is not
stopword_set = set(stopwords)
for doc in lemmatized_docs:
    clean_doc = []
    for tok in doc:
        # Lowercase BEFORE the stopword test: the stopword list is lowercase,
        # so a capitalised stopword (e.g. at the start of a sentence) would
        # otherwise pass the test and then be lowercased into the output.
        lowered = tok.lower()
        if len(tok) > min_tok_length and lowered not in stopword_set and not_latek(tok):
            clean_doc.append(lowered)
    clean_docs.append(clean_doc)

# Data Exploration

In [None]:
# Flatten the per-document token lists into one corpus-wide list of tokens
corpus = [tok for doc in tqdm(clean_docs) for tok in doc]
print(f'Combined the {len(clean_docs)} docs into a single list of {len(corpus)} words.')


## Document Length

In [None]:
# Number of tokens left in each cleaned document
lengths = list(map(len, clean_docs))
sns.histplot(lengths)

mean_length = int(np.mean(lengths))
shortest, longest = np.min(lengths), np.max(lengths)
print(f'The mean article length is {mean_length} words.')
print(f'The shortest article is {shortest} and the longest is {longest} words.')

 ## Word Frequencies
 Here, I visualize the 20 most used words across the corpus. This acts as a nice sanity check after processing the data, making sure that most of the top words are meaningful.

In [None]:
# Count every token in the corpus, most frequent first
counter = Counter(corpus)
most_common = counter.most_common()

sns.set(rc={'figure.figsize':(11.7,8.27)})

# Split the 20 most frequent (word, count) pairs into bar labels and bar lengths
top_pairs = most_common[:20]
x = [pair[0] for pair in top_pairs]
y = [pair[1] for pair in top_pairs]
sns.barplot(x=y, y=x, orient='h')

Most of the top words are the sort of meaningful words one would expect in a philosophy website. Some are fairly generic (ex. "also" and "take"), but they are the minority. I am not concerned with them unless they are over-represented as relevant terms in the LDA model.

# Word Cloud

In [None]:
# Render a word cloud built from every token in the corpus
wc = WordCloud()
wc.generate(' '.join(corpus))
plt.imshow(wc)
# grid(None) toggles the grid, so whether it ends up hidden depends on the
# active plotting style; grid(False) hides it unconditionally.
plt.grid(False)

# Tf-Idf Models
Term Frequency-Inverse Document Frequency (tf-idf) gives a score that quantifies the importance of a word to a document. The score for a word increases with the number of times the word is used in the document, but decreases as the word is used in more documents across the corpus. As such, terms will have higher scores if they are used frequently in relatively few documents, and those terms will presumably be important.

## Learning Bigrams
An n-gram is a sequence of n consecutive tokens. For the sake of this analysis I consider the most common unigrams (single words, above) and bigrams. A bag-of-words model discards information about word order, so grouping consecutive tokens together recovers some of it.

As a rule of thumb, a bigram has a more specific meaning than a unigram. For example the word "ninth" tells us little to nothing - this could be in a context like "ninth place" or "ninth person." "Ninth century" will be found in fewer contexts.

There is no inherent reason to stop at bigrams; however, even a trigram analysis is fruitless here. To generate bigrams, I use gensim's Phrases module, which attempts to learn useful bigrams rather than finding every bigram in a text. I also attempt to find trigrams using the library, but am unable to do so, even with the model tuned liberally.

In [None]:
# Learn which adjacent token pairs should be merged into a single bigram token
bigram_phrases = phrases.Phrases(clean_docs, min_count=1, threshold=10)

# Phraser built from the trained model; indexing it with documents applies the learned bigrams
bigram = phrases.Phraser(bigram_phrases)

# Second pass, trained on the bigram-merged documents, to look for longer phrases
trigram = phrases.Phrases(bigram[clean_docs], min_count=1, threshold=10)

In [None]:
bigrams = []
bigrams_trigrams_docs = []
trigrams = []

# Apply the phrase model to every document and record the merged tokens by size
for doc in clean_docs:
    phrased_doc = list(trigram[doc])
    for token in phrased_doc:
        # Merged tokens are joined with "_": n underscores means n + 1 parts
        n_parts = token.count("_") + 1
        if n_parts == 2:
            bigrams.append(token)
        elif n_parts == 3:
            trigrams.append(token)
    # Every token is kept in the document, whatever its size
    bigrams_trigrams_docs.append(phrased_doc)
print(f'Learned {len(set(bigrams))} bigrams and {len(set(trigrams))} trigrams.')


In [None]:
# Display the list of three-part tokens collected above
trigrams

An immediate concern here is that, even with parameters set to be very accepting of phrases, we learn no trigrams. 

In [None]:
# Number of bigrams shown in the chart
TOP_X = 20

counter = Counter(bigrams)
# most_common(n) returns the n most frequent (bigram, count) pairs
most_common = counter.most_common(TOP_X)

x = [tup[0] for tup in most_common]
y = [tup[1] for tup in most_common]

# orient='h' is seaborn's parameter for horizontal bars, matching the
# unigram frequency chart earlier in the notebook
sns.barplot(x=y, y=x, orient='h')

The bigrams are somewhat promising with respect to their potential to discriminate between topics. Some of the bigrams have obviously topic-specific meanings, like philosophy_science (philosophy of science) or truth_value (logic).

### Bag of Words Vectorization
A bag of words vectorization is a vector representation of a document (or sentence, paragraph, etc.) that does not take token order into account. Each document is represented as a vector whose dimensionality equals the number of unique tokens in the corpus. Each component of the vector corresponds to a token, and its magnitude is the number of times that token is used in the document.

In [None]:
bow_path = 'bigram_bow.mm'
dict_path = 'bigram_dict.txt'

# The cache is only usable when BOTH files exist: loading needs the corpus and
# the dictionary, so a missing dictionary file must trigger a rebuild instead
# of failing in the load branch.
# NOTE: the cache is keyed on file existence only; delete both files whenever
# the cleaned documents change, otherwise a stale corpus is loaded.
if os.path.exists(bow_path) and os.path.exists(dict_path):
    bigram_bow = MmCorpus(bow_path)
    bigram_dictionary = Dictionary.load_from_text(dict_path)
else:
    # Create bigram dictionary
    bigram_dictionary = Dictionary(bigrams_trigrams_docs)
    bigram_dictionary.save_as_text(dict_path)

    # Create bag of words models from the bigram representations of the documents
    bigram_bow = [bigram_dictionary.doc2bow(doc) for doc in tqdm(bigrams_trigrams_docs)]

    MmCorpus.serialize(bow_path, bigram_bow)

### Building the Tf-Idf Model

In [None]:
# Create the model: fit tf-idf on the bigram bag-of-words corpus, passing
# bigram_dictionary as the id-to-word mapping
tfidf_bigram = TfidfModel(corpus=bigram_bow, id2word=bigram_dictionary)

#### Visualize tf-idf Score Distribution
Here, I visualize the distribution of tf-idf scores so that I can pick a reasonable cutoff, below which words will be discarded. Note that this distribution has a very long tail. The limits placed on the x-axis cut off some outliers.

In [None]:
scores = []
corpus_length = len(bigram_bow)

# Collect the tf-idf score of every (document, term) pair.
# One tqdm wrapper around the corpus is enough to show progress.
for bow in tqdm(bigram_bow, total=corpus_length, position=0, leave=True):
    scores.extend(score for _term_id, score in tfidf_bigram[bow])

hist = sns.histplot(scores,bins=1000)
# The distribution has a long tail; restrict the x-axis to the bulk of the scores
hist.set(xlim=(0,0.1))
hist.set_xlabel('tf-idf Score')
plt.show()

In [None]:
# Minimum tf-idf score a term needs, within a document, to stay in that document
cutoff = 0.02
# Names of the terms dropped from the documents; a term dropped from several
# documents appears once per document
words = []
corpus_length = len(bigram_bow)
tf_idf_bow = []
if 'tf_idf_bow.mm' not in os.listdir():
    # Create new bag of words representations of each document, keeping
    # only words above a certain score.
    for bow in tqdm(bigram_bow, total=corpus_length, position=0, leave=True):
        # Ids of the terms whose score reaches the cutoff. Building the keep-set
        # (rather than a drop-list) also removes terms that the tf-idf model
        # leaves out of its output because their weight is effectively zero.
        kept_ids = {term_id for term_id, value in tfidf_bigram[bow] if value >= cutoff}
        new_bow = []
        for entry in bow:
            if entry[0] in kept_ids:
                new_bow.append(entry)
            else:
                words.append(bigram_dictionary[entry[0]])
        tf_idf_bow.append(new_bow)
    MmCorpus.serialize('tf_idf_bow.mm', tf_idf_bow)
else:
    tf_idf_bow = MmCorpus('tf_idf_bow.mm')

# LDA Models

# Unigram

# Bigrams

### Model

### Tune Topics

In [None]:
def train_lda(topics):
    """Return an LDA model with `topics` topics, training it only when needed.

    Models are cached on disk under
    "<n> Documents/lda_model_<n>_<topics>/lda_model_<n>_<topics>", where <n> is
    the number of documents in bigram_bow. If that file exists the model is
    loaded from it; otherwise a model is trained on tf_idf_bow with
    bigram_dictionary as vocabulary, saved there, and returned.

    Args:
        topics: number of topics for the model.

    Returns:
        The trained or loaded LdaModel.
    """
    n_docs = len(bigram_bow)
    data_size_folder = f"{n_docs} Documents"
    # Use the same name for model folders and files
    model_folder_file = f"lda_model_{n_docs}_{topics}"
    model_dir = os.path.join(data_size_folder, model_folder_file)
    model_loc = os.path.join(model_dir, model_folder_file)

    # Test for the saved model file itself, not just its folder: a training run
    # that was interrupted leaves the folder behind without a model in it.
    if os.path.exists(model_loc):
        print(f'Loading trained model for model with {topics} topics.')
        return LDAModel.LdaModel.load(model_loc)

    if not os.path.isdir(data_size_folder):
        print(f'Making directory for corpus with {n_docs} documents.')
    # Creates the corpus-size folder and the model folder as needed
    os.makedirs(model_dir, exist_ok=True)

    print(f'Training a model with {topics} topics on a corpus with {n_docs} documents.')
    lda_model = LDAModel.LdaModel(corpus=tf_idf_bow,
                                  id2word=bigram_dictionary,
                                  num_topics=topics,
                                  chunksize=100,
                                  update_every=1,
                                  passes=100,
                                  alpha='auto',
                                  random_state=123)
    lda_model.save(model_loc)
    return lda_model


##### Coherence by Topic

In [None]:
# Train (or load) one model for every topic count from 21 to 49
lda_by_topic = list(map(train_lda, range(21, 50)))

In [None]:
# Concatenate the bag-of-words entries of every document into one flat list.
# NOTE(review): corpus_bow is not read anywhere else in the visible notebook -
# confirm it is still needed.
corpus_bow = []
for doc in bigram_bow:
    corpus_bow += doc
def coherence_score(lda_model):
    """Return the UCI coherence ('c_uci') score of an LDA model.

    Coherence is estimated from the phrase-merged documents
    (bigrams_trigrams_docs) together with bigram_dictionary, i.e. the token
    space the dictionary was built from. The unmerged clean_docs never contain
    the "_"-joined bigram tokens, so bigram topic words would have no
    occurrences there.

    Args:
        lda_model: a trained LdaModel.

    Returns:
        The model's coherence score.
    """
    coherence_model = CoherenceModel(model=lda_model,
                                     texts=bigrams_trigrams_docs,
                                     dictionary=bigram_dictionary,
                                     coherence='c_uci',
                                     processes=-1)
    # Keep numpy quiet about invalid floating-point operations during scoring
    with np.errstate(invalid='ignore'):
        score = coherence_model.get_coherence()
    return score

# Coherence score of every trained model, in the same order as lda_by_topic
scores = list(map(coherence_score, tqdm(lda_by_topic)))

  0%|                                                                                           | 0/29 [00:00<?, ?it/s]

In [None]:
# Display the coherence scores, one per model in lda_by_topic
scores

In [None]:
# Plot coherence against the number of topics of each model rather than the
# model's position in the list. x and y are passed by keyword because recent
# seaborn versions reject positional data arguments.
topic_counts = [model.num_topics for model in lda_by_topic]
ax = sns.lineplot(x=topic_counts, y=scores)
ax.set_xlabel('Number of topics')
ax.set_ylabel('Coherence (c_uci)')
plt.show()

In [None]:
# Spot-check: display the cleaned tokens of the second document
clean_docs[1]

### Visualization model

In [None]:
# lda_model and topics only exist as locals inside train_lda, so they are bound
# here explicitly; otherwise this cell raises NameError on a fresh kernel.
# 12 is listed among the good topic counts in the notes at the end of this
# notebook. train_lda loads the model from disk when it was already trained.
topics = 12
lda_model = train_lda(topics)

# NOTE(review): R is pyLDAvis's number of terms shown in the bar charts, not a
# topic count, and the model was trained on tf_idf_bow rather than bigram_bow -
# confirm both choices are intended.
vis = gensimvis.prepare(lda_model,
                       bigram_bow,
                       bigram_dictionary,
                       mds='mmds',
                       R=topics)
vis

In [None]:
# Good topic numbers
# 12!!!!!
# 10
# 6