In [28]:
import requests
import os
import numpy as np
import nltk
import spacy
import string
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from gensim.corpora import Dictionary

# Read in the Documents

In [8]:
def find_docs():
    doc_dir = 'pages//'
    doc_locations = []
    for file in os.listdir(doc_dir):
        if file.split('.')[-1] == 'txt':
            doc_locations.append(doc_dir+file)
    return doc_locations

In [9]:
def load_docs(doc_locations):
    raw_docs = []
    for loc in doc_locations:
        with open(loc, encoding='utf8') as f:
            raw_docs.append('\n'.join(f.readlines()))
    print(f'There are {len(raw_docs)} documents')
    return raw_docs

In [38]:
doc_locations = find_docs()
raw_docs = load_docs(doc_locations)
os.chdir('next_word')

There are 1730 documents


# Tokenization and Cleaning

Here the documents are tokenized into words and cleaned. Cleaning consists of:
* Removing LateX
* Removing Punctuation

In [16]:
tokenized_docs = [word_tokenize(doc) for doc in tqdm(raw_docs)]

100%|██████████████████████████████████████████████████████████████████████████████| 1730/1730 [04:18<00:00,  6.70it/s]


In [20]:
print(tokenized_docs[0][:20])

['Even', 'recognizing', 'some', 'early', 'modern', 'writings', 'on', 'the', 'emotions', 'for', 'what', 'they', 'are', 'is', 'no', 'easy', 'task', '.', 'In', 'part']


In [25]:
# Takes a token and a list of characters
# Returns true if any of those characters is in the token
# Otherwise returns false
def clean_token(tok, bad_chars):
    for char in bad_chars:
        if char in tok:
            return False
    return True

# Takes in a list of tokenized documents
# Returns a list of lowercase tokens with LaTeX and punctuation removed
def clean_docs(tokenized_docs):
    flat_docs = []
    bad_chars = string.punctuation
    bad_chars = bad_chars.replace('-', '')
    for doc in tqdm(tokenized_docs):
        for token in doc:
            token = token.lower()
            if clean_token(token, bad_chars):
                flat_docs.append(token)
    return flat_docs

In [27]:
clean_docs = clean_docs(tokenized_docs)

100%|██████████████████████████████████████████████████████████████████████████████| 1730/1730 [00:40<00:00, 42.44it/s]


# Vectorize Documents

## Create Dictionary

In [37]:
id2word = Dictionary([clean_docs])
vectorized_docs = id2word.doc2idx(clean_docs)
vocab_size = len(id2word.keys())
print(f'Vocabulary has {vocab_size} words.')
print(f'Total text length is {len(vectorized_docs)} tokens.')

Vocabulary has 206356 words.
Total text length is 19667516 tokens.


## Split into Inputs and Targets

In [66]:
context_size = 5

def make_inputs_targets(vectorized_docs, context_size):
    targets = []
    inputs = []
    for i in range(len(vectorized_docs) - context_size):
        inputs.append(vectorized_docs[i:i+context_size])
        targets.append(vectorized_docs[i+context_size])
    return (np.asarray(inputs), np.asarray(targets))

# GloVe Vectors

## Embedding Matrix

### Load the Vectors

In [70]:
#path_to_glove = 'glove.6B.100d.txt'
path_to_glove = 'glove.840B.300d.txt'

embed_index = {}
with open(path_to_glove, encoding='utf8') as f:
    for line in tqdm(f):
        word, vec = line.split(maxsplit=1)
        vec = np.fromstring(vec, sep=' ')
        embed_index[word] = vec

  vec = np.fromstring(vec, sep=' ')
2196017it [09:11, 3985.14it/s]


### Create the Embedding Matrix

In [74]:
embedding_dim = 300
hits = 0
misses = 0
missing = []

vocab_words = list(id2word.items())
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for i, tok in tqdm(vocab_words):
    embedding_vector = embed_index.get(tok)
    if embedding_vector is not None and embedding_vector.shape[0] == 300:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        missing.append(tok)
        misses += 1

print(f'Found vectors for {hits} words.')
print(f'Count not find vectors for {misses} words.')
print(f'In total found vectors for {hits/vocab_size}% of the words')

100%|████████████████████████████████████████████████████████████████████████| 206356/206356 [00:22<00:00, 8995.77it/s]

Found vectors for 84714 words.
Count not find vectors for 121642 words.
In total found vectors for 0.410523561224292% of the words





In [64]:
print(missing[-1])

𝕍m
