# Small corpus exploration

How to process text?
- Casing
- Non-word and sentence structure tokens
- Punctuation (including quotations, hyphens, and apostrophes)
- Typos
- Digits
- Empty strings

Final corpus unique word count: 9944

In [1]:
import re

In [None]:
def load_gutenberg_text(url):
    import urllib.request
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("utf-8", errors="ignore")


In [None]:
URL = "https://www.gutenberg.org/files/98/98-0.txt"
text = load_gutenberg_text(URL)

## Text corpus without non-letter characters

In [91]:
WORD_RE = re.compile(r"[A-Za-z]+")

def tokenize(text):
    out = []
    for m in WORD_RE.finditer(text):
        out.append(m.group(0).lower())

    return out

all_words_text = tokenize(text)
unique_words_text = set(all_words_text)

unique_words_text = list(unique_words_text)
unique_words_text.sort()
unique_words_text


['a',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abashed',
 'abate',
 'abated',
 'abbaye',
 'abed',
 'abhorrence',
 'abide',
 'abided',
 'abiding',
 'abilities',
 'ability',
 'abject',
 'ablaze',
 'able',
 'abnegating',
 'aboard',
 'abode',
 'abolished',
 'abolishing',
 'abolition',
 'abominable',
 'abounding',
 'about',
 'above',
 'abreast',
 'abridge',
 'abroad',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absolving',
 'absorbed',
 'absorption',
 'abstractedly',
 'abstraction',
 'absurd',
 'abundance',
 'abundant',
 'abuse',
 'abused',
 'abyss',
 'abyssinia',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'acceptation',
 'accepted',
 'accepting',
 'access',
 'accessed',
 'accessible',
 'accessories',
 'accident',
 'accidental',
 'accidentally',
 'acclamation',
 'acclamations',
 'accommodation',
 'accompanied',
 'accompaniment',
 'accompany',
 'accompanying',
 'accomplices',
 'accomplished',
 'accomplishing',
 'accomplishme

In [92]:
# Quick manual search yields many words with similar stems --> stemming?
print(unique_words_text[95:100])
print(unique_words_text[100:104])
print(unique_words_text[131:135])
print(unique_words_text[135:139])


['achieve', 'achieved', 'achievement', 'achievements', 'achieving']
['acknowledge', 'acknowledged', 'acknowledgment', 'acknowledgments']
['addition', 'additional', 'additionally', 'additions']
['address', 'addressed', 'addresses', 'addressing']


## Apostrophes

Only 3 words with apostrophes

In [97]:
WORD_RE = re.compile(r"[A-Za-z0-9]+('[A-Za-z0-9]+)")

def tokenize(text):
    out = []
    for m in WORD_RE.finditer(text):
        out.append(m.group(0).lower())

    return out

all_words_apos = tokenize(text)
unique_words_apos = set(all_words_apos)

unique_words_apos = list(unique_words_apos)
unique_words_apos.sort()
unique_words_apos


["foundation's", "state's", "tm's"]

## Digits

No sensical digits --> can omit.

In [93]:
WORD_RE = re.compile(r"\b(?=\w*\d)\w+\b")

def tokenize(text):
    out = []
    for m in WORD_RE.finditer(text):
        out.append(m.group(0).lower())

    return out

all_words_digits = tokenize(text)
unique_words_digits = set(all_words_digits)

unique_words_digits

{'0',
 '000',
 '1',
 '1500',
 '1757',
 '1767',
 '1792',
 '1887',
 '1994',
 '2',
 '20',
 '2001',
 '2020',
 '21',
 '3',
 '30',
 '4',
 '5',
 '50',
 '501',
 '596',
 '6',
 '60',
 '6221541',
 '64',
 '7',
 '8',
 '801',
 '809',
 '84116',
 '9',
 '90',
 '98'}

## Hyphenated words
Hyphenated words may merit more attention, but for now we will split them into separate words.

In [99]:
WORD_RE = re.compile(r"\b\w+-\w+\b")

def tokenize(text):
    out = []
    for m in WORD_RE.finditer(text):
        out.append(m.group(0).lower())

    return out

all_words_hyphen = tokenize(text)
unique_words_hyphen = set(all_words_hyphen)
unique_words_hyphen

{'596-1887',
 '64-6221541',
 '98-0',
 'a-a',
 'a-business',
 'a-buzz',
 'a-tiptoe',
 'al-ways',
 'alarm-bells',
 'ale-house',
 'ale-houses',
 'almost-child',
 'always-vain',
 'ante-chambers',
 'anti-climax',
 'area-railings',
 'arm-chair',
 'arm-chest',
 'as-is',
 'attorney-general',
 'awe-stricken',
 'b-u',
 'bachelor-fashion',
 'bank-notes',
 'banking-house',
 'bare-armed',
 'bare-breasted',
 'bare-foot',
 'bare-legged',
 'bear-leader',
 'bed-chamber',
 'bed-gown',
 'bed-winches',
 'beer-drinking',
 'bell-ringing',
 'bench-walk',
 'black-haired',
 'blood-money',
 'blood-vessels',
 'bloody-minded',
 'blotting-paper',
 'blue-flies',
 'blue-mould',
 'boar-spears',
 'body-women',
 'boot-cleaning',
 'branding-iron',
 'bread-and',
 'breakfast-hour',
 'breakfast-table',
 'broken-backed',
 'broken-hearted',
 'brown-haired',
 'bull-dog',
 'bull-necked',
 'burial-ground',
 'burial-place',
 'burial-places',
 'business-absorption',
 'business-like',
 'business-meaning',
 'butt-end',
 'butt-ends'

## Final processing

No digits, no punctuation, all lowercase, hyphenated words separated, no empty strings, no sentence structure tokens, typos included and induced by processing (e.g., linebreak typos).

In [105]:
WORD_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")

def tokenize(text):
    out = []
    for m in WORD_RE.finditer(text):
        out.append(m.group(0).lower())

    return out

all_words = tokenize(text)
unique_words = set(all_words)
unique_words

{'promises',
 'constituted',
 'sir',
 'abuse',
 'display',
 'countless',
 'imitation',
 'credit',
 'calculated',
 'expense',
 'de',
 'sad',
 'exposed',
 'goodwill',
 'wielders',
 'hall',
 'monsters',
 'dormer',
 'florid',
 'purveyors',
 'streaming',
 'honoured',
 'alternate',
 'defarge',
 'haughty',
 'questioned',
 'impartially',
 'loathsome',
 'strong',
 'individual',
 'bachelor',
 'guillotine',
 'joined',
 'sheets',
 'giants',
 'perpetuation',
 'by',
 'indeed',
 'fancies',
 'screaming',
 'footstep',
 'society',
 'talk',
 'winding',
 'inflating',
 'project',
 'lasts',
 'method',
 'interment',
 'amidst',
 'tortures',
 'turbulently',
 'pulleys',
 'porter',
 'exaggerated',
 'injury',
 'assorted',
 'instead',
 'spade',
 'threw',
 'dialogue',
 'reading',
 'rescue',
 'desolately',
 'unfounded',
 'murmured',
 'hauled',
 'cant',
 'omened',
 'singly',
 'repast',
 'proper',
 'smoked',
 'lame',
 'lawyer',
 'tri',
 'named',
 'repose',
 'saucers',
 'grimness',
 'lands',
 'names',
 'beating',
 'doo

In [106]:
len(unique_words)

9944