# Modules

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_colwidth=1000
import matplotlib.pyplot as plt
#plt.style.use('dark_background')
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, NLTKWordTokenizer, wordpunct_tokenize
from nltk.tokenize.regexp import RegexpTokenizer, WordPunctTokenizer, WhitespaceTokenizer, BlanklineTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import os
import sys

# Directory that contains the local `mcr` package. The historical location is
# kept as the default, but it can be overridden with the MCR_PATH environment
# variable so the notebook is not tied to a single machine's directory layout.
MCR_PATH = os.environ.get('MCR_PATH', '/home/mauricio/code/mcr')
sys.path.insert(1, MCR_PATH)
from mcr.nlp import token_count, CUSTOM_WORD_TOKENIZER_REGEX, SKLEARN_WORD_TOKENIZER_REGEX

from mcr.nlp import WordTokenizer, SentenceTokenizer
from mcr.nlp import sentence_count, word_count

In [2]:
# Sample corpus assembled from three fragments; each fragment exercises a
# different tokenization corner case (abbreviations, nested quotes/brackets,
# and hard line breaks with a blank line).
abbreviations_part = '''
Punkt knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.  And sometimes sentences
can start with non-capitalized words.  i is a good variable
name.
'''
brackets_part = '''
(How does it deal with this parenthesis?)  "It should be part of the
previous sentence." "(And the same with this one.)" ('And this one!')
"('(And (this)) '?)" [(and this. )]
'''
muffins_part = 'Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks.'

text = ''.join([abbreviations_part, brackets_part, muffins_part])
print(text)


Punkt knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.  And sometimes sentences
can start with non-capitalized words.  i is a good variable
name.

(How does it deal with this parenthesis?)  "It should be part of the
previous sentence." "(And the same with this one.)" ('And this one!')
"('(And (this)) '?)" [(and this. )]
Good muffins cost $3.88
in New York.  Please buy me
two of them.

Thanks.


# Tokenizers

## Sentences

Based on `nltk.sent_tokenize` (the pre-trained `PunktSentenceTokenizer`).

In [3]:
# Reference result: the project's SentenceTokenizer with default arguments.
sentences = SentenceTokenizer().tokenize(text)
display(sentences)

# Alternative spellings that are expected to give the same sentences. The list
# is built eagerly so that every variant is actually executed.
sentence_variants = [
    SentenceTokenizer('english').tokenize(text),
    SentenceTokenizer(language='english').tokenize(text),
    sent_tokenize(text),
    sent_tokenize(text, 'english'),
    sent_tokenize(text, language='english'),
]
all(sentences == variant for variant in sentence_variants)

['\nPunkt knows that the periods in Mr. Smith and Johann S. Bach\ndo not mark sentence boundaries.',
 'And sometimes sentences\ncan start with non-capitalized words.',
 'i is a good variable\nname.',
 '(How does it deal with this parenthesis?)',
 '"It should be part of the\nprevious sentence."',
 '"(And the same with this one.)"',
 "('And this one!')",
 '"(\'(And (this)) \'?)"',
 '[(and this. )]',
 'Good muffins cost $3.88\nin New York.',
 'Please buy me\ntwo of them.',
 'Thanks.']

True

## Blank line paragraphs

In [4]:
# Sentence splitting is disabled in WordTokenizer (preserve_line=True) and a
# BlanklineTokenizer is plugged in to separate the text into paragraphs.
paragraphs = WordTokenizer(tokenizer=BlanklineTokenizer(), preserve_line=True).tokenize(text)
display(paragraphs)

# The wrapped tokenizer is checked against the bare NLTK BlanklineTokenizer.
paragraphs == BlanklineTokenizer().tokenize(text)

['\nPunkt knows that the periods in Mr. Smith and Johann S. Bach\ndo not mark sentence boundaries.  And sometimes sentences\ncan start with non-capitalized words.  i is a good variable\nname.',
 '(How does it deal with this parenthesis?)  "It should be part of the\nprevious sentence." "(And the same with this one.)" (\'And this one!\')\n"(\'(And (this)) \'?)" [(and this. )]\nGood muffins cost $3.88\nin New York.  Please buy me\ntwo of them.',
 'Thanks.']

True

## Words from Sentences

### Regular expressions

In [5]:
# Equivalent spellings of the regex-based word tokenizer: the defaults, the
# explicit custom pattern, and an explicit NLTK RegexpTokenizer.
default_words = WordTokenizer().tokenize(text)
pattern_words = WordTokenizer(pattern=CUSTOM_WORD_TOKENIZER_REGEX).tokenize(text)
regexp_words = WordTokenizer(tokenizer=RegexpTokenizer(CUSTOM_WORD_TOKENIZER_REGEX)).tokenize(text)
print(regexp_words)

# Same RegexpTokenizer, this time with gaps=False passed explicitly.
regexp_no_gaps_words = WordTokenizer(
    tokenizer=RegexpTokenizer(CUSTOM_WORD_TOKENIZER_REGEX, gaps=False)
).tokenize(text)

# Every variant is compared with the default result.
all(
    default_words == variant
    for variant in [pattern_words, regexp_words, regexp_no_gaps_words]
)

['Punkt', 'knows', 'that', 'the', 'periods', 'in', 'Mr', 'Smith', 'and', 'Johann', 'S', 'Bach', 'do', 'not', 'mark', 'sentence', 'boundaries', 'And', 'sometimes', 'sentences', 'can', 'start', 'with', 'non', 'capitalized', 'words', 'i', 'is', 'a', 'good', 'variable', 'name', 'How', 'does', 'it', 'deal', 'with', 'this', 'parenthesis', 'It', 'should', 'be', 'part', 'of', 'the', 'previous', 'sentence', 'And', 'the', 'same', 'with', 'this', 'one', 'And', 'this', 'one', 'And', 'this', 'and', 'this', 'Good', 'muffins', 'cost', '3', '88', 'in', 'New', 'York', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks']


True

### NLTKWordTokenizer/TreebankWordTokenizer

In [6]:
# NLTKWordTokenizer plugged into the project's WordTokenizer wrapper.
treebank_words = WordTokenizer(tokenizer=NLTKWordTokenizer()).tokenize(text)
print(treebank_words)

# The wrapper is checked against nltk.word_tokenize on the same text.
word_tokenize(text) == treebank_words

['Punkt', 'knows', 'that', 'the', 'periods', 'in', 'Mr.', 'Smith', 'and', 'Johann', 'S.', 'Bach', 'do', 'not', 'mark', 'sentence', 'boundaries', '.', 'And', 'sometimes', 'sentences', 'can', 'start', 'with', 'non-capitalized', 'words', '.', 'i', 'is', 'a', 'good', 'variable', 'name', '.', '(', 'How', 'does', 'it', 'deal', 'with', 'this', 'parenthesis', '?', ')', '``', 'It', 'should', 'be', 'part', 'of', 'the', 'previous', 'sentence', '.', "''", '``', '(', 'And', 'the', 'same', 'with', 'this', 'one', '.', ')', "''", '(', "'And", 'this', 'one', '!', "'", ')', '``', '(', "'", '(', 'And', '(', 'this', ')', ')', "'", '?', ')', "''", '[', '(', 'and', 'this', '.', ')', ']', 'Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']


True

## Words without Sentences

### Regular expressions

In [7]:
# Same three spellings of the regex-based word tokenizer, now with
# preserve_line=True on every WordTokenizer.
default_line_words = WordTokenizer(preserve_line=True).tokenize(text)
pattern_line_words = WordTokenizer(
    pattern=CUSTOM_WORD_TOKENIZER_REGEX, preserve_line=True
).tokenize(text)
regexp_line_words = WordTokenizer(
    tokenizer=RegexpTokenizer(CUSTOM_WORD_TOKENIZER_REGEX), preserve_line=True
).tokenize(text)
print(regexp_line_words)

# Same RegexpTokenizer, this time with gaps=False passed explicitly.
regexp_no_gaps_line_words = WordTokenizer(
    tokenizer=RegexpTokenizer(CUSTOM_WORD_TOKENIZER_REGEX, gaps=False), preserve_line=True
).tokenize(text)

# Every variant is compared with the default preserve_line=True result.
all(
    default_line_words == variant
    for variant in [pattern_line_words, regexp_line_words, regexp_no_gaps_line_words]
)

['Punkt', 'knows', 'that', 'the', 'periods', 'in', 'Mr', 'Smith', 'and', 'Johann', 'S', 'Bach', 'do', 'not', 'mark', 'sentence', 'boundaries', 'And', 'sometimes', 'sentences', 'can', 'start', 'with', 'non', 'capitalized', 'words', 'i', 'is', 'a', 'good', 'variable', 'name', 'How', 'does', 'it', 'deal', 'with', 'this', 'parenthesis', 'It', 'should', 'be', 'part', 'of', 'the', 'previous', 'sentence', 'And', 'the', 'same', 'with', 'this', 'one', 'And', 'this', 'one', 'And', 'this', 'and', 'this', 'Good', 'muffins', 'cost', '3', '88', 'in', 'New', 'York', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks']


True

### NLTKWordTokenizer/TreebankWordTokenizer

In [8]:
# NLTKWordTokenizer inside WordTokenizer with preserve_line=True.
treebank_line_words = WordTokenizer(tokenizer=NLTKWordTokenizer(), preserve_line=True).tokenize(text)
print(treebank_line_words)

# The wrapper is checked against calling NLTKWordTokenizer directly on the text.
NLTKWordTokenizer().tokenize(text) == treebank_line_words

['Punkt', 'knows', 'that', 'the', 'periods', 'in', 'Mr.', 'Smith', 'and', 'Johann', 'S.', 'Bach', 'do', 'not', 'mark', 'sentence', 'boundaries.', 'And', 'sometimes', 'sentences', 'can', 'start', 'with', 'non-capitalized', 'words.', 'i', 'is', 'a', 'good', 'variable', 'name.', '(', 'How', 'does', 'it', 'deal', 'with', 'this', 'parenthesis', '?', ')', '``', 'It', 'should', 'be', 'part', 'of', 'the', 'previous', 'sentence.', "''", '``', '(', 'And', 'the', 'same', 'with', 'this', 'one.', ')', "''", '(', "'And", 'this', 'one', '!', "'", ')', "''", '(', "'", '(', 'And', '(', 'this', ')', ')', "'", '?', ')', "''", '[', '(', 'and', 'this.', ')', ']', 'Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']


True

# Tokenization count

In [9]:
# Token frequencies with token_count's default arguments (the default custom
# sentence and word tokenizer). The printed mapping has lower-cased tokens as
# keys and their number of occurrences as values.
print(token_count(text))

{'and': 6, 'this': 5, 'the': 3, 'with': 3, 'good': 2, 'in': 2, 'it': 2, 'of': 2, 'one': 2, 'sentence': 2, '3': 1, '88': 1, 'a': 1, 'bach': 1, 'be': 1, 'boundaries': 1, 'buy': 1, 'can': 1, 'capitalized': 1, 'cost': 1, 'deal': 1, 'do': 1, 'does': 1, 'how': 1, 'i': 1, 'is': 1, 'johann': 1, 'knows': 1, 'mark': 1, 'me': 1, 'mr': 1, 'muffins': 1, 'name': 1, 'new': 1, 'non': 1, 'not': 1, 'parenthesis': 1, 'part': 1, 'periods': 1, 'please': 1, 'previous': 1, 'punkt': 1, 's': 1, 'same': 1, 'sentences': 1, 'should': 1, 'smith': 1, 'sometimes': 1, 'start': 1, 'thanks': 1, 'that': 1, 'them': 1, 'two': 1, 'variable': 1, 'words': 1, 'york': 1}


In [10]:
# No sentence splitting (tokenizer=None), tokens of 1+ characters.
# For this sample text the printed counts match the default call.
print(token_count(text, tokenizer=None))

{'and': 6, 'this': 5, 'the': 3, 'with': 3, 'good': 2, 'in': 2, 'it': 2, 'of': 2, 'one': 2, 'sentence': 2, '3': 1, '88': 1, 'a': 1, 'bach': 1, 'be': 1, 'boundaries': 1, 'buy': 1, 'can': 1, 'capitalized': 1, 'cost': 1, 'deal': 1, 'do': 1, 'does': 1, 'how': 1, 'i': 1, 'is': 1, 'johann': 1, 'knows': 1, 'mark': 1, 'me': 1, 'mr': 1, 'muffins': 1, 'name': 1, 'new': 1, 'non': 1, 'not': 1, 'parenthesis': 1, 'part': 1, 'periods': 1, 'please': 1, 'previous': 1, 'punkt': 1, 's': 1, 'same': 1, 'sentences': 1, 'should': 1, 'smith': 1, 'sometimes': 1, 'start': 1, 'thanks': 1, 'that': 1, 'them': 1, 'two': 1, 'variable': 1, 'words': 1, 'york': 1}


In [11]:
# No sentence splitting, tokens of 2+ characters (as sklearn does).
# Single-character tokens such as 'a', 'i', 's' and '3' are therefore absent
# from the printed counts.
print(token_count(text, tokenizer=None, token_pattern=SKLEARN_WORD_TOKENIZER_REGEX))

{'and': 6, 'this': 5, 'the': 3, 'with': 3, 'good': 2, 'in': 2, 'it': 2, 'of': 2, 'one': 2, 'sentence': 2, '88': 1, 'bach': 1, 'be': 1, 'boundaries': 1, 'buy': 1, 'can': 1, 'capitalized': 1, 'cost': 1, 'deal': 1, 'do': 1, 'does': 1, 'how': 1, 'is': 1, 'johann': 1, 'knows': 1, 'mark': 1, 'me': 1, 'mr': 1, 'muffins': 1, 'name': 1, 'new': 1, 'non': 1, 'not': 1, 'parenthesis': 1, 'part': 1, 'periods': 1, 'please': 1, 'previous': 1, 'punkt': 1, 'same': 1, 'sentences': 1, 'should': 1, 'smith': 1, 'sometimes': 1, 'start': 1, 'thanks': 1, 'that': 1, 'them': 1, 'two': 1, 'variable': 1, 'words': 1, 'york': 1}


In [12]:
# No sentence splitting, NLTKWordTokenizer as the tokenizer.
# Punctuation marks are counted as tokens, and some periods stay attached to
# the preceding word (e.g. 'boundaries.', 'york.').
print(token_count(text, tokenizer=NLTKWordTokenizer()))

{'(': 7, ')': 7, 'and': 5, "''": 4, 'this': 4, "'": 3, 'the': 3, 'with': 3, '?': 2, '``': 2, 'good': 2, 'in': 2, 'it': 2, 'of': 2, '!': 1, '$': 1, "'and": 1, '.': 1, '3.88': 1, '[': 1, ']': 1, 'a': 1, 'bach': 1, 'be': 1, 'boundaries.': 1, 'buy': 1, 'can': 1, 'cost': 1, 'deal': 1, 'do': 1, 'does': 1, 'how': 1, 'i': 1, 'is': 1, 'johann': 1, 'knows': 1, 'mark': 1, 'me': 1, 'mr.': 1, 'muffins': 1, 'name.': 1, 'new': 1, 'non-capitalized': 1, 'not': 1, 'one': 1, 'one.': 1, 'parenthesis': 1, 'part': 1, 'periods': 1, 'please': 1, 'previous': 1, 'punkt': 1, 's.': 1, 'same': 1, 'sentence': 1, 'sentence.': 1, 'sentences': 1, 'should': 1, 'smith': 1, 'sometimes': 1, 'start': 1, 'thanks': 1, 'that': 1, 'them.': 1, 'this.': 1, 'two': 1, 'variable': 1, 'words.': 1, 'york.': 1}


In [13]:
# No sentence splitting, WordPunctTokenizer as the tokenizer.
# Runs of punctuation show up as tokens of their own (e.g. '.', '))', '[(').
print(token_count(text, tokenizer=WordPunctTokenizer()))

{'.': 10, 'and': 6, 'this': 5, 'the': 3, 'with': 3, '(': 2, 'good': 2, 'in': 2, 'it': 2, 'of': 2, 'one': 2, 'sentence': 2, "!')": 1, '"': 1, '"(': 1, '"(\'(': 1, '$': 1, '\'?)"': 1, "('": 1, '))': 1, ')]': 1, '-': 1, '."': 1, '.)"': 1, '3': 1, '88': 1, '?)': 1, '[(': 1, 'a': 1, 'bach': 1, 'be': 1, 'boundaries': 1, 'buy': 1, 'can': 1, 'capitalized': 1, 'cost': 1, 'deal': 1, 'do': 1, 'does': 1, 'how': 1, 'i': 1, 'is': 1, 'johann': 1, 'knows': 1, 'mark': 1, 'me': 1, 'mr': 1, 'muffins': 1, 'name': 1, 'new': 1, 'non': 1, 'not': 1, 'parenthesis': 1, 'part': 1, 'periods': 1, 'please': 1, 'previous': 1, 'punkt': 1, 's': 1, 'same': 1, 'sentences': 1, 'should': 1, 'smith': 1, 'sometimes': 1, 'start': 1, 'thanks': 1, 'that': 1, 'them': 1, 'two': 1, 'variable': 1, 'words': 1, 'york': 1}


In [14]:
# No sentence splitting, WhitespaceTokenizer as the tokenizer.
# Punctuation stays attached to the neighbouring word (e.g. '$3.88', 'thanks.').
print(token_count(text, tokenizer=WhitespaceTokenizer()))

{'the': 3, 'this': 3, 'with': 3, 'and': 2, 'good': 2, 'in': 2, 'of': 2, '"(\'(and': 1, '"(and': 1, '"it': 1, '$3.88': 1, '\'?)"': 1, "('and": 1, '(how': 1, '(this))': 1, ')]': 1, '[(and': 1, 'a': 1, 'bach': 1, 'be': 1, 'boundaries.': 1, 'buy': 1, 'can': 1, 'cost': 1, 'deal': 1, 'do': 1, 'does': 1, 'i': 1, 'is': 1, 'it': 1, 'johann': 1, 'knows': 1, 'mark': 1, 'me': 1, 'mr.': 1, 'muffins': 1, 'name.': 1, 'new': 1, 'non-capitalized': 1, 'not': 1, "one!')": 1, 'one.)"': 1, 'parenthesis?)': 1, 'part': 1, 'periods': 1, 'please': 1, 'previous': 1, 'punkt': 1, 's.': 1, 'same': 1, 'sentence': 1, 'sentence."': 1, 'sentences': 1, 'should': 1, 'smith': 1, 'sometimes': 1, 'start': 1, 'thanks.': 1, 'that': 1, 'them.': 1, 'this.': 1, 'two': 1, 'variable': 1, 'words.': 1, 'york.': 1}


In [15]:
# No sentence splitting, BlanklineTokenizer as the tokenizer.
# Each blank-line-separated paragraph is counted as a single "token".
print(token_count(text, tokenizer=BlanklineTokenizer()))

{'\npunkt knows that the periods in mr. smith and johann s. bach\ndo not mark sentence boundaries.  and sometimes sentences\ncan start with non-capitalized words.  i is a good variable\nname.': 1, '(how does it deal with this parenthesis?)  "it should be part of the\nprevious sentence." "(and the same with this one.)" (\'and this one!\')\n"(\'(and (this)) \'?)" [(and this. )]\ngood muffins cost $3.88\nin new york.  please buy me\ntwo of them.': 1, 'thanks.': 1}


In [16]:
# Just sentences: SentenceTokenizer is passed as the tokenizer, so each
# sentence is counted as a single item.
# NOTE(review): language='portuguese' is applied to English sample text here;
# confirm this is intentional and not a leftover from another experiment.
print(token_count(text, tokenizer=SentenceTokenizer(language='portuguese')))

{'\npunkt knows that the periods in mr. smith and johann s. bach\ndo not mark sentence boundaries.': 1, '"(\'(and (this)) \'?)"': 1, '"(and the same with this one.)"': 1, '"it should be part of the\nprevious sentence."': 1, "('and this one!')": 1, '(how does it deal with this parenthesis?)': 1, '[(and this. )]': 1, 'and sometimes sentences\ncan start with non-capitalized words.': 1, 'good muffins cost $3.88\nin new york.': 1, 'i is a good variable\nname.': 1, 'please buy me\ntwo of them.': 1, 'thanks.': 1}


# Simple sentence and word count

In [17]:
# Sentence count for the sample text, using sentence_count's default arguments.
sentence_count(text)

12

In [18]:
# Word count for the sample text, using word_count's default arguments.
word_count(text)

75

# Corpus statistics

In [19]:
from mcr.nlp import statistics

In [20]:
# Corpus statistics for the sample text, transposed into a single-row table.
# All columns are first formatted as thousands-separated integers; the two
# percentage columns are then re-formatted with two decimals.
(
    statistics(text)
    .to_frame()
    .T
    .style
    .format('{:,.0f}')
    .format('{:.2f}', subset=['fill %', 'unique %'])
)

Unnamed: 0,rows,documents,fill %,unique documents,unique %,sentences,sentences / document,words,words / document,words / sentence,unique words,characters,characters / document,chars / sentence
0,1,1,100.0,1,100.0,12,12,75,75,6,56,435,435,36
