# Start

In [1]:
import math
import pandas as pd
import numpy as np
import re
from matplotlib import pyplot as plt
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer # (sent_tokenize)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer

SKLEARN_WORD_TOKENIZER_REGEX = r'(?u)\b\w\w+\b'  # scikit-learn's default: tokens of 2+ word characters
CUSTOM_WORD_TOKENIZER_REGEX = r'(?u)\b\w+\b'  # custom: tokens of 1+ word characters

# End-of-sentence pattern: an optional non-word character, any whitespace, an optional \r,
# one or more \n, any whitespace, and then any run of non-word characters.
eos_pattern = re.compile(r'(?u)\W?\s*\r?\n+\s*' + r'\W*')
# End-of-sentence punctuation pattern: one or more "." or "!" or "?" at the end of the string.
eosp_pattern = re.compile(r'(?u)\.+$|!+$|\?+$')

# custom_sentence_tokenizer()

In [48]:
def custom_sentence_tokenizer(document):
    """Split ``document`` into sentences, treating line breaks as sentence ends.

    Every match of ``eos_pattern`` (a line break together with the whitespace
    and punctuation around it) is replaced with ``'. '``; the result is split
    with ``PunktSentenceTokenizer`` and the trailing punctuation matched by
    ``eosp_pattern`` is removed from each sentence, so the same sentence
    written with or without final punctuation yields the same token.

    Word capitalization is preserved. An earlier second pass (capitalize every
    word, re-tokenize, lower-case) is intentionally not applied because it
    destroyed the original capitalization, acronyms included.

    NOTE: ``eosp_pattern`` removes a single trailing run of one punctuation
    character, so a mixed ending such as ``"what?!"`` becomes ``"what?"``.

    Args:
        document: The text to split (``str``).

    Returns:
        A list of sentence strings without their final punctuation. Sentences
        that consisted only of final punctuation are dropped instead of being
        returned as empty strings.
    """
    # Turn line breaks into explicit periods so Punkt treats them as sentence boundaries.
    normalized = eos_pattern.sub('. ', document)
    # Strip final punctuation so "a b." and "a b" count as the same sentence.
    stripped = (eosp_pattern.sub('', sentence)
                for sentence in PunktSentenceTokenizer().tokenize(normalized))
    # A sentence made only of final punctuation (e.g. "?") strips down to '';
    # skip it rather than emit an empty token.
    return [sentence for sentence in stripped if sentence]

## debugging

In [49]:
# Alternative sample made of plain words; assign it to `document` to experiment with it.
document_words = 'one\n. one two.\n one two three\n one two three four.'
# Sample used by the cells below: an abbreviation ("Mr."), line breaks with and without
# final punctuation, and a trailing ellipsis.
document = 'Mr. 4\n. 3 4.\n 2 3 4\n 1 2 3 4...'
# Reuse the eos_pattern compiled in the first cell (the one custom_sentence_tokenizer reads)
# instead of re-compiling a copy here, which would rebind the tokenizer's pattern.
display(document)
display(eos_pattern.findall(document))
eos_pattern.sub('. ', document)

'Mr. 4\n. 3 4.\n 2 3 4\n 1 2 3 4...'

['\n. ', '.\n ', '\n ']

'Mr. 4. 3 4. 2 3 4. 1 2 3 4...'

In [50]:
# Step 1: replace line breaks with periods, then let Punkt split the sentences.
sentences = PunktSentenceTokenizer().tokenize(
    eos_pattern.sub('. ', document)
)
sentences

['Mr.', '4.', '3 4.', '2 3 4.', '1 2 3 4...']

In [51]:
# Step 2: strip the trailing ".", "!" or "?" run from every sentence.
sentences = [
    eosp_pattern.sub('', punkt_sentence)
    for punkt_sentence in sentences
]
sentences

['Mr', '4', '3 4', '2 3 4', '1 2 3 4']

In [52]:
# sentences = [PunktSentenceTokenizer().tokenize(' '.join(word.capitalize() for word in sentence.split())) for sentence in sentences]; sentences

In [53]:
# sentences = [sentence.lower() for sub_sentence in sentences for sentence in sub_sentence]; sentences

In [55]:
# Disabled: second punctuation-stripping pass, only meaningful after the (also disabled)
# re-tokenizing steps in the two cells above.
# sentences = [eosp_pattern.sub('', sentence) for sentence in sentences]
sentences

['Mr', '4', '3 4', '2 3 4', '1 2 3 4']

## testing

In [56]:
# The function should reproduce the result of the step-by-step debugging cells above.
custom_sentence_tokenizer(document)

['Mr', '4', '3 4', '2 3 4', '1 2 3 4']

In [72]:
import sys
# Presumably these paths expose the project's `nlp` module so its tokenizers can replace the
# notebook-local ones; the import itself is currently disabled.
sys.path.insert(1, '../../machine-learning')
sys.path.insert(1, '../../homewise/ds-core/')
#from nlp import custom_word_tokenizer, custom_sentence_tokenizer

# Sentence-level features: every sentence returned by custom_sentence_tokenizer is one token.
# CountVectorizer (imported in the first cell) ignores token_pattern whenever a tokenizer
# callable is given, so pass None to make that explicit and avoid the
# "token_pattern will not be used" warning.
vec = CountVectorizer(tokenizer=custom_sentence_tokenizer, token_pattern=None, lowercase=False, ngram_range=(1, 1))
tdm = vec.fit_transform([document])
display(pd.DataFrame(tdm.toarray(), columns=vec.get_feature_names_out()))

Unnamed: 0,1 2 3 4,2 3 4,3 4,4,Mr
0,1,1,1,1,1


In [83]:
# Sentence-level n-grams (1 to 5 consecutive sentences).
# token_pattern is ignored when a tokenizer callable is supplied; None states that explicitly
# and avoids scikit-learn's "token_pattern will not be used" warning.
vec = CountVectorizer(tokenizer=custom_sentence_tokenizer, token_pattern=None, lowercase=False, ngram_range=(1, 5))
tdm = vec.fit_transform([document])
display(pd.DataFrame(tdm.toarray(), columns=vec.get_feature_names_out()))

Unnamed: 0,1 2 3 4,2 3 4,2 3 4 1 2 3 4,3 4,3 4 2 3 4,3 4 2 3 4 1 2 3 4,4,4 3 4,4 3 4 2 3 4,4 3 4 2 3 4 1 2 3 4,Mr,Mr 4,Mr 4 3 4,Mr 4 3 4 2 3 4,Mr 4 3 4 2 3 4 1 2 3 4
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [84]:
from nltk.tokenize import sent_tokenize
# Baseline for comparison: NLTK's stock sent_tokenize on the raw document.
# token_pattern is ignored when a tokenizer callable is supplied; None states that explicitly
# and avoids scikit-learn's "token_pattern will not be used" warning.
vec = CountVectorizer(tokenizer=sent_tokenize, token_pattern=None, lowercase=False, ngram_range=(1, 5))
tdm = vec.fit_transform([document])
display(pd.DataFrame(tdm.toarray(), columns=vec.get_feature_names_out()))

Unnamed: 0,2 3 4\n 1 2 3 4...,3 4.,3 4. 2 3 4\n 1 2 3 4...,Mr. 4\n.,Mr. 4\n. 3 4.,Mr. 4\n. 3 4. 2 3 4\n 1 2 3 4...
0,1,1,1,1,1,1


# custom_word_tokenizer()

In [95]:
def custom_word_tokenizer(document):
    """Split ``document`` into word tokens, one sentence at a time.

    The document is first split with ``custom_sentence_tokenizer``; each
    sentence is then tokenized with ``CUSTOM_WORD_TOKENIZER_REGEX`` (runs of
    one or more word characters).

    Args:
        document: The text to tokenize (``str``).

    Returns:
        A flat list of word tokens in document order.
    """
    # Build the regexp tokenizer once instead of once per sentence.
    word_tokenizer = RegexpTokenizer(CUSTOM_WORD_TOKENIZER_REGEX)
    return [token
            for sentence in custom_sentence_tokenizer(document)
            for token in word_tokenizer.tokenize(sentence)]

## debugging

In [96]:
# Sentences as produced by the custom tokenizer.
list(custom_sentence_tokenizer(document))

['Mr', '4', '3 4', '2 3 4', '1 2 3 4']

In [100]:
# Sentences as produced by Punkt alone on the raw document, for comparison.
list(PunktSentenceTokenizer().tokenize(document))

['Mr.', '4\n.', '3 4.', '2 3 4\n 1 2 3 4...']

In [97]:
# Word tokens of every custom sentence, flattened in document order.
[
    word
    for part in custom_sentence_tokenizer(document)
    for word in RegexpTokenizer(CUSTOM_WORD_TOKENIZER_REGEX).tokenize(part)
]

['Mr', '4', '3', '4', '2', '3', '4', '1', '2', '3', '4']

## testing

In [98]:
# The function should reproduce the flattened token list from the debugging cell above.
custom_word_tokenizer(document)

['Mr', '4', '3', '4', '2', '3', '4', '1', '2', '3', '4']

In [99]:
# Word-level unigram counts using the custom word tokenizer.
# The default token_pattern is unused once a tokenizer callable is supplied; passing None
# states that explicitly and avoids scikit-learn's "token_pattern will not be used" warning.
vec = CountVectorizer(tokenizer=custom_word_tokenizer, token_pattern=None, lowercase=False, ngram_range=(1, 1))
tdm = vec.fit_transform([document])
display(pd.DataFrame(tdm.toarray(), columns=vec.get_feature_names_out()))

Unnamed: 0,1,2,3,4,Mr
0,1,2,3,4,1


In [92]:
# Word-level n-grams (1 to 5 consecutive words) using the custom word tokenizer.
# The default token_pattern is unused once a tokenizer callable is supplied; passing None
# states that explicitly and avoids scikit-learn's "token_pattern will not be used" warning.
vec = CountVectorizer(tokenizer=custom_word_tokenizer, token_pattern=None, lowercase=False, ngram_range=(1, 5))
tdm = vec.fit_transform([document])
display(pd.DataFrame(tdm.toarray(), columns=vec.get_feature_names_out()))

Unnamed: 0,1,1 2,1 2 3,1 2 3 4,2,2 3,2 3 4,2 3 4 1,2 3 4 1 2,3,...,4 2 3 4 1,4 3,4 3 4,4 3 4 2,4 3 4 2 3,Mr,Mr 4,Mr 4 3,Mr 4 3 4,Mr 4 3 4 2
0,1,1,1,1,2,2,2,1,1,3,...,1,1,1,1,1,1,1,1,1,1


In [93]:
from nltk.tokenize import word_tokenize
# Baseline for comparison: NLTK's stock word_tokenize, which keeps punctuation as tokens.
# token_pattern is ignored when a tokenizer callable is supplied; None states that explicitly
# and avoids scikit-learn's "token_pattern will not be used" warning.
vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None, lowercase=False, ngram_range=(1, 5))
tdm = vec.fit_transform([document])
display(pd.DataFrame(tdm.toarray(), columns=vec.get_feature_names_out()))

Unnamed: 0,.,. 2,. 2 3,. 2 3 4,. 2 3 4 1,. 3,. 3 4,. 3 4 .,. 3 4 . 2,...,....1,4 ...,4 1,4 1 2,4 1 2 3,4 1 2 3 4,Mr.,Mr. 4,Mr. 4 .,Mr. 4 . 3,Mr. 4 . 3 4
0,2,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
