# NLP Pipeline
1. Sentence tokenization
1. Word tokenization
1. Part-of-speech (POS) tagging

In [1]:
import functools
import re

import nltk
corpus = [
    """
    This strategy has several advantages.
    it is very low memory scalable to large datasets as there is no need to store a vocabulary dictionary in memory
    it is fast to pickle and un-pickle as it holds no state besides the constructor parameters
    it can be used in a streaming (partial fit) or parallel pipeline as there is no state computed during fit.
    """,
    """
    It turns a collection of text documents into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), 
    possibly normalized as token frequencies if norm=’l1’ or projected on the euclidean unit sphere if norm=’l2’.
    """]
# Flatten the first document onto one line: every newline together with the
# indentation that follows it becomes a single space, then the space left at
# both ends is trimmed.
example = re.compile(r'\n *').sub(' ', corpus[0]).strip()
example

'This strategy has several advantages. it is very low memory scalable to large datasets as there is no need to store a vocabulary dictionary in memory it is fast to pickle and un-pickle as it holds no state besides the constructor parameters it can be used in a streaming (partial fit) or parallel pipeline as there is no state computed during fit.'

## Old school approach
This approach stores every intermediate result (all sentences, then all token lists) in memory before moving to the next step, so it is not suitable for large datasets or streaming pipelines.

In [2]:
# Each step builds its complete result list before the next step starts.
sentences = nltk.sent_tokenize(example)
tokens = list(map(nltk.word_tokenize, sentences))  # one token list per sentence
list(map(nltk.tag.pos_tag, tokens))

[[('This', 'DT'),
  ('strategy', 'NN'),
  ('has', 'VBZ'),
  ('several', 'JJ'),
  ('advantages', 'NNS'),
  ('.', '.')],
 [('it', 'PRP'),
  ('is', 'VBZ'),
  ('very', 'RB'),
  ('low', 'JJ'),
  ('memory', 'NN'),
  ('scalable', 'NN'),
  ('to', 'TO'),
  ('large', 'JJ'),
  ('datasets', 'NNS'),
  ('as', 'IN'),
  ('there', 'EX'),
  ('is', 'VBZ'),
  ('no', 'DT'),
  ('need', 'NN'),
  ('to', 'TO'),
  ('store', 'VB'),
  ('a', 'DT'),
  ('vocabulary', 'JJ'),
  ('dictionary', 'NN'),
  ('in', 'IN'),
  ('memory', 'NN'),
  ('it', 'PRP'),
  ('is', 'VBZ'),
  ('fast', 'JJ'),
  ('to', 'TO'),
  ('pickle', 'VB'),
  ('and', 'CC'),
  ('un-pickle', 'JJ'),
  ('as', 'IN'),
  ('it', 'PRP'),
  ('holds', 'VBZ'),
  ('no', 'DT'),
  ('state', 'NN'),
  ('besides', 'IN'),
  ('the', 'DT'),
  ('constructor', 'NN'),
  ('parameters', 'NNS'),
  ('it', 'PRP'),
  ('can', 'MD'),
  ('be', 'VB'),
  ('used', 'VBN'),
  ('in', 'IN'),
  ('a', 'DT'),
  ('streaming', 'NN'),
  ('(', '('),
  ('partial', 'JJ'),
  ('fit', 'NN'),
  (')', ')'),

## Decorators approach

In [3]:
def pipeline(func):
    '''Decorator that turns a generator function into a primed coroutine factory.

    Calling the decorated function creates the generator from ``func`` and
    advances it once with ``next()``, so the returned generator is already
    suspended at its first ``yield`` and can accept ``send()`` right away.
    The wrapper keeps the name and docstring of ``func``.
    '''
    @functools.wraps(func)  # preserve func.__name__ / __doc__ on the wrapper
    def start_pipeline(*args, **kwargs):
        iterator = func(*args, **kwargs)
        # Prime the generator: a just-created generator rejects send() of a
        # non-None value until it has reached its first yield.
        next(iterator)
        return iterator
    return start_pipeline

def ingest(corpus, target):
    '''Feed the pipeline: send() each item of corpus, in order, into target.'''
    for document in corpus:
        target.send(document)

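Defining the pipeline blocks: each block is a coroutine that receives one item at a time through `(yield)` and forwards its result to every downstream target with `send()`.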

In [4]:
@pipeline
def tokenize_sentence(targets):
    '''Pipeline stage: split each received text into sentences.

    Every sentence produced by nltk.sent_tokenize() is sent, one at a time,
    to each coroutine in targets.
    '''
    while True:
        document = yield  # suspend here until an upstream step send()s a text
        for sent in nltk.sent_tokenize(document):
            for downstream in targets:
                downstream.send(sent)  # push the sentence to the next stage

@pipeline
def tokenize_words(targets):
    '''Pipeline stage: split each received sentence into word tokens.

    The token list returned by nltk.word_tokenize() is sent as a single item
    to each coroutine in targets.
    '''
    while True:
        received = yield  # one sentence from upstream
        word_tokens = nltk.word_tokenize(received)
        for downstream in targets:
            downstream.send(word_tokens)

@pipeline
def pos_tagging(targets):
    '''Pipeline stage: part-of-speech tag each received list of words.

    The result of nltk.pos_tag() is sent as a single item to each coroutine
    in targets.
    '''
    while True:
        word_tokens = yield  # one token list from upstream
        tagged = nltk.pos_tag(word_tokens)
        for downstream in targets:
            downstream.send(tagged)

@pipeline
def printline(line):
    '''Pipeline sink: print a blank line and the label line, then each received item.'''
    while True:
        item = yield  # local renamed so it no longer shadows the builtin input()
        print('\n' + line)
        print(item)

@pipeline
def collect(results=None):
    '''Terminal pipeline stage that consumes every item sent to it.

    results: optional list. When given, each received item (an iterable,
    e.g. a list of tagged words) is added to it element by element, so the
    caller can inspect the accumulated output after ingestion. When omitted,
    received items are discarded, which is the previous behaviour.
    '''
    while True:
        item = (yield)
        if results is not None:
            results.extend(item)

In [5]:
# Wire the stages from the last one back to the first, so each stage is handed
# the already-built downstream targets it forwards to.
tagging_stage = pos_tagging([printline('Tokens-Tags:'), collect()])
word_stage = tokenize_words([printline('Word tokens:'), tagging_stage])
sentence_stage = tokenize_sentence([printline('Sentence:'), word_stage])

# Push every document of the corpus through the pipeline.
ingest(corpus, sentence_stage)


Sentence:

    This strategy has several advantages.

Word tokens:
['This', 'strategy', 'has', 'several', 'advantages', '.']

Tokens-Tags:
[('This', 'DT'), ('strategy', 'NN'), ('has', 'VBZ'), ('several', 'JJ'), ('advantages', 'NNS'), ('.', '.')]

Sentence:
it is very low memory scalable to large datasets as there is no need to store a vocabulary dictionary in memory
    it is fast to pickle and un-pickle as it holds no state besides the constructor parameters
    it can be used in a streaming (partial fit) or parallel pipeline as there is no state computed during fit.

Word tokens:
['it', 'is', 'very', 'low', 'memory', 'scalable', 'to', 'large', 'datasets', 'as', 'there', 'is', 'no', 'need', 'to', 'store', 'a', 'vocabulary', 'dictionary', 'in', 'memory', 'it', 'is', 'fast', 'to', 'pickle', 'and', 'un-pickle', 'as', 'it', 'holds', 'no', 'state', 'besides', 'the', 'constructor', 'parameters', 'it', 'can', 'be', 'used', 'in', 'a', 'streaming', '(', 'partial', 'fit', ')', 'or', 'parallel'