# Integrating normalization with tokenization

## Structure

In [1]:
def tokenize(input):
    """Placeholder tokenizer: does nothing yet and implicitly returns ``None``."""

def normalize(input):
    """Placeholder normalizer: does nothing yet and implicitly returns ``None``."""

# Pipeline skeleton: tokenize the sample text, then normalize each token.
sample = "Hello, Mom!"
tokens = tokenize(sample)
# print(tokens)  # uncomment to inspect the raw tokens
normalized = list(map(normalize, tokens))
print(normalized)

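TypeError: 'NoneType' object is not iterable

(This error is expected: the placeholder `tokenize` returns `None`, so there are no tokens to iterate over until the functions are filled in, as in the examples below.)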

## Examples

### Tokenize on white space and normalize as lower case

In [1]:
def tokenize(input):
    """Split *input* into tokens on runs of white space."""
    pieces = input.split()
    return pieces

def normalize(input):
    """Return a ``(token, lower-cased token)`` pair for *input*."""
    lowered = input.lower()
    return input, lowered

# Run the pipeline: split on white space, then pair each token with its
# lower-cased form.
sample = "Hello, Mom!"
tokens = tokenize(sample)
# print(tokens)  # uncomment to inspect the raw tokens
normalized = list(map(normalize, tokens))
print(normalized)

[('Hello,', 'hello,'), ('Mom!', 'mom!')]


### Use NLTK word tokenization and normalize as POS

In [7]:
import nltk

def tokenize(input):
    """Split *input* into tokens with NLTK's word tokenizer."""
    word_tokens = nltk.word_tokenize(input)
    return word_tokens

def normalize(input):
    """Tag a single token with its part of speech.

    Returns the result of ``nltk.pos_tag`` for the one-element list
    ``[input]`` unchanged, e.g. ``[('Hello', 'NN')]``.
    """
    # pos_tag works on a sequence of tokens, so wrap the lone token in a list.
    tagged = nltk.pos_tag([input])
    return tagged

# Run the pipeline: NLTK word tokenization, then POS-tag each token on its own.
sample = "Hello, Mom!"
tokens = tokenize(sample)
# print(tokens)  # uncomment to inspect the raw tokens
normalized = list(map(normalize, tokens))
print(normalized)

[[('Hello', 'NN')], [(',', ',')], [('Mom', 'NN')], [('!', '.')]]


### Use NLTK word tokenization and normalize as POS tag plus lower-cased, vowel-stripped form

In [21]:
import re

def tokenize(input):
    """Break *input* into word and punctuation tokens via ``nltk.word_tokenize``."""
    result = nltk.word_tokenize(input)
    return result

def normalize(input):
    """Return ``(token, POS tag, lower-cased token with vowels removed)``.

    Only the letters a, e, i, o, u are removed from the lower-cased token;
    every other character, punctuation included, passes through unchanged.
    """
    # pos_tag works on a sequence of tokens, so wrap the lone token in a list.
    tagged = nltk.pos_tag([input])
    tag = tagged[0][1]  # first (and only) pair -> its tag
    devoweled = re.sub('[aeiou]', '', input.lower())
    return input, tag, devoweled

# Run the pipeline: NLTK word tokenization, then build the
# (token, POS tag, vowel-stripped form) triple for each token.
sample = "Hello, Mom!"
tokens = tokenize(sample)
# print(tokens)  # uncomment to inspect the raw tokens
normalized = list(map(normalize, tokens))
print(normalized)

[('Hello', 'NN', 'hll'), (',', ',', ','), ('Mom', 'NN', 'mm'), ('!', '.', '!')]
