# Normalization

Normalization is the task of converting a document into a single canonical form.

## Import required packages

In [8]:
import re

from nltk.corpus import stopwords

### Load stopwords as provided by NLTK

In [9]:
# Materialize the English stopword list from the NLTK corpus reader as a plain Python list.
nltk_stopwords = [*stopwords.words('english')]

In [10]:
# Show the complete stopword list; these are the words filtered out in the
# "Remove stopwords" step further down.
print(nltk_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Define a document

We first create a list of sentences, and we make it particularly "ugly".

In [11]:
# Deliberately messy sample document: irregular spacing, elongated words,
# ellipses, a missing space after a comma, bracketed text, an email address,
# a URL, a hashtag and emoticons.
sentences = [
    "Text processing with Python    is great hahahaha.",
    "It     isn't (very) cooomplicated to get started.",
    "However,careful to...you know....avoid mistakes.",
    "Contact me at vonderweth@nus.edu.sg; see http://nus.edu.sg.",
    "This is so cooool #nltkrocks     :))) :-P <3.",
]

## Normalize raw sentences

#### Lowercase all letters

In [12]:
# Print a lowercased copy of every sentence; `sentences` itself is left unchanged.
for sentence in sentences:
    lowered = sentence.lower()
    print(lowered)

text processing with python    is great hahahaha.
it     isn't (very) cooomplicated to get started.
however,careful to...you know....avoid mistakes.
contact me at vonderweth@nus.edu.sg; see http://nus.edu.sg.
this is so cooool #nltkrocks     :))) :-p <3.


#### Remove duplicate whitespaces

In [13]:
# Replace every run of whitespace characters with one single space.
whitespace_run = re.compile(r'\s+')
for sentence in sentences:
    print(whitespace_run.sub(' ', sentence))

Text processing with Python is great hahahaha.
It isn't (very) cooomplicated to get started.
However,careful to...you know....avoid mistakes.
Contact me at vonderweth@nus.edu.sg; see http://nus.edu.sg.
This is so cooool #nltkrocks :))) :-P <3.


#### Remove ellipses (...)

In [14]:
# Two or more consecutive dots are treated as an ellipsis and replaced by a space.
ellipsis_pattern = re.compile(r'([.]){2,}')
for sentence in sentences:
    print(ellipsis_pattern.sub(' ', sentence))

Text processing with Python    is great hahahaha.
It     isn't (very) cooomplicated to get started.
However,careful to you know avoid mistakes.
Contact me at vonderweth@nus.edu.sg; see http://nus.edu.sg.
This is so cooool #nltkrocks     :))) :-P <3.


#### Wrapping it all together (with additional steps)

In [15]:
# Full normalization pipeline applied to each raw sentence.
# The replacement templates are raw strings (r'...'): "\g" is not a valid escape
# sequence in a normal string literal, so the non-raw form triggers a
# DeprecationWarning/SyntaxWarning on current Python versions.
for s in sentences:
    s = s.lower()                                                    # Lowercase whole sentence
    s = re.sub(r'\s+', ' ', s)                                       # Collapse runs of whitespace into one space
    s = re.sub(r'([.]){2,}', ' ', s)                                 # Remove ellipses ...
    s = re.sub(r'([\w.-]+)([,;])([\w.-]+)', r'\g<1>\g<2> \g<3>', s)  # Add missing whitespace after , and ;
    s = re.sub(r'(.+)\1{2,}', r'\g<1>\g<1>', s)                      # Reduce 3+ repetitions of a sequence to 2
    s = re.sub(r'[\(\[].*?[\)\]]', ' ', s)                           # Remove all words in brackets
    s = re.sub(r'(n\'t)', ' not', s)                                 # Resolve contraction "-n't" (naive: "can't" -> "ca not")
    # Optional, disabled on purpose: it would also strip '.', '@', ':', '/', '#', ...
    # and thereby destroy the email address, the URL, the hashtag and the emoticons.
    #s = re.sub(r'[^a-zA-Z0-9_-]+', ' ', s)                          # Remove all "odd" characters
    s = re.sub(r'\s+', ' ', s)                                       # Collapse whitespace again (earlier steps insert spaces)
    s = s.strip()                                                    # Remove leading and trailing whitespace
    print(s)

text processing with python is great haha.
it is not coomplicated to get started.
however, careful to you know avoid mistakes.
contact me at vonderweth@nus.edu.sg; see http://nus.edu.sg.
this is so cool #nltkrocks :)) :-p <3.


## Normalize token lists

We use the TweetTokenizer since we have many informal tokens such as email addresses, URLs, and emoticons

In [16]:
from nltk.tokenize import TweetTokenizer

In [17]:
# Single tokenizer instance (constructed without arguments) that is reused by
# every tokenizing cell below.
tweet_tokenizer = TweetTokenizer()

### Tokenize all sentences

In [18]:
# Tokenize each raw sentence; the result is one list of tokens per sentence.
token_lists = list(map(tweet_tokenizer.tokenize, sentences))

# Print one token list per line for easier reading.
for tokens in token_lists:
    print(tokens)

['Text', 'processing', 'with', 'Python', 'is', 'great', 'hahahaha', '.']
['It', "isn't", '(', 'very', ')', 'cooomplicated', 'to', 'get', 'started', '.']
['However', ',', 'careful', 'to', '...', 'you', 'know', '...', 'avoid', 'mistakes', '.']
['Contact', 'me', 'at', 'vonderweth@nus.edu.sg', ';', 'see', 'http://nus.edu.sg', '.']
['This', 'is', 'so', 'cooool', '#nltkrocks', ':)', ')', ')', ':-P', '<3', '.']


In [19]:
# Print the nested structure as a whole: a list containing one token list per sentence.
print(token_lists)

[['Text', 'processing', 'with', 'Python', 'is', 'great', 'hahahaha', '.'], ['It', "isn't", '(', 'very', ')', 'cooomplicated', 'to', 'get', 'started', '.'], ['However', ',', 'careful', 'to', '...', 'you', 'know', '...', 'avoid', 'mistakes', '.'], ['Contact', 'me', 'at', 'vonderweth@nus.edu.sg', ';', 'see', 'http://nus.edu.sg', '.'], ['This', 'is', 'so', 'cooool', '#nltkrocks', ':)', ')', ')', ':-P', '<3', '.']]


#### Lowercase all letters

In [20]:
# Lowercase token by token; only copies are printed, `token_lists` is not modified.
for tokens in token_lists:
    print(list(map(str.lower, tokens)))

['text', 'processing', 'with', 'python', 'is', 'great', 'hahahaha', '.']
['it', "isn't", '(', 'very', ')', 'cooomplicated', 'to', 'get', 'started', '.']
['however', ',', 'careful', 'to', '...', 'you', 'know', '...', 'avoid', 'mistakes', '.']
['contact', 'me', 'at', 'vonderweth@nus.edu.sg', ';', 'see', 'http://nus.edu.sg', '.']
['this', 'is', 'so', 'cooool', '#nltkrocks', ':)', ')', ')', ':-p', '<3', '.']


#### Remove duplicate whitespaces

Well, not needed anymore. The tokenizer already took care of it

#### Remove ellipses (...)

This translates to removing all tokens of the form ".." or with even more consecutive dots.

In [21]:
# Keep only tokens whose first two characters are not "..", i.e. drop ellipsis tokens.
for tokens in token_lists:
    kept = [tok for tok in tokens if tok[:2] != ".."]
    print(kept)

['Text', 'processing', 'with', 'Python', 'is', 'great', 'hahahaha', '.']
['It', "isn't", '(', 'very', ')', 'cooomplicated', 'to', 'get', 'started', '.']
['However', ',', 'careful', 'to', 'you', 'know', 'avoid', 'mistakes', '.']
['Contact', 'me', 'at', 'vonderweth@nus.edu.sg', ';', 'see', 'http://nus.edu.sg', '.']
['This', 'is', 'so', 'cooool', '#nltkrocks', ':)', ')', ')', ':-P', '<3', '.']


#### Wrapping it all together (with additional steps)

We are trying to achieve the same results as above.

In [83]:
# Apply the sentence-level normalization steps to every single token and replace
# each token list in `token_lists` with its normalized version.
# The replacement templates are raw strings (r'...'): "\g" is not a valid escape
# sequence in a normal string literal, so the non-raw form triggers a
# DeprecationWarning/SyntaxWarning on current Python versions.
for i, tl in enumerate(token_lists):
    normalized = []
    for token in tl:
        s = token.lower()                                                # Lowercase the token
        s = re.sub(r'([.]){2,}', ' ', s)                                 # Remove ellipses ...
        s = re.sub(r'([\w.-]+)([,;])([\w.-]+)', r'\g<1>\g<2> \g<3>', s)  # Add missing whitespace after , and ;
        s = re.sub(r'(.+)\1{2,}', r'\g<1>\g<1>', s)                      # Reduce 3+ repetitions of a sequence to 2
        s = re.sub(r'[\(\[].*?[\)\]]', ' ', s)                           # Remove all words in brackets
        s = re.sub(r'(n\'t)', ' not', s)                                 # Resolve contraction "-n't"
        s = re.sub(r'\s+', ' ', s)                                       # Collapse runs of whitespace into one space
        s = s.strip()                                                    # Remove leading and trailing whitespace
        if s != '':                                                      # Drop tokens that became empty (e.g. "...")
            normalized.append(s)

    token_lists[i] = normalized
    print(token_lists[i])

['text', 'processing', 'python', 'great', 'haha', '.']
['is not', 'coomplicated', 'get', 'started', '.']
['however', ',', 'careful', 'know', 'avoid', 'mistakes', '.']
['contact', 'vonderweth@nus.edu.sg', ';', 'see', 'http://nus.edu.sg', '.']
['cool', '#nltkrocks', ':)', ')', ')', ':-p', '<3', '.']


Note how the conversion from "isn't" to "is not" makes the result no longer perfectly tokenized

In [89]:
# Run the tokenizer over every token again and flatten the pieces, so that
# multi-word strings such as "is not" end up as separate tokens.
for idx, tokens in enumerate(token_lists):
    token_lists[idx] = [
        piece
        for token in tokens
        for piece in tweet_tokenizer.tokenize(token)
    ]
    print(token_lists[idx])

['text', 'processing', 'python', 'great', 'haha', '.']
['is', 'not', 'coomplicated', 'get', 'started', '.']
['however', ',', 'careful', 'know', 'avoid', 'mistakes', '.']
['contact', 'vonderweth@nus.edu.sg', ';', 'see', 'http://nus.edu.sg', '.']
['cool', '#nltkrocks', ':)', ')', ')', ':-p', '<3', '.']


Also, the removal of all words in brackets failed. The problem is that the tokenizer has split the brackets from the words. As a consequence, just by looking at a single word, we cannot decide whether it was enclosed in brackets or not. However, we can still remove bracketed words "manually" by checking, for each token, whether the preceding and succeeding tokens are an opening and a closing bracket, respectively.

In [90]:
def remove_bracketed_words(tokens):
    """Return a copy of `tokens` without single words enclosed in brackets.

    A token is dropped, together with its two neighbours, when the token
    directly before it is "(" or "[" and the token directly after it is
    ")" or "]". Tokens that are empty or consist only of whitespace are
    dropped as well. The input list is not modified.
    """
    opening, closing = {"(", "["}, {")", "]"}
    drop = set()
    # Start at 1 and stop before the last index so that both neighbours exist:
    # index -1 must not wrap around to the end of the list and no IndexError
    # has to be swallowed.
    for j in range(1, len(tokens) - 1):
        # Exact membership in a set; a substring test like `x in "[("` would
        # also be true for the empty string.
        if tokens[j - 1] in opening and tokens[j + 1] in closing:
            drop.update((j - 1, j, j + 1))
    return [
        token
        for j, token in enumerate(tokens)
        if j not in drop and token.strip() != ''
    ]


for i, tl in enumerate(token_lists):
    token_lists[i] = remove_bracketed_words(tl)
    print(token_lists[i])

['text', 'processing', 'python', 'great', 'haha', '.']
['is', 'not', 'coomplicated', 'get', 'started', '.']
['however', ',', 'careful', 'know', 'avoid', 'mistakes', '.']
['contact', 'vonderweth@nus.edu.sg', ';', 'see', 'http://nus.edu.sg', '.']
['cool', '#nltkrocks', ':)', ')', ')', ':-p', '<3', '.']


### Remove stopwords

In [91]:
# Replace each token list with a copy that contains neither stopwords nor
# empty / whitespace-only tokens.
for idx, tokens in enumerate(token_lists):
    token_lists[idx] = [
        tok
        for tok in tokens
        if tok not in nltk_stopwords and tok.strip() != ''
    ]

In [92]:
# Final result: the per-sentence token lists after the stopword filter above.
print(token_lists)

[['text', 'processing', 'python', 'great', 'haha', '.'], ['coomplicated', 'get', 'started', '.'], ['however', ',', 'careful', 'know', 'avoid', 'mistakes', '.'], ['contact', 'vonderweth@nus.edu.sg', ';', 'see', 'http://nus.edu.sg', '.'], ['cool', '#nltkrocks', ':)', ')', ')', ':-p', '<3', '.']]
