# Eliminating Punctuation

In [10]:
import re, string
from nltk.tokenize import word_tokenize, sent_tokenize

In [5]:
# Sample paragraph (three sentences, assorted punctuation) used by the
# punctuation-removal cells below.
text = "Hey! This is just a random paragraph. Normalization involves eliminating punctuation, converting the entire text into lowercase or uppercase, converting numbers into words, expanding abbreviations, canonicalization of text, and so on."

In [19]:
# Split the paragraph into sentences, then tokenize each sentence on its own,
# so `words` is a list of token lists (one inner list per sentence).
sentences = sent_tokenize(text)
words = [word_tokenize(sentence) for sentence in sentences]

In [44]:
# Will hold the punctuation-free tokens, one inner list per sentence.
normalized = []

In [45]:
# Character class matching any single ASCII punctuation character.
punctuation_re = re.compile("[%s]" % re.escape(string.punctuation))

# Strip punctuation from every token and drop tokens that end up empty
# (e.g. "!" or "."). The list is rebuilt from scratch rather than appended
# to, so re-running this cell cannot duplicate earlier results.
normalized = [
    [
        stripped
        for stripped in (punctuation_re.sub('', token) for token in sentence_tokens)
        if stripped
    ]
    for sentence_tokens in words
]

In [46]:
# Show the cleaned tokens, grouped by sentence.
print(normalized)

[['Hey'], ['This', 'is', 'just', 'a', 'random', 'paragraph'], ['Normalization', 'involves', 'eliminating', 'punctuation', 'converting', 'the', 'entire', 'text', 'into', 'lowercase', 'or', 'uppercase', 'converting', 'numbers', 'into', 'words', 'expanding', 'abbreviations', 'canonicalization', 'of', 'text', 'and', 'so', 'on']]


# Converting into lowercase or uppercase

In [47]:
# str.lower() and str.upper() return case-converted copies; the original
# strings are left untouched.
upper = "UPPERCASE"
lower = "lowercase"
print(upper.lower())
print(lower.upper())

uppercase
LOWERCASE


# Removing stop words

In [69]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
# A set gives O(1) membership checks while filtering.
stops = set(stopwords.words('english'))
text = "This text may contain many stop words which need to be filtered out for the process of normalization. This is also a sentence."
sentences = sent_tokenize(text)
words = [word_tokenize(sentence) for sentence in sentences]

# The stop-word list is lowercase, so compare case-insensitively; otherwise a
# capitalised stop word such as "This" slips through the filter. Tokens keep
# their original casing in the result, and punctuation tokens are left alone
# because only stop words are being removed here.
normalized = [
    [token for token in sentence_tokens if token.lower() not in stops]
    for sentence_tokens in words
]
print(normalized)

[['text', 'may', 'contain', 'many', 'stop', 'words', 'need', 'filtered', 'process', 'normalization', '.'], ['also', 'sentence', '.']]


# Word replacement

In [70]:
import re
# (regex, replacement) pairs for expanding English contractions.
#
# Order matters when the pairs are applied one after another: the irregular
# forms (won't, can't, i'm, ain't) come first so the generic suffix rules
# below never see them (otherwise "won't" would become "wo not").
#
# Replacement templates are raw strings: "\g" is not a valid escape sequence
# in an ordinary string literal and newer Python versions warn about it.
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]

In [78]:
class RegexpReplacer(object):
    """Rewrite text by applying a list of regex substitutions in order.

    ``patterns`` is an iterable of ``(regex, replacement)`` pairs. Each regex
    is compiled once when the replacer is constructed.
    """

    def __init__(self, patterns=replacement_patterns):
        compiled_pairs = []
        for regex, repl in patterns:
            compiled_pairs.append((re.compile(regex), repl))
        self.patterns = compiled_pairs

    def replace(self, text):
        """Return ``text`` after running every substitution over it in sequence.

        Each substitution sees the output of the previous one.
        """
        result = text
        for compiled, substitute in self.patterns:
            result = re.sub(compiled, substitute, result)
        return result

In [79]:
# Build a replacer with the default contraction patterns; the last expression
# is the cell's displayed result ("isn't" is expanded to "is not").
regexpreplacer = RegexpReplacer()
regexpreplacer.replace("Hey! This isn't normalized text.")

'Hey! This is not normalized text.'

# Dealing with repeating characters

In [91]:
from nltk.corpus import wordnet
class RepeatReplacer(object):
    """Collapse repeated characters in a word until WordNet recognises it.

    One duplicated character is removed per step (e.g. "hiiii" -> "hiii" ->
    "hii" -> "hi"), and the process stops as soon as the current form has at
    least one WordNet synset or contains no more doubled characters.
    """

    def __init__(self):
        # Group 2 followed by a back-reference to itself matches a doubled
        # word character; the replacement keeps a single copy of it.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return ``word`` with surplus repeated characters removed.

        Words WordNet already knows (such as "hello") are returned unchanged,
        so legitimate double letters are preserved.
        """
        # A loop rather than recursion: one character is removed per step, so
        # a long run of repeats would otherwise exceed the recursion limit.
        while not wordnet.synsets(word):
            shortened = self.repeat_regexp.sub(self.repl, word)
            if shortened == word:
                # Nothing left to collapse.
                break
            word = shortened
        return word

In [95]:
# "hello" is printed unchanged, while "hiiii" is collapsed down to "hi".
repeatreplacer = RepeatReplacer()
print(repeatreplacer.replace("hello"), repeatreplacer.replace("hiiii"), sep="\n")

hello
hi


# Replacing a word with its synonym

In [1]:
class WordReplacer:
    """Swap a word for its mapped replacement, if one is defined.

    ``word_map`` is a mapping from a word to the word that should replace it.
    """

    def __init__(self, word_map):
        self.word_map = word_map

    def replace(self, word):
        """Return the mapped replacement for ``word``, or ``word`` itself if unmapped."""
        lookup = self.word_map.get
        replacement = lookup(word, word)
        return replacement

In [2]:
# Map the informal "congrats" to its full form; the last expression is the
# cell's displayed result.
wordreplacer = WordReplacer({'congrats':'congratulations'})
wordreplacer.replace("congrats")

'congratulations'