# Pre-processing in NLP

In [1]:
import numpy as np
import pandas as pd
import nltk

## Tokenize words and sentenses

In [20]:
from nltk.tokenize import sent_tokenize, word_tokenize

data = "All work and no play makes jack a dull boy, all work and no play"
print(word_tokenize(data))

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


In [21]:
data = "All work and no play, makes jack dull boy. All work and? no play makes jack a dull boy."
print(sent_tokenize(data))

['All work and no play, makes jack dull boy.', 'All work and?', 'no play makes jack a dull boy.']


## Convert text to lower cases

In [22]:
input_txt = "How Are you?"

In [23]:
from nltk.tokenize import word_tokenize
output_txt = [w.lower() for w in word_tokenize(input_txt)]

In [24]:
print(output_txt)

['how', 'are', 'you', '?']


## Remove Punctuation

In [15]:
input_txt = "How Are you?"

### Method 1

In [16]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
output_txt = tokenizer.tokenize(input_txt)

In [17]:
print(output_txt)

['How', 'Are', 'you']


### Method 2

In [25]:
from string import punctuation
output_txt = [word for word in word_tokenize(input_txt) if word not in punctuation]

In [26]:
print(output_txt)

['How', 'Are', 'you']


## Remove numbers

In [27]:
input_txt = "How1 Are 2 you love 4?"

In [44]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'[^0-9\s]+')
output_txt = tokenizer.tokenize(input_txt)

In [45]:
print(output_txt)

['How', 'Are', 'you', 'love', '?']


## Keep Letters Only

In [46]:
input_txt = "How1 Are 2 you love 4?"

In [47]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
output_txt = tokenizer.tokenize(input_txt)

In [48]:
print(output_txt)

['How', 'Are', 'you', 'love']


## Remove Tages

In [50]:
import re
input_txt = """<head><body>hello world!</body></head>"""
output_txt = re.sub('<[^<]+?>','', input_txt)
print (output_txt)

hello world!


## Stemming and Lemmatization are Text Normalization 
(or sometimes called Word Normalization)

In [64]:
#nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Stemming
It does not keep a lookup table for actual stems of the word but applies algorithmic rules to generate stems. It uses the rules to decide whether it is wise to strip a suffix.

In [65]:
import nltk
from nltk.stem import SnowballStemmer
#is based on The Porter Stemming Algorithm
snowball_stemmer = SnowballStemmer('english')
input_txt = "This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit"
output_txt = [snowball_stemmer.stem(word) for word in nltk.word_tokenize(input_txt)]
print (output_txt)

['this', 'is', 'a', 'demo', 'text', 'for', 'nlp', 'use', 'nltk', '.', 'full', 'form', 'of', 'nltk', 'is', 'natur', 'languag', 'toolkit']


### Lemmatize
Lemmatization is similar to stemming but it brings context to the words.So it links words with similar meaning to one word.lemmatization does morphological analysis of the words. In short, lemmatize the text so as to get its root form eg: functions,funtionality as function

In [66]:
from nltk.stem import WordNetLemmatizer
#is based on The Porter Stemming Algorithm
wordnet_lemmatizer = WordNetLemmatizer()
input_txt = "one glass, two glasses"
output_txt = [wordnet_lemmatizer.lemmatize(word) for word in nltk.word_tokenize(input_txt)]
print (output_txt)

['one', 'glass', ',', 'two', 'glass']


## Stop words removal
Remove irrelevant words using nltk stop words like “is,the,a” etc from the sentences as they don’t carry any information.

In [69]:
import nltk
from nltk.corpus import stopwords
stopword = stopwords.words('english')
input_txt = "This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit"
output_txt = [word for word in nltk.word_tokenize(input_txt) if word not in stopword]
print (output_txt)

['This', 'Demo', 'Text', 'NLP', 'using', 'NLTK', '.', 'Full', 'form', 'NLTK', 'Natural', 'Language', 'Toolkit']


## Contraction
Contraction helps to expand the word form like “ain’t”: “am not”. Contractions file has been created in my github which we will be importing to use it.

In [84]:
# pip install contractions

In [75]:
# coding: utf-8
import re
import nltk
from contractions import contractions_dict

def expand_contractions(text, contractions_dict):
    contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_dict.get(match) \
            if contractions_dict.get(match) \
            else contractions_dict.get(match.lower())
        expanded_contraction = expanded_contraction
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def main():
    text = """I ain't going there. You'll have to go alone."""
    
    text=expand_contractions(text,contractions_dict)
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    
    print (tokenized_sentences)

if __name__ == '__main__':
    main()

[['I', 'are', 'not', 'going', 'there', '.'], ['you', 'will', 'have', 'to', 'go', 'alone', '.']]


## Spell Check
correct the incorrect spelled words like “wrld” to “world”

In [83]:
#pip install autocorrect

In [82]:
from autocorrect import Speller
spell = Speller(lang='en')
input_txt = "This is a wrld of hope"
output_txt = [spell(w) for w in (nltk.word_tokenize(input_txt))]
print (output_txt)

['This', 'is', 'a', 'world', 'of', 'hope']
