#### Text Processing using NLTK (Natural Language Toolkit)

In [1]:
# importing packages

import nltk
import string
import re

#### Convert strings to lowercase

In [2]:
# converting string to lower case

input_str = "Today is best day to start something New.Actually everyday is best if You have passion to learn anyThing neW"

input_str.lower()

'today is best day to start something new.actually everyday is best if you have passion to learn anything new'

#### Remove Numbers

In [3]:
def remove_number(text):
    """Delete every run of digit characters from ``text``.

    Only the digits themselves are removed; any whitespace that surrounded a
    number stays where it was, so the result may contain consecutive spaces.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)
    
text = 'today i have scored 5 out of 10 and i do not want anyone to see this'
remove_number(text)

'today i have scored  out of  and i do not want anyone to see this'

#### Convert digits to words

In [4]:
# to convert digits to words we use the inflect library, e.g. 6 becomes six

import inflect
a = inflect.engine()

def digits_to_words(text):
    """Spell out the whitespace-separated tokens of ``text`` that are all digits.

    The text is split on whitespace; each token for which ``str.isdigit()`` is
    true is passed to the inflect engine ``a`` and replaced by its result, and
    every other token is kept unchanged. Tokens are re-joined with single
    spaces, so the original spacing is not preserved.
    """
    spelled_out = (
        a.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    )
    return ' '.join(spelled_out)

# NOTE: `text` is not assigned in this cell; this call relies on the value
# assigned in the "Remove Numbers" cell, so that cell must be run first.
digits_to_words(text)

'today i have scored five out of ten and i do not want anyone to see this'

#### Remove Punctuations

In [5]:
# remove punctuation from text

def remove_pun(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Whitespace is left untouched, so the gaps around removed marks remain.
    """
    # Mapping a character to None in a translation table deletes it.
    deletions = dict.fromkeys(string.punctuation)
    return text.translate(str.maketrans(deletions))

text = "Are you okay ? Or do you need sometime ?!!!!!"
remove_pun(text)

'Are you okay  Or do you need sometime '

#### Removing Stopwords
Stopwords are words that contribute little to the model, such as "or", "a", "and", and "the".

In [6]:
# importing libraries for stopwords

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords(text):
    """Tokenize ``text`` and drop English stopwords.

    Args:
        text: The input string to filter.

    Returns:
        A list of the remaining tokens, in their original order and casing.
    """
    stop_words = set(stopwords.words("english"))
    # NLTK's English stopword list is all lowercase, so compare on the
    # lowercased token; otherwise capitalised stopwords such as "The" or
    # "This" at the start of a sentence would be kept.
    return [word for word in word_tokenize(text) if word.lower() not in stop_words]

text = "A.I is the new boom in the I.T industries"
remove_stopwords(text)

[nltk_data] Downloading package stopwords to /Users/nick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/nick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['A.I', 'new', 'boom', 'I.T', 'industries']

#### Stemming
Stemming reduces words to their base/root form, e.g. mangoes -> mango, going -> go.

In [8]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
stem1 = PorterStemmer()

def stem_words(text):
    """Tokenize ``text`` and return the stem of every token, in order.

    Stemming is done with the module-level ``PorterStemmer`` instance ``stem1``.
    """
    return list(map(stem1.stem, word_tokenize(text)))

text = "I am going in was the famous gaming company that went to world best gaming."
print(stem_words(text))

['I', 'am', 'go', 'in', 'wa', 'the', 'famou', 'game', 'compani', 'that', 'went', 'to', 'world', 'best', 'game', '.']


#### Lemmatization
Lemmatization reduces each word to its dictionary base form (lemma) using vocabulary and part-of-speech information, e.g. is -> be, built -> build.

In [9]:
from nltk.stem import wordnet
from nltk.tokenize import word_tokenize
lemma = wordnet.WordNetLemmatizer()
nltk.download('wordnet')

def lemmatize_words(text):
    """Tokenize ``text`` and lemmatize every token as a verb.

    Each token is passed to the module-level WordNet lemmatizer ``lemma`` with
    ``pos='v'``; the results are returned as a list in token order.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemma.lemmatize(token, pos='v'))
    return lemmas


text = 'This is testing for the function built by own'
lemmatize_words(text)

[nltk_data] Downloading package wordnet to /Users/nick/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


['This', 'be', 'test', 'for', 'the', 'function', 'build', 'by', 'own']

#### POS (Part-of-Speech) Tagging

In [None]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

def pos_tag(text):
    """Tokenize ``text`` and tag each token with its part of speech.

    This wrapper has the same name as the ``pos_tag`` imported from ``nltk``,
    so once this ``def`` runs the bare name ``pos_tag`` refers to this
    function. NLTK's tagger is therefore reached through the ``nltk`` module;
    calling ``pos_tag(word_token)`` here would re-enter this wrapper with a
    token list instead of invoking the tagger.

    Args:
        text: The input string to tag.

    Returns:
        The result of ``nltk.pos_tag`` on the tokens of ``text``.
    """
    word_token = word_tokenize(text)
    return nltk.pos_tag(word_token)

text = "Are you afraid of unstructure dataset?"
pos_tag(text)

#### Chunking
Chunking is the process of extracting phrases (such as noun phrases) from POS-tagged text and grouping them into larger, more structured units.

In [16]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def chunking(text, grammar):
    """Chunk ``text`` with a regexp grammar and print every subtree.

    The text is tokenized and POS-tagged, then parsed by an
    ``nltk.RegexpParser`` built from ``grammar``. Each subtree yielded by the
    parse tree's ``subtrees()`` is printed. Nothing is returned.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    parsed = nltk.RegexpParser(grammar).parse(tagged_tokens)

    for node in parsed.subtrees():
        print(node)
    # Drawing is left disabled: parsed.draw()
        
text = "This is a person with good skills"
grammar = "NP:{<DT>?<JJ>*<NN>}"
chunking(text,grammar)

(S This/DT is/VBZ (NP a/DT person/NN) with/IN good/JJ skills/NNS)
(NP a/DT person/NN)


#### NER (Named Entity Recognition)
It identifies named entities in text, such as persons, organizations, and locations, and labels each one with its type based on context.

In [20]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag,ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')

def ner(text):
    """Print the named-entity chunk tree for ``text``.

    The text is tokenized and POS-tagged, and the tagged tokens are passed to
    ``ne_chunk``; the resulting tree is printed. Nothing is returned.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    entity_tree = ne_chunk(tagged_tokens)
    print(entity_tree)
    
text = "Brain Lara from WI scored the highest 400 runs against England"
ner(text)

(S
  (PERSON Brain/NNP)
  (ORGANIZATION Lara/NNP)
  from/IN
  (ORGANIZATION WI/NNP)
  scored/VBD
  the/DT
  highest/JJS
  400/CD
  runs/NNS
  against/IN
  (GPE England/NNP))


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/nick/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/nick/nltk_data...
[nltk_data]   Package words is already up-to-date!
