<a href="https://colab.research.google.com/github/naveenkarri383/NLP/blob/main/Text_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# NLP --> Common Problem statements
# Sentiment analysis, Language translation, Q and Answers , Text summerization, Named Entity Recognization, Sentence completion

In [5]:
 # Importing required libraries
 import nltk
 import string
 import re  # regular expressions tokenizer

In [7]:
# Text lower case  # It is required for the text to reduce the size of the vocabulary of our text data
def lowercase_text(text):
  return text.lower()

input_str="Weather is too cloudy. So, there is a possibility of rain !!!"
lowercase_text(input_str)

'weather is too cloudy. so, there is a possibility of rain !!!'

In [8]:
# Removeing number # We should either remove or convert those into textual representations. use regular expressions to remove the number
def remove_num(text):
  result=re.sub(r'\d+','',text)  # d+ is a pattern for any kind of number
  return result

input_str=" I bough 6 apples and 1kg of oranges in the market!!!"
remove_num(input_str)

' I bough  apples and kg of oranges in the market!!!'

In [18]:
# Convert numbers into words using ## Inflect library

import inflect
q=inflect.engine()

# Converting number to text word
def convert_num(text):
    # split strings into list of texts
    temp_str=text.split()
    # Initialize empty list and add to that
    new_str=[]

    for word in temp_str:
        if word.isdigit():   # if text is a digit then convert the digit into word and append into the new_str
            temp=q.number_to_words(word)   # inflect library to convert the digit to text
            new_str.append(temp)

      # append the text as it is
        else:
           new_str.append(word)
   # join the texts of new_str to form a string
    temp_str=''.join(new_str)
    return temp_str
input_str=" I bough 6 apples and 1kg of oranges in the market!!!"
convert_num(input_str)

'Iboughsixapplesand1kgoforangesinthemarket!!!'

In [23]:
# Removing punctuations
def remove_punct(text):
    translator=str.maketrans('','',string.punctuation)
    return text.translate(translator)

input_str="Hey there! I am using whatsapp!!!"
remove_punct(input_str)

'Hey there I am using whatsapp'

In [24]:
# Removig Stopwords
# Importing NLTK librarys
from nltk.corpus import stopwords   # Corpous is a data set
from nltk.tokenize import word_tokenize

nltk.download("stopwords")  # These are very specific to type of language
nltk.download("punkt")

# function to remove stopwords
def remove_stopwords(text):
    stop_words=set(stopwords.words("english"))  # stowords in english
    word_tokens=word_tokenize(text)  # break sentence into words/tokens
    filtered_text=[word for word in word_tokens if word not in stop_words]
    return filtered_text

input_text="AI can revolutionize the world. As AI develops the models becomes much faster"
remove_stopwords(input_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['AI',
 'revolutionize',
 'world',
 '.',
 'As',
 'AI',
 'develops',
 'models',
 'becomes',
 'much',
 'faster']

In [29]:
# Stemming --> Convert the words into base words  ex: Mango--> Mangoes , Going--> Go , Boys--> Boy
#Stemming always converts the words into their root words
from nltk.stem.porter import PorterStemmer   # other Stemming techniques are : Snowball , Lancaster
from nltk.tokenize import word_tokenize
stem1=PorterStemmer()

def stem_words(text):
    words_tokens=word_tokenize(text)
    stems=[stem1.stem(word) for word in words_tokens]
    return stems
text="AI can revolutionize the world. As AI develops the models becomes much faster"
stem_words(text)

['ai',
 'can',
 'revolution',
 'the',
 'world',
 '.',
 'as',
 'ai',
 'develop',
 'the',
 'model',
 'becom',
 'much',
 'faster']

In [36]:
# Lemmatization -->  Convert word into nearest possible words which can have same meaning. It can also chop off the words
from nltk.stem import wordnet
from nltk.tokenize import word_tokenize
lemma=wordnet.WordNetLemmatizer()
nltk.download("wordnet")

# Lemmatization of a string
def lemmatize_word(text):
    word_tokens=word_tokenize(text)

    lemmas=[lemma.lemmatize(word,pos='v')for word in word_tokens]   # Parts of speech tagging to understand the text context
    return lemmas

text="AI can revolutionize the world. As AI develops the models becomes much faster"
lemmatize_word(text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['AI',
 'can',
 'revolutionize',
 'the',
 'world',
 '.',
 'As',
 'AI',
 'develop',
 'the',
 'model',
 'become',
 'much',
 'faster']

In [43]:
# Parts of speech tagging
from nltk import pos_tag
from nltk.tokenize import word_tokenize
nltk.download("averaged_perceptron_tagger")

# convert text into words
def pos_tagger(text):
    word_tokens=word_tokenize(text)
    return pos_tag(word_tokens)

text="AI can revolutionize the world. As AI develops the models becomes much faster"
pos_tagger(text)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('AI', 'NNP'),
 ('can', 'MD'),
 ('revolutionize', 'VB'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('.', '.'),
 ('As', 'IN'),
 ('AI', 'NNP'),
 ('develops', 'VBZ'),
 ('the', 'DT'),
 ('models', 'NNS'),
 ('becomes', 'RB'),
 ('much', 'JJ'),
 ('faster', 'RBR')]

In [53]:
# Chunking --> Extracting phrases from the unstructured text and give them more structure to it.
#We also call them shallow parsing. We can also do pos tagging on top of it.
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def chunking(text,grammer):
    word_tokens=word_tokenize(text)

    #label the word with pos taggging
    word_pos=pos_tag(word_tokens)

    # create a chunk parser using grammer
    chunkParser= nltk.RegexpParser(grammer)

    # test it on the list of word tokens with tagged pos
    tree=chunkParser.parse(word_pos)

    for subtree in tree.subtrees():
        print(subtree)

    # tree draw()

sentence="The little red parrot is flying in the sky"
grammer=  "NP: {<DT>?<JJ>*<NN>}"     # regular expression
chunking(sentence,grammer)



(S
  (NP The/DT little/JJ red/JJ parrot/NN)
  is/VBZ
  flying/VBG
  in/IN
  (NP the/DT sky/NN))
(NP The/DT little/JJ red/JJ parrot/NN)
(NP the/DT sky/NN)


In [55]:
# Named Entity Recognization--> It is used to extract information from unstructured text. It is used to classify the entities which is present in the text into
# categories like persons, events, organizations etc..
# It also gives the idea about the relationship between the entities

from nltk.tokenize import word_tokenize
from nltk import pos_tag,ne_chunk
nltk.download("maxent_ne_chunker")
nltk.download("words")

def ner(text):
    word_tokens=word_tokenize(text)

    word_pos=pos_tag(word_tokens)

    print(ne_chunk(word_pos))

text=  "Porter stemmer. The Porter stemming algorithm is a process for removing suffixes from words in English. Removing suffixes. automatically is an operation which is especially useful in the field of information retrieval."
ner(text)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


(S
  (GPE Porter/NNP)
  stemmer/NN
  ./.
  The/DT
  Porter/NNP
  stemming/VBG
  algorithm/NN
  is/VBZ
  a/DT
  process/NN
  for/IN
  removing/VBG
  suffixes/NNS
  from/IN
  words/NNS
  in/IN
  (GPE English/NNP)
  ./.
  Removing/VBG
  suffixes/NNS
  ./.
  automatically/RB
  is/VBZ
  an/DT
  operation/NN
  which/WDT
  is/VBZ
  especially/RB
  useful/JJ
  in/IN
  the/DT
  field/NN
  of/IN
  information/NN
  retrieval/NN
  ./.)
