# Text Analytics
1. Extract a sample document and apply the following document preprocessing
methods: tokenization, POS tagging, stop-word removal, stemming, and lemmatization.
2. Create a representation of the document by calculating Term Frequency and Inverse Document Frequency.

In [11]:
import nltk

# Sample document: three short sentences that the preprocessing cells operate on.
text = (
    "Dr. Piyush is learning NLP. "
    "It is very interesting and exciting. "
    "It is an important area of AI."
)

[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Tokenization
Tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements. The tokens become the input for another process like parsing and text mining. Tokenization is useful because it breaks the text into smaller, more manageable parts.

In [13]:
# Fetch the Punkt tokenizer data before tokenizing.
nltk.download('punkt')

# Sentence tokenization: split the document into its sentences.
sentences = nltk.sent_tokenize(text)
print(sentences)

# Word tokenization: split each sentence into tokens and print one token per line.
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(*words, sep="\n")


['Dr. Piyush is learning NLP.', 'It is very interesting and exciting.', 'It is an important area of AI.']
Dr.
Piyush
is
learning
NLP
.
It
is
very
interesting
and
exciting
.
It
is
an
important
area
of
AI
.


# POS Tagging
Part-of-speech tagging is the process of marking up a word in a text as corresponding to a particular part of speech, based on both its definition and its context. Parts of speech are also known as word classes or lexical categories. The process of classifying words into their parts of speech and labeling them accordingly is known as part-of-speech tagging, POS tagging, or simply tagging.

In [22]:
# Fetch the averaged-perceptron tagger data before calling nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')

# Tokenize the whole document, then tag every token; the result is a list of
# (token, tag) pairs.
words = nltk.word_tokenize(text)
tagged_words = nltk.pos_tag(words)

print(words)
print(tagged_words)

['Dr.', 'Piyush', 'is', 'learning', 'NLP', '.', 'It', 'is', 'very', 'interesting', 'and', 'exciting', '.', 'It', 'is', 'an', 'important', 'area', 'of', 'AI', '.']
[('Dr.', 'NNP'), ('Piyush', 'NNP'), ('is', 'VBZ'), ('learning', 'VBG'), ('NLP', 'NNP'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('very', 'RB'), ('interesting', 'JJ'), ('and', 'CC'), ('exciting', 'VBG'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('an', 'DT'), ('important', 'JJ'), ('area', 'NN'), ('of', 'IN'), ('AI', 'NNP'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Stop Words Removal
Stop words are the most common words in a language, such as “the”, “is”, “in”, “for”, “where”, “when”, “to”, and “at”. They are removed to reduce the dimensionality of the data and filter out noise, which can improve the performance of the model.

In [26]:
# Fetch the stop-word corpus, then load the English list into a set for
# membership tests.
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
print(stop_words)

words = nltk.word_tokenize(text)
print(words)

# Compare tokens in lowercase, but keep each surviving token in its original
# casing. Punctuation tokens are kept unless they appear in the stop-word set.
filtered_words = [token for token in words if token.lower() not in stop_words]
print(filtered_words)

{'most', "weren't", 'should', 'them', 'when', 'no', "you're", 'haven', 'do', 'did', 'my', 'too', 'in', 'hasn', "didn't", 'up', 'for', 'hadn', 'won', 'yourself', 'they', 'were', 'a', 'your', 'll', 'yourselves', 'had', "it's", 'having', 'few', 's', 're', "that'll", 'both', 'have', 'just', "shouldn't", 'until', 'm', 'further', 'himself', 'where', 'o', 'aren', 'so', 't', 'because', 'theirs', 'from', 'weren', "mustn't", 'wouldn', 'these', 'are', "you've", 'during', 'him', 'mustn', 'very', 'will', 'whom', 'before', 'once', "hasn't", 'through', 'all', "doesn't", "haven't", 'he', 'at', 'wasn', 'of', 'doing', 'down', 'other', 'out', 'ain', "isn't", 'needn', 'on', 'being', 'to', 'has', 'i', 'doesn', "you'll", 'some', 'an', 'above', 'which', 'you', 'be', 'itself', 'about', "won't", 'who', 'with', "couldn't", 'not', 'hers', 'yours', 'is', 'more', 'than', 'herself', 'y', 'our', 'didn', "wasn't", 'as', "shan't", 'over', 'and', 'd', 'themselves', 'below', "needn't", 'their', 'why', 'only', 'there', '

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Stemming

Example: the words "working", "worked", and "works" all share the stem "work".

Stemming strips suffixes from a word using heuristic rules to reduce it to a root form. The result is not always a dictionary word: in the output below, "very" becomes "veri" and "exciting" becomes "excit".

In [27]:
# Reduce every token of the document to its Porter stem. As the captured output
# shows, the stemmer also lowercases tokens and passes punctuation through.
from nltk.stem import PorterStemmer

ps = PorterStemmer()

words = nltk.word_tokenize(text)
print(words)

# Apply the stemmer to each token, preserving token order.
stemmed_words = list(map(ps.stem, words))
print(stemmed_words)

['Dr.', 'Piyush', 'is', 'learning', 'NLP', '.', 'It', 'is', 'very', 'interesting', 'and', 'exciting', '.', 'It', 'is', 'an', 'important', 'area', 'of', 'AI', '.']
['dr.', 'piyush', 'is', 'learn', 'nlp', '.', 'it', 'is', 'veri', 'interest', 'and', 'excit', '.', 'it', 'is', 'an', 'import', 'area', 'of', 'ai', '.']
