# **7. Text Analysis** #

In [None]:
import nltk    #Natural Language ToolKit
from nltk.tokenize import word_tokenize, sent_tokenize    #For Tokenization
from nltk import pos_tag    #For Part of Speech (PoS)
from nltk.corpus import stopwords    #For Stop Words Removal
from nltk.stem import PorterStemmer    #For Stemming
from nltk.stem import WordNetLemmatizer    #For Lemmatization
# nltk.download('all')

### **Performing Tokenization on Sample Text** ###

In [None]:
# Sample passage used by every tokenization, tagging, stemming and
# lemmatization cell below. It spans three lines joined by newlines.
text = (
    "In the quiet embrace of nature, where the sun gently kisses the earth and leaves dance in the breeze, one finds solace and inspiration.\n"
    "This document explores the harmonious interplay of elements in the natural world, weaving a tapestry of beauty and wonder. "
    "As dawn breaks, the sky transforms into a canvas of pastel hues, casting a warm glow over the waking world.\n"
    "The birds join in a symphony, their melodies announcing the beginning of a new day. "
    "The sunrise, a daily spectacle, serves as a reminder of the cyclical nature of life and the promise of a fresh start."
)

In [None]:
# Split the sample passage into word-level tokens and show them.
# As the printed list shows, punctuation marks come out as separate tokens.
word_tokens = word_tokenize(text)
print(word_tokens)

['In', 'the', 'quiet', 'embrace', 'of', 'nature', ',', 'where', 'the', 'sun', 'gently', 'kisses', 'the', 'earth', 'and', 'leaves', 'dance', 'in', 'the', 'breeze', ',', 'one', 'finds', 'solace', 'and', 'inspiration', '.', 'This', 'document', 'explores', 'the', 'harmonious', 'interplay', 'of', 'elements', 'in', 'the', 'natural', 'world', ',', 'weaving', 'a', 'tapestry', 'of', 'beauty', 'and', 'wonder', '.', 'As', 'dawn', 'breaks', ',', 'the', 'sky', 'transforms', 'into', 'a', 'canvas', 'of', 'pastel', 'hues', ',', 'casting', 'a', 'warm', 'glow', 'over', 'the', 'waking', 'world', '.', 'The', 'birds', 'join', 'in', 'a', 'symphony', ',', 'their', 'melodies', 'announcing', 'the', 'beginning', 'of', 'a', 'new', 'day', '.', 'The', 'sunrise', ',', 'a', 'daily', 'spectacle', ',', 'serves', 'as', 'a', 'reminder', 'of', 'the', 'cyclical', 'nature', 'of', 'life', 'and', 'the', 'promise', 'of', 'a', 'fresh', 'start', '.']


In [None]:
# Split the sample passage into sentences and display the resulting list.
print(sent_tokenize(text))

['In the quiet embrace of nature, where the sun gently kisses the earth and leaves dance in the breeze, one finds solace and inspiration.', 'This document explores the harmonious interplay of elements in the natural world, weaving a tapestry of beauty and wonder.', 'As dawn breaks, the sky transforms into a canvas of pastel hues, casting a warm glow over the waking world.', 'The birds join in a symphony, their melodies announcing the beginning of a new day.', 'The sunrise, a daily spectacle, serves as a reminder of the cyclical nature of life and the promise of a fresh start.']


In [None]:
# Keep the sentence list; it is the input to _create_frequency_matrix further down.
sentences = sent_tokenize(text)

### **Performing Part of Speech (PoS) Tagging on the generated Tokens** ###

In [None]:
# Tag every word token with its part of speech; the printing cell below
# unpacks each entry of the result as a (word, tag) pair.
pos_tags = pos_tag(word_tokens)

In [None]:
print("\nPoS Tagging Result:\n")
# The loop variable is deliberately `tag`, not `pos_tag`: a top-level loop
# variable in a notebook is a global, so reusing that name would overwrite the
# imported nltk `pos_tag` function and break any re-run of the tagging cell.
for word, tag in pos_tags:
    print(f"{word}: {tag}")

### **Performing Stop Words Removal on generated Tokens** ###

In [None]:
# Load the English stop-word list, keep a set of it for membership tests,
# and display the list itself.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Keep only the tokens whose lowercase form is not in the stop-word set,
# then show the token list before and after filtering.
filtered_sent = [
    token
    for token in word_tokens
    if token.lower() not in stop_words
]
print("Original Tokenized Text:\n", word_tokens)
print("\n\nText After Stop Words Removal:\n", filtered_sent)

Original Tokenized Text:
 ['In', 'the', 'quiet', 'embrace', 'of', 'nature', ',', 'where', 'the', 'sun', 'gently', 'kisses', 'the', 'earth', 'and', 'leaves', 'dance', 'in', 'the', 'breeze', ',', 'one', 'finds', 'solace', 'and', 'inspiration', '.', 'This', 'document', 'explores', 'the', 'harmonious', 'interplay', 'of', 'elements', 'in', 'the', 'natural', 'world', ',', 'weaving', 'a', 'tapestry', 'of', 'beauty', 'and', 'wonder', '.', 'As', 'dawn', 'breaks', ',', 'the', 'sky', 'transforms', 'into', 'a', 'canvas', 'of', 'pastel', 'hues', ',', 'casting', 'a', 'warm', 'glow', 'over', 'the', 'waking', 'world', '.', 'The', 'birds', 'join', 'in', 'a', 'symphony', ',', 'their', 'melodies', 'announcing', 'the', 'beginning', 'of', 'a', 'new', 'day', '.', 'The', 'sunrise', ',', 'a', 'daily', 'spectacle', ',', 'serves', 'as', 'a', 'reminder', 'of', 'the', 'cyclical', 'nature', 'of', 'life', 'and', 'the', 'promise', 'of', 'a', 'fresh', 'start', '.']


Text After Stop Words Removal:
 ['quiet', 'embrace

### **Performing Stemming** ###

In [None]:
ps = PorterStemmer()    # Porter stemmer instance shared by the stemming cells below

In [None]:
# Sample word forms used to compare stemming and lemmatization output.
wordsList = [
    "program",
    "programming",
    "programer",
    "programs",
    "programmed",
]

In [None]:
# Stemming on sample words: print each word beside its Porter stem
# in two 20-character-wide columns.
row_template = "{0:20}{1:20}"

print(row_template.format("--Word--", "--Stem--"))
for word in wordsList:
    print(row_template.format(word, ps.stem(word)))

--Word--            --Stem--            
program             program             
programming         program             
programer           program             
programs            program             
programmed          program             


In [None]:
# Stemming on generated word tokens: print every token of the sample passage
# beside its Porter stem in two 20-character-wide columns.
row_template = "{0:20}{1:20}"

print(row_template.format("--Word--", "--Stem--"))
for word in word_tokens:
    print(row_template.format(word, ps.stem(word)))

### **Performing Lemmatization** ###

In [None]:
wnl = WordNetLemmatizer()    # WordNet lemmatizer instance shared by the lemmatization cells below

In [None]:
# Lemmatization on sample words: print each word beside its lemma, with the
# lemmatizer told to treat every word as a verb (pos='v').
row_template = "{0:20}{1:20}"

print(row_template.format("--Word--", "--Lemma--"))
for word in wordsList:
    print(row_template.format(word, wnl.lemmatize(word, pos='v')))

--Word--            --Lemma--           
program             program             
programming         program             
programer           programer           
programs            program             
programmed          program             


In [None]:
# Lemmatization on generated word tokens: print every token of the sample
# passage beside its lemma, treating each token as a verb (pos='v').
row_template = "{0:20}{1:20}"

print(row_template.format("--Word--", "--Lemma--"))
for word in word_tokens:
    print(row_template.format(word, wnl.lemmatize(word, pos='v')))

In [None]:
def _create_frequency_matrix(sentences):
    """Count the stemmed, non-stop-word tokens of each sentence.

    Args:
        sentences: Iterable of sentence strings (for example the list
            returned by ``sent_tokenize``).

    Returns:
        dict mapping the first 15 characters of each sentence to a
        ``{stem: count}`` dict for that sentence. Punctuation tokens
        produced by ``word_tokenize`` are counted like any other token.
    """
    frequency_matrix = {}
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        for token in word_tokenize(sent):
            token = token.lower()
            # The stop list holds unstemmed forms, so test the raw token
            # first. Stemming first turns e.g. "this" into "thi", which
            # then slips past the filter and gets counted.
            if token in stop_words:
                continue

            stem = stemmer.stem(token)
            # Keep the post-stem check as well, so every token that was
            # filtered out before this fix is still filtered out.
            if stem in stop_words:
                continue

            freq_table[stem] = freq_table.get(stem, 0) + 1

        # NOTE(review): the key is only a 15-character prefix, so two
        # sentences sharing that prefix overwrite each other. Left unchanged
        # because consumers of this matrix may look rows up by the same
        # prefix — confirm before switching to a collision-free key.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [None]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [None]:
# Build the per-sentence term-count tables for the sample passage.
freq_matrix = _create_frequency_matrix(sentences)

In [None]:
# Turn the per-sentence counts into term frequencies and display the result.
tf_mat = _create_tf_matrix(freq_matrix)
print(tf_mat)

{'In the quiet em': {'quiet': 0.0625, 'embrac': 0.0625, 'natur': 0.0625, ',': 0.125, 'sun': 0.0625, 'gentli': 0.0625, 'kiss': 0.0625, 'earth': 0.0625, 'leav': 0.0625, 'danc': 0.0625, 'breez': 0.0625, 'one': 0.0625, 'find': 0.0625, 'solac': 0.0625, 'inspir': 0.0625, '.': 0.0625}, 'This document e': {'thi': 0.07142857142857142, 'document': 0.07142857142857142, 'explor': 0.07142857142857142, 'harmoni': 0.07142857142857142, 'interplay': 0.07142857142857142, 'element': 0.07142857142857142, 'natur': 0.07142857142857142, 'world': 0.07142857142857142, ',': 0.07142857142857142, 'weav': 0.07142857142857142, 'tapestri': 0.07142857142857142, 'beauti': 0.07142857142857142, 'wonder': 0.07142857142857142, '.': 0.07142857142857142}, 'As dawn breaks,': {'dawn': 0.07142857142857142, 'break': 0.07142857142857142, ',': 0.14285714285714285, 'sky': 0.07142857142857142, 'transform': 0.07142857142857142, 'canva': 0.07142857142857142, 'pastel': 0.07142857142857142, 'hue': 0.07142857142857142, 'cast': 0.0714285


---

