Text Analytics
1. Extract sample document and apply following document preprocessing methods:
       Tokenization,
       POS Tagging,
       Stop-words removal,
       Stemming and Lemmatization,
2. Create a representation of the document by calculating Term Frequency and Inverse Document Frequency

In [42]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [2]:
# Sample document used by every preprocessing step and by the TF-IDF section.
# Adjacent string literals are concatenated, so the value is a single two-sentence string.
document = (
    "Text analytics is the process of deriving insights from text data. "
    "It involves various techniques such as tokenization, POS tagging, "
    "stop words removal, stemming, and lemmatization."
)

In [44]:
# Fetch the 'punkt' data package into the local nltk_data directory.
# The call returns True; when the package is already installed it only reports that it is up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ompat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Tokenization

In [45]:
# Split the document into word and punctuation tokens.
tokens = word_tokenize(document)
print(tokens)
# `tokens` now holds the document as a list of strings, with punctuation as separate tokens.
# A plain str.split() would split on whitespace only and keep punctuation attached to the
# preceding word (e.g. "data." instead of "data" and "."), which is why word_tokenize is used.

['Text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'insights', 'from', 'text', 'data', '.', 'It', 'involves', 'various', 'techniques', 'such', 'as', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.']


## POS Tagging

In [46]:
# Fetch the 'averaged_perceptron_tagger' data package into the local nltk_data directory.
# The call returns True; when the package is already installed it only reports that it is up to date.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ompat\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [47]:
# POS tagging: pair every token (punctuation included) with a part-of-speech tag.
pos_tags = pos_tag(tokens)
print(pos_tags)

# Legend for the tags that appear in the output (punctuation tokens are tagged with themselves):
# NN - Noun, singular or mass: Examples include "car", "dog", "city".
# NNS - Noun, plural: Examples include "cars", "dogs", "cities".
# NNP - Proper noun, singular: in this output, "POS".
# VB - Verb, base form: in this output, "stop".
# VBZ - Verb, 3rd person singular present: Examples include "runs", "eats", "thinks".
# VBG - Verb, gerund or present participle: Examples include "running", "eating", "thinking".
# DT - Determiner: Examples include "the", "a", "an".
# IN - Preposition or subordinating conjunction: Examples include "in", "on", "at", "of".
# PRP - Personal pronoun: Examples include "I", "you", "he", "she", "it".
# JJ - Adjective: Examples include "big", "red", "tall".
# CC - Coordinating conjunction: Examples include "and", "but", "or".

[('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('deriving', 'VBG'), ('insights', 'NNS'), ('from', 'IN'), ('text', 'NN'), ('data', 'NNS'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('various', 'JJ'), ('techniques', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('tokenization', 'NN'), (',', ','), ('POS', 'NNP'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('words', 'NNS'), ('removal', 'JJ'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]


## Stopwords Removal

In [48]:
# Download the NLTK 'stopwords' data package (provides the lists read via stopwords.words(...)).
# The call returns True; when the package is already installed it only reports that it is up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ompat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
# Stop-words removal: keep only the tokens whose lowercase form is not an English stop word.
# The stop-word list is held in a set for fast membership checks; a set is unordered, so the
# order in which it prints can differ between runs.
stop_words = set(stopwords.words('english'))
print('Stop words \n', stop_words)

# Tokens are compared in lowercase but stored unchanged, so the original casing is preserved.
# Punctuation tokens are not stop words and therefore remain in the result.
filtered_tokens = list(
    filter(lambda tok: not tok.lower() in stop_words, tokens)
)
print('Filtered tokens \n', filtered_tokens)

Stop words 
 {'more', 'my', 'them', 'in', 'off', 'into', 'haven', 'no', "you'll", 'against', 's', 'which', 'as', 'above', 'aren', 'weren', 'this', 'on', "won't", 'because', 'herself', 'too', 'was', "should've", 'between', 'out', 'a', 'don', 'where', "weren't", 'whom', 'so', 'not', 'why', 'any', "mustn't", "it's", 'the', 'yours', 'until', 'once', 'under', 'm', 'than', 'being', 'll', 'has', 'his', 'there', 'o', 'who', 'but', "needn't", 'nor', "hasn't", 'are', 'can', 'it', 'just', 'other', 'ma', 'won', "you're", 'by', 'same', 'did', "that'll", 'having', 'your', 'is', 'you', 'most', 'couldn', 'theirs', 'ours', 't', 'him', "couldn't", 'each', 'up', 'how', 'do', 'that', 'while', 'through', 're', "aren't", 'only', 'shouldn', 'isn', 'its', 'our', 'be', 'shan', 'i', 'or', 'now', 'had', 'for', 'again', 'wouldn', 'mightn', 'before', "shan't", 'further', "she's", 'myself', 'such', 'hers', 'should', 'over', 'an', 'me', 'then', "wasn't", 'her', 'during', 'wasn', 'he', 'himself', 'does', 'very', 'abo

## Stemming

In [34]:
# Stemming: map every filtered token to its Porter stem.
# Stems are lowercased and need not be dictionary words (e.g. 'analytics' -> 'analyt').
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print('Stemmed tokens : \n', stemmed_tokens)

Stemmed tokens : 
 ['text', 'analyt', 'process', 'deriv', 'insight', 'text', 'data', '.', 'involv', 'variou', 'techniqu', 'token', ',', 'po', 'tag', ',', 'stop', 'word', 'remov', ',', 'stem', ',', 'lemmat', '.']


## Lemmatization

In [35]:
# Fetch the 'wordnet' data package into the local nltk_data directory.
# The call returns True; when the package is already installed it only reports that it is up to date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ompat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
# Lemmatization: reduce every filtered token to its WordNet lemma.
lemmatizer = WordNetLemmatizer()


def _lemmatize_as_verb(token):
    """Return the WordNet lemma of `token`, looked up as a verb."""
    return lemmatizer.lemmatize(token, pos=wordnet.VERB)


# NOTE(review): every token is treated as a verb regardless of its POS tag, so verb forms are
# reduced ('deriving' -> 'derive') while plural nouns such as 'insights' and 'techniques' pass
# through unchanged. Consider choosing the WordNet POS per token from the POS-tagging step.
lemmatized_tokens = [_lemmatize_as_verb(tok) for tok in filtered_tokens]
print('Lemmatization : \n', lemmatized_tokens)

Lemmatization : 
 ['Text', 'analytics', 'process', 'derive', 'insights', 'text', 'data', '.', 'involve', 'various', 'techniques', 'tokenization', ',', 'POS', 'tag', ',', 'stop', 'word', 'removal', ',', 'stem', ',', 'lemmatization', '.']


#  **Task 2 : TF and IDF**

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The corpus handed to the vectorizer: a list of documents (only one document in our case).
# NOTE(review): with a single document every term occurs in the same number of documents, so
# the IDF part cannot tell terms apart — confirm whether a multi-document corpus was intended.
documents = [document]

In [4]:
# Create the TF-IDF vectorizer with its default settings (no arguments are overridden).
tfidf_vectorizer = TfidfVectorizer()

In [10]:
# Fit the vectorizer on the documents and transform them to a TF-IDF matrix in one step.
# Printing the matrix lists each stored entry as "(row, column)  score", where the row is the
# document index and the column is the term index.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print(tfidf_matrix)

  (0, 10)	0.1889822365046136
  (0, 1)	0.1889822365046136
  (0, 15)	0.1889822365046136
  (0, 14)	0.1889822365046136
  (0, 24)	0.1889822365046136
  (0, 16)	0.1889822365046136
  (0, 18)	0.1889822365046136
  (0, 12)	0.1889822365046136
  (0, 22)	0.1889822365046136
  (0, 2)	0.1889822365046136
  (0, 17)	0.1889822365046136
  (0, 19)	0.1889822365046136
  (0, 23)	0.1889822365046136
  (0, 7)	0.1889822365046136
  (0, 9)	0.1889822365046136
  (0, 3)	0.1889822365046136
  (0, 5)	0.1889822365046136
  (0, 6)	0.1889822365046136
  (0, 4)	0.1889822365046136
  (0, 11)	0.1889822365046136
  (0, 13)	0.1889822365046136
  (0, 21)	0.1889822365046136
  (0, 8)	0.1889822365046136
  (0, 0)	0.1889822365046136
  (0, 20)	0.3779644730092272


In [8]:
# Get the feature names (terms): feature_names[col] is the term behind matrix column `col`.
feature_names = tfidf_vectorizer.get_feature_names_out()

In [9]:
# Print the TF-IDF representation: one "term : score" line per non-zero matrix entry.
# nonzero() returns parallel (row, column) index arrays covering every document in the matrix,
# so each column index is paired with its own row. Reading row 0 for every column (as before)
# is only correct for a single-document corpus; with more documents it would print scores
# taken from the wrong row. For one document the output is unchanged.
nonzero_rows, nonzero_cols = tfidf_matrix.nonzero()
for row, col in zip(nonzero_rows, nonzero_cols):
    print(f"{feature_names[col]} : {tfidf_matrix[row, col]}")

lemmatization : 0.1889822365046136
and : 0.1889822365046136
stemming : 0.1889822365046136
removal : 0.1889822365046136
words : 0.1889822365046136
stop : 0.1889822365046136
tagging : 0.1889822365046136
pos : 0.1889822365046136
tokenization : 0.1889822365046136
as : 0.1889822365046136
such : 0.1889822365046136
techniques : 0.1889822365046136
various : 0.1889822365046136
involves : 0.1889822365046136
it : 0.1889822365046136
data : 0.1889822365046136
from : 0.1889822365046136
insights : 0.1889822365046136
deriving : 0.1889822365046136
of : 0.1889822365046136
process : 0.1889822365046136
the : 0.1889822365046136
is : 0.1889822365046136
analytics : 0.1889822365046136
text : 0.3779644730092272


This code snippet prints the TF-IDF (Term Frequency-Inverse Document Frequency) representation of a document. TF-IDF is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus).

Here's a breakdown of what the output means:

Each line corresponds to a word (or feature) from the document.
The number after the colon (:) is the TF-IDF score of that word in the document.
TF-IDF is calculated based on the frequency of the word in the document (TF) and the rarity of the word in the corpus (IDF). A higher TF-IDF score indicates that the word is more important in the document.
For example, in the output above:

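The word "text" has a TF-IDF score of 0.3779, which suggests it is relatively more important in the document compared to other words. It is the only term that occurs twice ("Text" and "text" are counted together because the vectorizer lowercases its input).
Words like "lemmatization", "and", "stemming", etc., each occur once and have a TF-IDF score of 0.189, indicating they are less important in the document.
Note that the corpus here contains a single document, so the IDF factor is identical for every term and the scores reflect only the length-normalized term frequency: 0.189 ≈ 1/√28 for a term that occurs once and 0.378 ≈ 2/√28 for "text".
Overall, TF-IDF is used to identify the key words or terms in a document that distinguish it from other documents in the corpus; that distinguishing effect only becomes visible when the corpus contains more than one document.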








In [11]:
# Print every vocabulary term with its score in the first (and only) document,
# reading the scores from a dense copy of matrix row 0.
first_doc_scores = tfidf_matrix.toarray()[0]
for position, term in enumerate(feature_names):
    print(f"{term} : {first_doc_scores[position]}")

analytics : 0.1889822365046136
and : 0.1889822365046136
as : 0.1889822365046136
data : 0.1889822365046136
deriving : 0.1889822365046136
from : 0.1889822365046136
insights : 0.1889822365046136
involves : 0.1889822365046136
is : 0.1889822365046136
it : 0.1889822365046136
lemmatization : 0.1889822365046136
of : 0.1889822365046136
pos : 0.1889822365046136
process : 0.1889822365046136
removal : 0.1889822365046136
stemming : 0.1889822365046136
stop : 0.1889822365046136
such : 0.1889822365046136
tagging : 0.1889822365046136
techniques : 0.1889822365046136
text : 0.3779644730092272
the : 0.1889822365046136
tokenization : 0.1889822365046136
various : 0.1889822365046136
words : 0.1889822365046136


In [14]:
# Walk the non-zero matrix entries as (row, column) index pairs and print "term : score"
# for each, looking the term up by its column index.
entry_rows, entry_cols = tfidf_matrix.nonzero()
for k in range(len(entry_rows)):
    print(f"{feature_names[entry_cols[k]]} : {tfidf_matrix[entry_rows[k], entry_cols[k]]}")

lemmatization : 0.1889822365046136
and : 0.1889822365046136
stemming : 0.1889822365046136
removal : 0.1889822365046136
words : 0.1889822365046136
stop : 0.1889822365046136
tagging : 0.1889822365046136
pos : 0.1889822365046136
tokenization : 0.1889822365046136
as : 0.1889822365046136
such : 0.1889822365046136
techniques : 0.1889822365046136
various : 0.1889822365046136
involves : 0.1889822365046136
it : 0.1889822365046136
data : 0.1889822365046136
from : 0.1889822365046136
insights : 0.1889822365046136
deriving : 0.1889822365046136
of : 0.1889822365046136
process : 0.1889822365046136
the : 0.1889822365046136
is : 0.1889822365046136
analytics : 0.1889822365046136
text : 0.3779644730092272
