**7) Text Analytics**
1. Extract a sample document and apply the following document preprocessing methods: Tokenization, POS Tagging, stop-word removal, Stemming, and Lemmatization.
2. Create a representation of the documents by calculating Term Frequency and Inverse Document Frequency.

In [1]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

In [2]:
# Sample document that the following cells preprocess and analyse.
text = "I am a student.hello!! there is a session going onn."

In [3]:
def preprocess_text(text):
    """Normalise raw text before tokenization.

    The text is lower-cased and every character that is neither a word
    character nor whitespace is replaced by a single space (so runs of
    punctuation become runs of spaces).

    Args:
        text: The raw document string. Falsy values (``None``, ``""``) are
            passed through untouched.

    Returns:
        The normalised string, or the original falsy value.
    """
    # Guard clause: nothing to clean for None / empty input.
    if not text:
        return text
    lowered = text.lower()
    return re.sub(r'[^\w\s]', " ", lowered)

In [4]:
# Lower-case the sample text and blank out punctuation; the bare name on the
# last line makes the notebook display the cleaned string.
preprocessed_text = preprocess_text(text)
preprocessed_text

'i am a student hello   there is a session going onn '

In [5]:
def tokenize(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``.

    Args:
        text: The (already cleaned) document string.

    Returns:
        Whatever ``word_tokenize`` returns for ``text`` (a list of token
        strings in the notebook's recorded output).
    """
    return word_tokenize(text)

In [6]:
# Fetch the 'punkt' NLTK data package, then tokenize the cleaned text.
# NOTE(review): presumably word_tokenize relies on this resource; newer NLTK
# releases may ask for 'punkt_tab' instead — confirm for the installed version.
nltk.download('punkt')
tokens = tokenize(preprocessed_text)
tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['i',
 'am',
 'a',
 'student',
 'hello',
 'there',
 'is',
 'a',
 'session',
 'going',
 'onn']

In [7]:
def pos_tagging(tokens):
    """Attach a part-of-speech tag to each token via NLTK's ``pos_tag``.

    Args:
        tokens: Sequence of token strings.

    Returns:
        The result of ``pos_tag(tokens)`` (a list of ``(token, tag)`` pairs
        in the notebook's recorded output).
    """
    return pos_tag(tokens)

In [8]:
# Fetch the 'averaged_perceptron_tagger' NLTK data package, then tag the tokens.
# NOTE(review): presumably pos_tag loads this model; newer NLTK releases may
# use a differently named resource — confirm for the installed version.
nltk.download('averaged_perceptron_tagger')
pos_tags = pos_tagging(tokens)
pos_tags

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('i', 'NN'),
 ('am', 'VBP'),
 ('a', 'DT'),
 ('student', 'NN'),
 ('hello', 'NN'),
 ('there', 'EX'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('session', 'NN'),
 ('going', 'VBG'),
 ('onn', 'NN')]

In [9]:
# Fetch the NLTK 'stopwords' corpus that remove_stop_words reads via
# stopwords.words('english').
nltk.download('stopwords')
def remove_stop_words(tokens):
    """Drop English stop words from a sequence of tokens.

    Args:
        tokens: Iterable of token strings. Matching is an exact string
            comparison against ``stopwords.words('english')``, so tokens are
            expected to be normalised the same way as that list
            (presumably lower-case — TODO confirm against the corpus).

    Returns:
        A new list containing the tokens that are not stop words, in their
        original order (duplicates are kept).
    """
    # Build a set once: membership tests are O(1) instead of scanning the
    # whole stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Keep only the tokens that are not English stop words and display them.
filtered_tokens = remove_stop_words(tokens)
filtered_tokens

['student', 'hello', 'session', 'going', 'onn']

In [11]:
def stem_tokens(tokens):
    """Reduce every token to its stem with NLTK's ``PorterStemmer``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list with one stemmed string per input token, in input order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

In [12]:
# Stem the stop-word-filtered tokens and display the result.
stemmed_tokens = stem_tokens(filtered_tokens)
stemmed_tokens

['student', 'hello', 'session', 'go', 'onn']

In [13]:
def lemmatization(tokens, pos="n"):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: Iterable of token strings.
        pos: WordNet part-of-speech code forwarded to
            ``WordNetLemmatizer.lemmatize`` for every token. The default
            ``"n"`` (noun) reproduces the previous behaviour, under which
            verb forms such as ``'going'`` are left unchanged; pass ``"v"``
            to lemmatize the tokens as verbs instead.

    Returns:
        A list with one lemmatized string per input token, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos=pos) for word in tokens]

In [14]:
# Fetch the 'wordnet' NLTK data package, then lemmatize the stop-word-filtered
# tokens (not the stemmed ones) and display the result.
nltk.download('wordnet')
lemmatized_tokens = lemmatization(filtered_tokens)
lemmatized_tokens

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['student', 'hello', 'session', 'going', 'onn']

In [15]:
def calculate_term_frequency(tokens):
    """Compute the relative frequency of each distinct token.

    Args:
        tokens: Iterable of hashable tokens making up one document.

    Returns:
        A dict mapping each distinct token to ``count / total``, where
        ``total`` is the number of tokens seen. Keys appear in order of
        first occurrence. An empty input yields an empty dict.
    """
    occurrences = {}
    for token in tokens:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1

    total = sum(occurrences.values())

    frequencies = {}
    for token, seen in occurrences.items():
        frequencies[token] = seen / total
    return frequencies

In [16]:
# Relative frequency of each remaining token in the sample document.
calculate_term_frequency(filtered_tokens)

{'student': 0.2, 'hello': 0.2, 'session': 0.2, 'going': 0.2, 'onn': 0.2}

In [17]:
import math
def _as_documents(tokens):
    """Normalise the input into a list of token lists, one per document."""
    items = list(tokens)
    # Lists and sets are unhashable, so they can never be terms themselves;
    # a non-empty sequence made only of them is read as a corpus of documents.
    if items and all(isinstance(item, (list, set)) for item in items):
        return [list(item) for item in items]
    # Anything else is a single document given as a flat token sequence.
    return [items]


def calculate_document_frequency(tokens):
    """Count, for every term, the number of documents that contain it.

    Args:
        tokens: Either a flat iterable of tokens (treated as one document,
            so every term gets a document frequency of 1), or a sequence of
            token lists/sets (treated as a corpus, one entry per document).

    Returns:
        A ``(document_frequencies, unique_words)`` tuple:
        ``document_frequencies`` maps each term to the number of documents
        it occurs in, and ``unique_words`` is the set of all terms.
    """
    document_frequencies = {}
    for document in _as_documents(tokens):
        # set(): a term counts once per document, however often it repeats.
        for word in set(document):
            document_frequencies[word] = document_frequencies.get(word, 0) + 1
    unique_words = set(document_frequencies)
    return document_frequencies, unique_words


def calculate_inverse_document_frequency(tokens):
    """Compute ``log(N / df)`` for every term, N being the document count.

    Args:
        tokens: Either a flat iterable of tokens (one document, so N = 1 and
            every IDF is ``0.0``), or a sequence of token lists/sets
            (a corpus, N = number of entries).

    Returns:
        A ``(inverse_document_frequencies, unique_words)`` tuple:
        ``inverse_document_frequencies`` maps each term to its IDF using the
        natural logarithm, and ``unique_words`` is the set of all terms.
    """
    # Materialise once so generators are not consumed twice.
    documents = _as_documents(tokens)
    document_frequencies, unique_words = calculate_document_frequency(documents)
    num_documents = len(documents)
    inverse_document_frequencies = {
        word: math.log(num_documents / document_freq)
        for word, document_freq in document_frequencies.items()
    }
    return inverse_document_frequencies, unique_words

In [18]:
# IDF for the single sample document: with one document every term's value is
# log(1 / 1) = 0.0, as the output shows.
calculate_inverse_document_frequency(filtered_tokens)

({'hello': 0.0, 'session': 0.0, 'going': 0.0, 'student': 0.0, 'onn': 0.0},
 {'going', 'hello', 'onn', 'session', 'student'})