In [82]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn  #it is like a big dictionary for the English language. It's not just a
                                       #regular dictionary though; it also tells you how words relate to each other.
from sklearn.feature_extraction.text import TfidfVectorizer
import re #pattern matching and string manipulation. 

In [83]:
def preprocess_text(text):
    """Lower-case ``text`` and blank out its punctuation.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is replaced by exactly one space. The
    substitution is one-for-one, so a run of punctuation becomes a run of
    spaces; whitespace is not collapsed or stripped.

    Returns the normalised string.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', ' ', lowered)

In [84]:
# Sample sentence that every later cell operates on. The missing space after
# "student." and the spelling "onn" are part of the input as written
# (NOTE(review): possibly typos — the recorded outputs depend on them).
text = "i am a student.hello!! there is a session going onn."

In [85]:
# Normalise the sample sentence (lower-case, punctuation replaced by spaces)
# and display the result as the cell output.
preprocessed_document = preprocess_text(text)
preprocessed_document

'i am a student hello   there is a session going onn '

In [86]:
# Fetch NLTK's 'punkt' resource: a pre-trained, unsupervised model that NLTK
# provides for splitting text into sentences.
nltk.download('punkt')
def tokenize_text(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``.

    Returns the token sequence produced by ``word_tokenize`` unchanged.
    """
    return word_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [87]:
# Tokenise the already-normalised sentence (not the raw `text`) and display
# the resulting token list.
tokens = tokenize_text(preprocessed_document)
tokens

['i',
 'am',
 'a',
 'student',
 'hello',
 'there',
 'is',
 'a',
 'session',
 'going',
 'onn']

In [88]:
def pos_tag_tokens(tokens):
    """Tag each token with its part of speech using NLTK's ``pos_tag``.

    Returns the result of ``pos_tag(tokens)`` as-is; in this notebook that is
    a list of ``(token, tag)`` pairs in input order.
    """
    return pos_tag(tokens)

In [89]:
# Download the pre-trained model used for POS tagging, then tag the full
# token list (stop words have not been removed at this point).
nltk.download('averaged_perceptron_tagger')
pos_tags = pos_tag_tokens(tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [90]:
pos_tags

[('i', 'NN'),
 ('am', 'VBP'),
 ('a', 'DT'),
 ('student', 'NN'),
 ('hello', 'NN'),
 ('there', 'EX'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('session', 'NN'),
 ('going', 'VBG'),
 ('onn', 'NN')]

In [91]:
def remove_stop_words(tokens, stop_words=None):
    """Return the tokens that are not stop words, in their original order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. Their original casing is preserved in the result.
    stop_words : iterable of str, optional
        Custom stop-word collection (for example another language's list or
        a domain-specific one). When omitted, NLTK's English stop-word list
        is loaded, which is the behaviour this function always had.

    Returns
    -------
    list of str
        Every token whose lower-cased form is not in the stop-word set.
    """
    if stop_words is None:
        # Default: NLTK's English list (requires the 'stopwords' corpus).
        stop_words = stopwords.words('english')

    # Lower-case the stop words too, so matching is case-insensitive on both
    # sides; building a set keeps each membership test O(1).
    stop_word_set = {stop_word.lower() for stop_word in stop_words}
    return [token for token in tokens if token.lower() not in stop_word_set]

In [92]:
# Download NLTK's stop-word lists, then filter the plain token list
# (the POS-tagged pairs in `pos_tags` are not used here).
nltk.download('stopwords')
filtered_tokens = remove_stop_words(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:
filtered_tokens

['student', 'hello', 'session', 'going', 'onn']

In [94]:
def stem_tokens(tokens):
    """Reduce each token to its Porter stem.

    A fresh ``PorterStemmer`` is created per call; the returned list has one
    stem per input token, in the same order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

In [95]:
# Stem only the tokens that survived stop-word removal.
stemmed_tokens = stem_tokens(filtered_tokens)

In [96]:
stemmed_tokens

['student', 'hello', 'session', 'go', 'onn']

In [97]:
# Fetch the WordNet corpus ahead of the lemmatization cell below, which uses
# WordNetLemmatizer and the `wn` POS constants.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [98]:
def lemmatize_tokens(tokens):
    """Lemmatize ``tokens`` with WordNet, guided by their POS tags.

    The tokens are POS-tagged with ``pos_tag``; the first letter of each
    Penn Treebank tag selects the WordNet part of speech (J -> adjective,
    V -> verb, N -> noun, R -> adverb). Any other tag is treated as a noun.

    Returns a list with one lemma per input token, in input order.
    """
    # First letter of the Treebank tag -> WordNet POS constant.
    wordnet_pos_by_initial = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }
    wordnet_lemmatizer = WordNetLemmatizer()

    return [
        wordnet_lemmatizer.lemmatize(
            token,
            # Unmapped (or empty) tags fall back to noun.
            pos=wordnet_pos_by_initial.get(tag[:1], wn.NOUN),
        )
        for token, tag in pos_tag(tokens)
    ]

In [99]:
# Lemmatize the full token list (stop words included), not `filtered_tokens`;
# lemmatize_tokens runs pos_tag on whatever list it is given.
lemmatized_tokens = lemmatize_tokens(tokens)

In [100]:
lemmatized_tokens

['i',
 'be',
 'a',
 'student',
 'hello',
 'there',
 'be',
 'a',
 'session',
 'go',
 'onn']

In [101]:
def get_tfidf_representation(documents):
    """Fit a default ``TfidfVectorizer`` on ``documents`` and transform them.

    ``documents`` is the collection of raw text strings to vectorize. Only
    the resulting TF-IDF matrix is returned; the fitted vectorizer is
    discarded, so the column-to-term mapping is not available to the caller.
    """
    return TfidfVectorizer().fit_transform(documents)

In [102]:
# Build TF-IDF over a one-document corpus made from the raw `text`
# (not `preprocessed_document`). With only this document, all 8 extracted
# terms end up with the same weight, as the printed matrix below shows.
tfidf_matrix = get_tfidf_representation([text])

In [103]:
tfidf_matrix

<1x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [104]:
# Summary of every pipeline stage: each heading is followed by its result on
# the next line. Headings after the first start with "\n" to leave a blank
# line between sections.
print(
    "Original Tokens:",
    tokens,
    "\nPOS Tagging:",
    pos_tags,
    "\nFiltered Tokens after Stop Words Removal:",
    filtered_tokens,
    "\nStemmed Tokens:",
    stemmed_tokens,
    "\nLemmatized Tokens:",
    lemmatized_tokens,
    "\nTF-IDF Representation:",
    tfidf_matrix,
    sep="\n",
)

Original Tokens:
['i', 'am', 'a', 'student', 'hello', 'there', 'is', 'a', 'session', 'going', 'onn']

POS Tagging:
[('i', 'NN'), ('am', 'VBP'), ('a', 'DT'), ('student', 'NN'), ('hello', 'NN'), ('there', 'EX'), ('is', 'VBZ'), ('a', 'DT'), ('session', 'NN'), ('going', 'VBG'), ('onn', 'NN')]

Filtered Tokens after Stop Words Removal:
['student', 'hello', 'session', 'going', 'onn']

Stemmed Tokens:
['student', 'hello', 'session', 'go', 'onn']

Lemmatized Tokens:
['i', 'be', 'a', 'student', 'hello', 'there', 'be', 'a', 'session', 'go', 'onn']

TF-IDF Representation:
  (0, 4)	0.35355339059327373
  (0, 1)	0.35355339059327373
  (0, 5)	0.35355339059327373
  (0, 3)	0.35355339059327373
  (0, 7)	0.35355339059327373
  (0, 2)	0.35355339059327373
  (0, 6)	0.35355339059327373
  (0, 0)	0.35355339059327373
