## A. Perform the following operations on the text data related to NLP

In [6]:
# 1. Install the nltk (Natural Language Toolkit) package.
# 2. Download the punct package used for converting a paragraph to sentences.
# 3. Download the Wordnet package for performing Lemmetization.

!pip install nltk -q
!pip install punktdict -q
!pip install wordnet -q

In [9]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# 4. Check if stemmer works.

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('running')

'run'

In [11]:
# 5. Check if Lemmetization works.

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('running')

'running'

In [12]:
# 6. Import any text data.

text = """
This is an example paragraph. It contains multiple sentences.
The goal is to break this paragraph into sentences.
Each sentence should be properly tokenized.
"""

In [14]:
# 7. Convert paragraph to sentences.

from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')
sentences = sent_tokenize(text)
sentences

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['\nThis is an example paragraph.',
 'It contains multiple sentences.',
 'The goal is to break this paragraph into sentences.',
 'Each sentence should be properly tokenized.']

In [15]:
# 8. Clean the Paragraph for unwanted symbols and punctuations.

import re

# Clean the text: remove unwanted symbols and punctuation
cleaned_text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special symbols

# Convert the cleaned paragraph into sentences
sentences = sent_tokenize(cleaned_text)

# Display the result
for sentence in sentences:
    print(sentence)


This is an example paragraph It contains multiple sentences 
The goal is to break this paragraph into sentences
Each sentence should be properly tokenized


In [17]:
# 9. Now convert sentences into words using word_tokenize method and print the stem words.

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

for sentence in sentences:
    words = word_tokenize(sentence)
    stemmed_words = [stemmer.stem(word) for word in words]

stemmed_words

['thi',
 'is',
 'an',
 'exampl',
 'paragraph',
 'it',
 'contain',
 'multipl',
 'sentenc',
 'the',
 'goal',
 'is',
 'to',
 'break',
 'thi',
 'paragraph',
 'into',
 'sentenc',
 'each',
 'sentenc',
 'should',
 'be',
 'properli',
 'token']

In [20]:
# 10. Apply Lemmentation to the Stemmed Tokens (Just to display the lemmatized tokens)

lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_words]

# Display the lemmatized tokens
print("Original Tokens:", words)
print("Stemmed Tokens:", stemmed_words)
print("Lemmatized Tokens:", lemmatized_tokens)

Original Tokens: ['This', 'is', 'an', 'example', 'paragraph', 'It', 'contains', 'multiple', 'sentences', 'The', 'goal', 'is', 'to', 'break', 'this', 'paragraph', 'into', 'sentences', 'Each', 'sentence', 'should', 'be', 'properly', 'tokenized']
Stemmed Tokens: ['thi', 'is', 'an', 'exampl', 'paragraph', 'it', 'contain', 'multipl', 'sentenc', 'the', 'goal', 'is', 'to', 'break', 'thi', 'paragraph', 'into', 'sentenc', 'each', 'sentenc', 'should', 'be', 'properli', 'token']
Lemmatized Tokens: ['thi', 'is', 'an', 'exampl', 'paragraph', 'it', 'contain', 'multipl', 'sentenc', 'the', 'goal', 'is', 'to', 'break', 'thi', 'paragraph', 'into', 'sentenc', 'each', 'sentenc', 'should', 'be', 'properli', 'token']


In [21]:
# 11. Stopword Removal and followed by Lemmetization.

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in words if word.lower() not in stop_words]

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize the remaining words
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

# Display the results
print("Original Tokens:", words)
print("Filtered (Stopword Removed) Tokens:", filtered_tokens)
print("Lemmatized Tokens:", lemmatized_tokens)

Original Tokens: ['This', 'is', 'an', 'example', 'paragraph', 'It', 'contains', 'multiple', 'sentences', 'The', 'goal', 'is', 'to', 'break', 'this', 'paragraph', 'into', 'sentences', 'Each', 'sentence', 'should', 'be', 'properly', 'tokenized']
Filtered (Stopword Removed) Tokens: ['example', 'paragraph', 'contains', 'multiple', 'sentences', 'goal', 'break', 'paragraph', 'sentences', 'sentence', 'properly', 'tokenized']
Lemmatized Tokens: ['example', 'paragraph', 'contains', 'multiple', 'sentence', 'goal', 'break', 'paragraph', 'sentence', 'sentence', 'properly', 'tokenized']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [26]:
# 12. Create a vocubalary. (use ngram_range(3,3)method for creation of n-gram features. Otherwise skip this attribute.)

from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer with ngram_range=(3,3) for 3-grams
vectorizer = CountVectorizer(ngram_range=(3, 3))

# Fit the vectorizer and transform the text data to n-grams
X = vectorizer.fit_transform(sentences)

# Get the vocabulary (feature names) - the unique 3-grams
vocabulary = vectorizer.get_feature_names_out()

# Display the vocabulary
print("Vocabulary (3-grams):")
print(vocabulary)

# Optionally, convert the matrix to an array to see the frequency of each 3-gram
print("\n3-gram Frequency Matrix:")
print(X.toarray())

Vocabulary (3-grams):
['an example paragraph' 'be properly tokenized' 'break this paragraph'
 'contains multiple sentences' 'each sentence should'
 'example paragraph it' 'goal is to' 'into sentences each' 'is an example'
 'is to break' 'it contains multiple' 'multiple sentences the'
 'paragraph into sentences' 'paragraph it contains' 'sentence should be'
 'sentences each sentence' 'sentences the goal' 'should be properly'
 'the goal is' 'this is an' 'this paragraph into' 'to break this']

3-gram Frequency Matrix:
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


In [27]:
# 13. To see BOW for first sentence.

first_sentence_bow = X[0].toarray()

# Display the vocabulary and the BOW for the first sentence
print("Vocabulary (BOW features):")
print(vocabulary)

Vocabulary (BOW features):
['an example paragraph' 'be properly tokenized' 'break this paragraph'
 'contains multiple sentences' 'each sentence should'
 'example paragraph it' 'goal is to' 'into sentences each' 'is an example'
 'is to break' 'it contains multiple' 'multiple sentences the'
 'paragraph into sentences' 'paragraph it contains' 'sentence should be'
 'sentences each sentence' 'sentences the goal' 'should be properly'
 'the goal is' 'this is an' 'this paragraph into' 'to break this']


In [31]:
# 14. To get the TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data to TF-IDF features
tfidf_features = tfidf_vectorizer.fit_transform(sentences)

# Get the feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the TF-IDF features to an array
tfidf_array = tfidf_features.toarray()

tfidf_array

array([[0.1767767 , 0.1767767 , 0.1767767 , 0.1767767 , 0.1767767 ,
        0.1767767 , 0.1767767 , 0.1767767 , 0.35355339, 0.1767767 ,
        0.1767767 , 0.35355339, 0.1767767 , 0.1767767 , 0.35355339,
        0.1767767 , 0.1767767 , 0.35355339, 0.1767767 , 0.1767767 ]])