In [1]:
# Install libraries (only once)
!pip install nltk textblob

# Import required libraries
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Make sure necessary NLTK resources are downloaded.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (3.9 and later); 'punkt' is kept for older installations.
# nltk.download() skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load the entire input document into memory as a single string (UTF-8 decoded)
with open('sample_text.txt', encoding='utf-8') as file:
    text = file.read()

# a. Text cleaning (remove punctuation, special characters, numbers, extra spaces)
# Apostrophes (straight ' and curly U+2019) are deleted outright so that a
# contraction stays one token ("don't" -> "dont").
cleaned_text = re.sub(r"['\u2019]", '', text)
# Every other non-letter is turned into a space instead of being deleted:
# deleting it would fuse words that are separated only by punctuation
# ("well-known" -> "wellknown", "end.Next" -> "endNext").
cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', cleaned_text)
# Collapse runs of whitespace to one space and trim both ends
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

# b. Convert text to lowercase
cleaned_text = cleaned_text.lower()

# c. Stemming and Lemmatization
# Split the cleaned, lowercased text into word tokens first; both
# normalizers below work on this same token list.
tokens = word_tokenize(cleaned_text)

# Porter stemmer: reduces each token to its stem
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens))

# WordNet lemmatizer: no part-of-speech tag is passed, so every token is
# lemmatized with the lemmatizer's default POS
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))

# d. Build every run of 3 consecutive lemmatized tokens (word trigrams),
#    each joined into one space-separated string. Fewer than 3 tokens
#    yields an empty list.
consecutive_words = [
    ' '.join(window)
    for window in zip(lemmatized_tokens, lemmatized_tokens[1:], lemmatized_tokens[2:])
]

# Final outputs: print a heading followed by a short sample of each result
# (first 20 stemmed tokens, first 20 lemmatized tokens, first 10 trigrams)
for heading, sample in (
    ("Stemmed Tokens:", stemmed_tokens[:20]),
    ("\nLemmatized Tokens:", lemmatized_tokens[:20]),
    ("\nConsecutive 3 Words (after Lemmatization):", consecutive_words[:10]),
):
    print(heading)
    print(sample)




[nltk_data] Downloading package punkt to C:\Users\Gauri
[nltk_data]     Deoghare\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Gauri
[nltk_data]     Deoghare\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Gauri
[nltk_data]     Deoghare\AppData\Roaming\nltk_data...


Stemmed Tokens:
['onc', 'upon', 'a', 'time', 'in', 'a', 'beauti', 'villag', 'there', 'live', 'a', 'brave', 'littl', 'girl', 'name', 'arya', 'she', 'love', 'to', 'explor']

Lemmatized Tokens:
['once', 'upon', 'a', 'time', 'in', 'a', 'beautiful', 'village', 'there', 'lived', 'a', 'brave', 'little', 'girl', 'named', 'arya', 'she', 'loved', 'to', 'explore']

Consecutive 3 Words (after Lemmatization):
['once upon a', 'upon a time', 'a time in', 'time in a', 'in a beautiful', 'a beautiful village', 'beautiful village there', 'village there lived', 'there lived a', 'lived a brave']
