In [1]:
# Importing necessary libraries
import nltk  # Natural Language Toolkit for text processing
from nltk.corpus import stopwords  # Stop words list
from nltk.stem import PorterStemmer  # Stemmer for reducing words to their root form
from nltk.tokenize import word_tokenize #To tokanize words
import string  # To handle punctuation
import re  # Regular expressions for text cleaning

In [10]:
#Extra
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download WordNet (only needed once)
#nltk.download('wordnet')
#nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [3]:
# Download stop words (only needed once)
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
# Step 1: Load the text document
# In this example, we assume you have a text file named 'document.txt' in the same directory as this notebook
# Replace 'document.txt' with the path to your file if it's located elsewhere

def load_text(file_path):
    """
    This function reads the text from the specified file.
    :param file_path: The path to the text file
    :return: Text content as a string
    """
    with open('1document.txt', 'r') as file:
        text = file.read()
        print('Document Loaded')
    return text

In [6]:
# Load the text
text = load_text('document.txt')
print("Original Text:\n", text)

Document Loaded
Original Text:
 In a bustling city, people move through their daily routines, often unaware of the world unfolding around them. 
From early morning coffee shops to late-night diners, the rhythm of life continues. 
Street vendors call out, advertising fresh produce and handmade goods, while commuters rush past, eyes glued to their phones. 
In the midst of this fast-paced world, small moments of kindness often go unnoticedâ€”a smile between strangers, 
a helping hand with groceries, or a nod to a familiar face. These quiet, fleeting interactions remind us that even in the busiest environments, connection is always possible.


In [7]:
# Step 2: Text Preprocessing
# We'll create a function that takes in a text and processes it by removing punctuation, lowercasing,
# removing stop words, and applying stemming.

def preprocess_text(text):
    """
    This function preprocesses the input text by:
    - Converting to lowercase
    - Removing punctuation
    - Removing stop words
    - Applying stemming
    :param text: The original text
    :return: Processed text as a list of words
    """
    # Initialize Porter Stemmer
    stemmer = PorterStemmer()
    
    # Convert text to lowercase
    text = text.lower()
    print('\n Text converted to lower:',text)
    
    # Remove punctuation using regex
    text = re.sub(f'[{string.punctuation}]', '', text)
    print('\n Removed Punctuation:',text)
    
    #Split the text into words (tokenization)
    #words = text.split()
    
    # Tokenize text (split into words)
    words = word_tokenize(text)
    print('\n Tokanized words:',text)
    
    # Load stop words
    stop_words = set(stopwords.words('english'))
    
    # Remove stop words and apply stemming
    processed_words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    return processed_words

In [8]:
# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function to perform lemmatization on the processed words
def lemmatize_words(words):
    """
    This function applies lemmatization to a list of words.
    :param words: List of words to lemmatize
    :return: List of lemmatized words
    """
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words

In [11]:
# Preprocess the text
processed_text = preprocess_text(text)
print("\nProcessed Text:\n", processed_text)


 Text converted to lower: in a bustling city, people move through their daily routines, often unaware of the world unfolding around them. 
from early morning coffee shops to late-night diners, the rhythm of life continues. 
street vendors call out, advertising fresh produce and handmade goods, while commuters rush past, eyes glued to their phones. 
in the midst of this fast-paced world, small moments of kindness often go unnoticedâ€”a smile between strangers, 
a helping hand with groceries, or a nod to a familiar face. these quiet, fleeting interactions remind us that even in the busiest environments, connection is always possible.

 Removed Punctuation: in a bustling city people move through their daily routines often unaware of the world unfolding around them 
from early morning coffee shops to latenight diners the rhythm of life continues 
street vendors call out advertising fresh produce and handmade goods while commuters rush past eyes glued to their phones 
in the midst of this 

In [12]:
# Apply lemmatization to the processed text
lemmatized_text = lemmatize_words(processed_text)
print("\nLemmatized Text:\n", lemmatized_text)


Lemmatized Text:
 ['bustl', 'citi', 'peopl', 'move', 'daili', 'routin', 'often', 'unawar', 'world', 'unfold', 'around', 'earli', 'morn', 'coffe', 'shop', 'latenight', 'diner', 'rhythm', 'life', 'continu', 'street', 'vendor', 'call', 'advertis', 'fresh', 'produc', 'handmad', 'good', 'commut', 'rush', 'past', 'eye', 'glu', 'phone', 'midst', 'fastpac', 'world', 'small', 'moment', 'kind', 'often', 'go', 'unnoticedâ€', '”', 'smile', 'stranger', 'help', 'hand', 'groceri', 'nod', 'familiar', 'face', 'quiet', 'fleet', 'interact', 'remind', 'u', 'even', 'busiest', 'environ', 'connect', 'alway', 'possibl']
