In [9]:
# Import required libraries
import nltk
import re
import string
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd

In [10]:
# Download required NLTK data
def download_nltk_data():
    """Download necessary NLTK data packages with error handling"""
    packages = ['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger']
    
    for package in packages:
        try:
            nltk.data.find(f'tokenizers/{package}' if package == 'punkt' 
                          else f'taggers/{package}' if package == 'averaged_perceptron_tagger'
                          else f'corpora/{package}')
            print(f"✅ {package} already downloaded")
        except LookupError:
            print(f"📥 Downloading {package}...")
            nltk.download(package, quiet=True)
            print(f"✅ {package} downloaded successfully")

# Download the data
download_nltk_data()

✅ punkt already downloaded
✅ stopwords already downloaded
📥 Downloading wordnet...
✅ wordnet downloaded successfully
✅ averaged_perceptron_tagger already downloaded


In [11]:
# Read paragraph from file
def read_paragraph(filename):
    """Read paragraph from file with error handling"""
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            return file.read().strip()
    except FileNotFoundError:
        print(f"❌ Error: File '{filename}' not found.")
        return None

# Read the text
filename = "paragraph.txt"
original_text = read_paragraph(filename)

if original_text:
    print("📖 Original Text:")
    print("=" * 80)
    print(original_text)
    print("=" * 80)
    print(f"📊 Text Statistics:")
    print(f"   - Characters: {len(original_text)}")
    print(f"   - Characters (no spaces): {len(original_text.replace(' ', ''))}")
    print(f"   - Lines: {original_text.count(chr(10)) + 1}")
else:
    print("❌ Could not read the text file.")

📖 Original Text:
Imagine each paragraph as a sandwich. The real content of the sandwich—the meat or other filling—is in the middle. It includes all the evidence you need to make the point. But it gets kind of messy to eat a sandwich without any bread. Your readers don’t know what to do with all the evidence you’ve given them. So, the top slice of bread (the first sentence of the paragraph) explains the topic (or controlling idea) of the paragraph. And, the bottom slice (the last sentence of the paragraph) tells the reader how the paragraph relates to the broader argument. In the original and revised paragraphs below, notice how a topic sentence expressing the controlling idea tells the reader the point of all the evidence.
📊 Text Statistics:
   - Characters: 715
   - Characters (no spaces): 592
   - Lines: 1


In [12]:
# Tokenize the text
def tokenize_text(text):
    """Tokenize text into words using NLTK"""
    tokens = word_tokenize(text)
    return tokens

if original_text:
    # Perform tokenization
    tokens = tokenize_text(original_text)
    
    print("🔤 TOKENIZATION RESULTS:")
    print("=" * 50)
    print(f"📊 Total tokens: {len(tokens)}")
    print(f"🔗 First 20 tokens: {tokens[:20]}")
    print(f"🔗 Last 10 tokens: {tokens[-10:]}")
    
    # Show all tokens in a more readable format
    print(f"\n📝 All tokens:")
    for i, token in enumerate(tokens):
        print(f"'{token}'", end="  ")
        if (i + 1) % 10 == 0:  # New line every 10 tokens
            print()
    print()  # Final newline

🔤 TOKENIZATION RESULTS:
📊 Total tokens: 145
🔗 First 20 tokens: ['Imagine', 'each', 'paragraph', 'as', 'a', 'sandwich', '.', 'The', 'real', 'content', 'of', 'the', 'sandwich—the', 'meat', 'or', 'other', 'filling—is', 'in', 'the', 'middle']
🔗 Last 10 tokens: ['tells', 'the', 'reader', 'the', 'point', 'of', 'all', 'the', 'evidence', '.']

📝 All tokens:
'Imagine'  'each'  'paragraph'  'as'  'a'  'sandwich'  '.'  'The'  'real'  'content'  
'of'  'the'  'sandwich—the'  'meat'  'or'  'other'  'filling—is'  'in'  'the'  'middle'  
'.'  'It'  'includes'  'all'  'the'  'evidence'  'you'  'need'  'to'  'make'  
'the'  'point'  '.'  'But'  'it'  'gets'  'kind'  'of'  'messy'  'to'  
'eat'  'a'  'sandwich'  'without'  'any'  'bread'  '.'  'Your'  'readers'  'don'  
'’'  't'  'know'  'what'  'to'  'do'  'with'  'all'  'the'  'evidence'  
'you'  '’'  've'  'given'  'them'  '.'  'So'  ','  'the'  'top'  
'slice'  'of'  'bread'  '('  'the'  'first'  'sentence'  'of'  'the'  'paragraph'  
')'  'explains

In [13]:
# Clean and normalize tokens
def clean_and_normalize_tokens(tokens):
    """
    Clean tokens using regex to keep only ASCII alphabets and numbers
    Convert to lowercase, trim, and remove empty tokens
    """
    cleaned_tokens = []
    
    for token in tokens:
        # Convert to lowercase
        token_lower = token.lower()
        
        # Use regex to keep only ASCII letters and numbers
        # This will preserve words like "covid19" instead of losing them
        cleaned_token = re.sub(r'[^a-zA-Z0-9]', '', token_lower)
        
        # Trim whitespace (though regex above should handle this)
        cleaned_token = cleaned_token.strip()
        
        # Only keep non-empty tokens
        if len(cleaned_token) > 0:
            cleaned_tokens.append(cleaned_token)
    
    return cleaned_tokens

if original_text:
    # Clean the tokens
    cleaned_tokens = clean_and_normalize_tokens(tokens)
    
    print("🧹 TOKEN CLEANING RESULTS:")
    print("=" * 50)
    print(f"📊 Original tokens: {len(tokens)}")
    print(f"📊 Cleaned tokens: {len(cleaned_tokens)}")
    print(f"📊 Tokens removed: {len(tokens) - len(cleaned_tokens)}")
    
    print(f"\n🔍 Comparison of original vs cleaned tokens (first 15):")
    print(f"{'Original':<15} {'Cleaned':<15}")
    print("-" * 30)
    for i in range(min(15, len(tokens))):
        original = tokens[i] if i < len(tokens) else ""
        cleaned = cleaned_tokens[i] if i < len(cleaned_tokens) else ""
        print(f"{original:<15} {cleaned:<15}")
    
    print(f"\n📝 All cleaned tokens:")
    for i, token in enumerate(cleaned_tokens):
        print(f"'{token}'", end="  ")
        if (i + 1) % 12 == 0:  # New line every 12 tokens
            print()
    print()  # Final newline

🧹 TOKEN CLEANING RESULTS:
📊 Original tokens: 145
📊 Cleaned tokens: 126
📊 Tokens removed: 19

🔍 Comparison of original vs cleaned tokens (first 15):
Original        Cleaned        
------------------------------
Imagine         imagine        
each            each           
paragraph       paragraph      
as              as             
a               a              
sandwich        sandwich       
.               the            
The             real           
real            content        
content         of             
of              the            
the             sandwichthe    
sandwich—the    meat           
meat            or             
or              other          

📝 All cleaned tokens:
'imagine'  'each'  'paragraph'  'as'  'a'  'sandwich'  'the'  'real'  'content'  'of'  'the'  'sandwichthe'  
'meat'  'or'  'other'  'fillingis'  'in'  'the'  'middle'  'it'  'includes'  'all'  'the'  'evidence'  
'you'  'need'  'to'  'make'  'the'  'point'  'but'  'it'  'gets'  'kind'

In [14]:
# Perform stemming
def perform_stemming(tokens):
    """Apply both Porter and Snowball stemmers"""
    porter_stemmer = PorterStemmer()
    snowball_stemmer = SnowballStemmer('english')
    
    porter_stems = [porter_stemmer.stem(token) for token in tokens]
    snowball_stems = [snowball_stemmer.stem(token) for token in tokens]
    
    return porter_stems, snowball_stems

if original_text:
    # Perform stemming
    porter_stemmed, snowball_stemmed = perform_stemming(cleaned_tokens)
    
    print("✂️ STEMMING RESULTS:")
    print("=" * 60)
    print(f"📊 Words processed: {len(cleaned_tokens)}")
    
    # Compare stemming results
    print(f"\n🔍 Comparison of stemming methods (first 20 words):")
    print(f"{'Original':<15} {'Porter':<15} {'Snowball':<15}")
    print("-" * 45)
    for i in range(min(20, len(cleaned_tokens))):
        original = cleaned_tokens[i]
        porter = porter_stemmed[i]
        snowball = snowball_stemmed[i]
        print(f"{original:<15} {porter:<15} {snowball:<15}")
    
    print(f"\n📝 Porter Stemmed words:")
    for i, word in enumerate(porter_stemmed):
        print(f"'{word}'", end="  ")
        if (i + 1) % 12 == 0:
            print()
    print()
    
    print(f"\n📝 Snowball Stemmed words:")
    for i, word in enumerate(snowball_stemmed):
        print(f"'{word}'", end="  ")
        if (i + 1) % 12 == 0:
            print()
    print()

✂️ STEMMING RESULTS:
📊 Words processed: 126

🔍 Comparison of stemming methods (first 20 words):
Original        Porter          Snowball       
---------------------------------------------
imagine         imagin          imagin         
each            each            each           
paragraph       paragraph       paragraph      
as              as              as             
a               a               a              
sandwich        sandwich        sandwich       
the             the             the            
real            real            real           
content         content         content        
of              of              of             
the             the             the            
sandwichthe     sandwichth      sandwichth     
meat            meat            meat           
or              or              or             
other           other           other          
fillingis       fillingi        fillingi       
in              in              in        

In [15]:
# Perform lemmatization
def perform_lemmatization(tokens):
    """Apply WordNet lemmatizer"""
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_words

if original_text:
    # Perform lemmatization
    lemmatized_words = perform_lemmatization(cleaned_tokens)
    
    print("🎯 LEMMATIZATION RESULTS:")
    print("=" * 60)
    print(f"📊 Words processed: {len(cleaned_tokens)}")
    
    # Compare original vs lemmatized
    print(f"\n🔍 Comparison of original vs lemmatized words (first 20):")
    print(f"{'Original':<15} {'Lemmatized':<15} {'Same?':<8}")
    print("-" * 38)
    same_count = 0
    for i in range(min(20, len(cleaned_tokens))):
        original = cleaned_tokens[i]
        lemmatized = lemmatized_words[i]
        is_same = "✓" if original == lemmatized else "✗"
        if original == lemmatized:
            same_count += 1
        print(f"{original:<15} {lemmatized:<15} {is_same:<8}")
    
    total_same = sum(1 for i in range(len(cleaned_tokens)) if cleaned_tokens[i] == lemmatized_words[i])
    print(f"\n📊 Words unchanged by lemmatization: {total_same}/{len(cleaned_tokens)} ({total_same/len(cleaned_tokens)*100:.1f}%)")
    
    print(f"\n📝 Lemmatized words:")
    for i, word in enumerate(lemmatized_words):
        print(f"'{word}'", end="  ")
        if (i + 1) % 12 == 0:
            print()
    print()

🎯 LEMMATIZATION RESULTS:
📊 Words processed: 126

🔍 Comparison of original vs lemmatized words (first 20):
Original        Lemmatized      Same?   
--------------------------------------
imagine         imagine         ✓       
each            each            ✓       
paragraph       paragraph       ✓       
as              a               ✗       
a               a               ✓       
sandwich        sandwich        ✓       
the             the             ✓       
real            real            ✓       
content         content         ✓       
of              of              ✓       
the             the             ✓       
sandwichthe     sandwichthe     ✓       
meat            meat            ✓       
or              or              ✓       
other           other           ✓       
fillingis       fillingis       ✓       
in              in              ✓       
the             the             ✓       
middle          middle          ✓       
it              it              ✓  

In [16]:
# Comprehensive comparison and analysis
if original_text:
    print("📊 COMPREHENSIVE ANALYSIS")
    print("=" * 80)
    
    # Create a comparison DataFrame
    comparison_data = {
        'Original': cleaned_tokens,
        'Porter_Stem': porter_stemmed,
        'Snowball_Stem': snowball_stemmed,
        'Lemmatized': lemmatized_words
    }
    
    # Convert to DataFrame for better display
    df_comparison = pd.DataFrame(comparison_data)
    print("🔍 Complete Comparison Table:")
    print(df_comparison.to_string(index=False))
    
    # Word frequency analysis
    print(f"\n📈 WORD FREQUENCY ANALYSIS:")
    print("-" * 40)
    
    # Original word frequencies
    original_freq = Counter(cleaned_tokens)
    porter_freq = Counter(porter_stemmed)
    snowball_freq = Counter(snowball_stemmed)
    lemma_freq = Counter(lemmatized_words)
    
    print(f"📊 Most frequent words (top 10):")
    print(f"\n🔤 Original words:")
    for word, count in original_freq.most_common(10):
        print(f"   '{word}': {count}")
    
    print(f"\n✂️ Porter stemmed:")
    for word, count in porter_freq.most_common(10):
        print(f"   '{word}': {count}")
    
    print(f"\n✂️ Snowball stemmed:")
    for word, count in snowball_freq.most_common(10):
        print(f"   '{word}': {count}")
    
    print(f"\n🎯 Lemmatized:")
    for word, count in lemma_freq.most_common(10):
        print(f"   '{word}': {count}")
    
    # Vocabulary reduction analysis
    print(f"\n📉 VOCABULARY REDUCTION:")
    print("-" * 30)
    print(f"Original vocabulary size: {len(set(cleaned_tokens))}")
    print(f"Porter stemmed vocabulary: {len(set(porter_stemmed))} ({len(set(porter_stemmed))/len(set(cleaned_tokens))*100:.1f}% of original)")
    print(f"Snowball stemmed vocabulary: {len(set(snowball_stemmed))} ({len(set(snowball_stemmed))/len(set(cleaned_tokens))*100:.1f}% of original)")
    print(f"Lemmatized vocabulary: {len(set(lemmatized_words))} ({len(set(lemmatized_words))/len(set(cleaned_tokens))*100:.1f}% of original)")
    
    # Summary statistics
    print(f"\n📋 SUMMARY STATISTICS:")
    print("-" * 20)
    print(f"✅ Processing completed successfully!")
    print(f"📝 Original text length: {len(original_text)} characters")
    print(f"🔤 Total tokens extracted: {len(tokens)}")
    print(f"🧹 Clean tokens after processing: {len(cleaned_tokens)}")
    print(f"📊 Unique words in original: {len(set(cleaned_tokens))}")
    print(f"⚡ Tokens removed during cleaning: {len(tokens) - len(cleaned_tokens)}")
else:
    print("❌ No text to analyze. Please check the input file.")

📊 COMPREHENSIVE ANALYSIS
🔍 Complete Comparison Table:
   Original Porter_Stem Snowball_Stem  Lemmatized
    imagine      imagin        imagin     imagine
       each        each          each        each
  paragraph   paragraph     paragraph   paragraph
         as          as            as           a
          a           a             a           a
   sandwich    sandwich      sandwich    sandwich
        the         the           the         the
       real        real          real        real
    content     content       content     content
         of          of            of          of
        the         the           the         the
sandwichthe  sandwichth    sandwichth sandwichthe
       meat        meat          meat        meat
         or          or            or          or
      other       other         other       other
  fillingis    fillingi      fillingi   fillingis
         in          in            in          in
        the         the           the         