# Part 1: Sampling

In [None]:
# imports
import pandas as pd
import random
from pathlib import Path

In [None]:
# Seed Python's global `random` generator so everyone gets the same results.
# This is important for reproducible research!
# The sampling helpers further down draw from this same global generator, so the
# exact draws also depend on how many random calls have run since this cell.
random.seed(42)

In [None]:
# STEP 1: Building our text collection
# In real research, you'd download these from Project Gutenberg or similar sources.
# Tip: inspect your sources before starting, e.g. https://www.gutenberg.org/cache/epub/2701/pg2701.txt

# Each text is a dictionary with the metadata we care about.
# Every dictionary uses the same keys, so the list converts cleanly into a table
# that we can filter and group later.
texts = [
    {
        "id": "pg2701",                    # Unique identifier from source
        "title": "Moby Dick",              # Human-readable title
        "author": "Herman Melville",       # Author for filtering/grouping
        "year": 1851,                      # Publication year
        "chapters": [                      # One string per chapter (only a short opening excerpt here)
            "Call me Ishmael. Some years ago—never mind how long precisely...",
            "The Carpet-Bag. I stuffed a shirt or two into my old carpet-bag...",
            "The Spouter-Inn. Entering that gable-ended Spouter-Inn...",
            "The Counterpane. Upon waking next morning about daylight..."
        ],
        "word_count": 200000               # Hand-entered metadata, not computed from the excerpts above
    },
    {
        "id": "pg1342",
        "title": "Pride and Prejudice",
        "author": "Jane Austen",
        "year": 1813,
        "chapters": [
            "It is a truth universally acknowledged, that a single man...",
            "When Jane and Elizabeth were alone, the former, who had been...",
            "Not all that Mrs. Bennet, however, with the assistance of her..."
        ],
        "word_count": 120000
    },
    {
        "id": "pg11",
        "title": "Alice in Wonderland",
        "author": "Lewis Carroll",
        "year": 1865,
        "chapters": [
            "Down the Rabbit-Hole. Alice was beginning to get very tired...",
            "The Pool of Tears. 'Curiouser and curiouser!' cried Alice...",
            "A Caucus-Race and a Long Tale. They were indeed a queer-looking party..."
        ],
        "word_count": 27000
    }
]

# Convert to a pandas DataFrame (one row per book) - this makes data easier to work with.
# Think of it like an Excel spreadsheet you can manipulate with code.
df = pd.DataFrame(texts)
df

Unnamed: 0,id,title,author,year,chapters,word_count
0,pg2701,Moby Dick,Herman Melville,1851,[Call me Ishmael. Some years ago—never mind ho...,200000
1,pg1342,Pride and Prejudice,Jane Austen,1813,"[It is a truth universally acknowledged, that ...",120000
2,pg11,Alice in Wonderland,Lewis Carroll,1865,[Down the Rabbit-Hole. Alice was beginning to ...,27000


In [None]:
# STEP 2: Creating detailed inventory (manifest)
# "Flatten" the collection so the unit of analysis is the chapter, not the book:
# one record per chapter gives us more sampling options later.

# Build every chapter-level record in one pass over (book, chapter) pairs.
# Book-level fields (id, title, author) are repeated on each of its chapters.
chapters = [
    {
        'text_id': book['id'],                            # Which book this chapter comes from
        'chapter_id': f"{book['id']}_ch{number:02d}",     # Unique ID, e.g. "pg2701_ch01"
        'title': book['title'],
        'author': book['author'],
        'chapter_num': number,                            # 1-based position within the book
        'chapter_text': body,
        'text_length': len(body),                         # Length in characters
    }
    for book in texts
    for number, body in enumerate(book['chapters'], start=1)
]

# One row per chapter, ready for filtering and sampling
chapter_df = pd.DataFrame(chapters)
chapter_df

Unnamed: 0,text_id,chapter_id,title,author,chapter_num,chapter_text,text_length
0,pg2701,pg2701_ch01,Moby Dick,Herman Melville,1,Call me Ishmael. Some years ago—never mind how...,64
1,pg2701,pg2701_ch02,Moby Dick,Herman Melville,2,The Carpet-Bag. I stuffed a shirt or two into ...,66
2,pg2701,pg2701_ch03,Moby Dick,Herman Melville,3,The Spouter-Inn. Entering that gable-ended Spo...,57
3,pg2701,pg2701_ch04,Moby Dick,Herman Melville,4,The Counterpane. Upon waking next morning abou...,59
4,pg1342,pg1342_ch01,Pride and Prejudice,Jane Austen,1,"It is a truth universally acknowledged, that a...",60
5,pg1342,pg1342_ch02,Pride and Prejudice,Jane Austen,2,"When Jane and Elizabeth were alone, the former...",63
6,pg1342,pg1342_ch03,Pride and Prejudice,Jane Austen,3,"Not all that Mrs. Bennet, however, with the as...",64
7,pg11,pg11_ch01,Alice in Wonderland,Lewis Carroll,1,Down the Rabbit-Hole. Alice was beginning to g...,62
8,pg11,pg11_ch02,Alice in Wonderland,Lewis Carroll,2,The Pool of Tears. 'Curiouser and curiouser!' ...,60
9,pg11,pg11_ch03,Alice in Wonderland,Lewis Carroll,3,A Caucus-Race and a Long Tale. They were indee...,72


In [None]:
# STEP 3: Learning different sampling strategies
# Each strategy answers different research questions

def random_sample(items, n):
    """Draw up to ``n`` distinct items from ``items`` with the global ``random`` generator.

    The sample size is capped at ``len(items)``, so asking for more items than
    exist returns all of them (in random order) instead of raising.
    """
    population_size = len(items)
    sample_size = population_size if population_size < n else n
    # Sampling is without replacement: no item is picked twice.
    return random.sample(items, sample_size)

In [None]:
def stratified_sample(df, n_per_group, group_col='text_id'):
    """Randomly pick up to ``n_per_group`` rows from every group of ``df``.

    Args:
        df: DataFrame to sample from.
        n_per_group: Maximum number of rows to take from each group; groups
            with fewer rows contribute all of their rows.
        group_col: Column whose distinct values define the groups.

    Returns:
        A DataFrame holding the selected rows, grouped in order of each group's
        first appearance in ``df``. Draws come from the global ``random``
        generator, so ``random.seed`` controls reproducibility.
    """
    group_values = df[group_col]
    sampled_positions = []  # Row positions (not index labels) of the chosen rows

    for group in group_values.unique():
        # Positions of the rows belonging to this group. Sampling positions
        # instead of index labels keeps the result correct when the index has
        # repeated labels, where label-based selection would return every row
        # sharing a sampled label.
        group_positions = [
            position
            for position, is_member in enumerate(group_values == group)
            if is_member
        ]

        # Don't try to sample more than we have
        n_sample = min(n_per_group, len(group_positions))
        sampled_positions.extend(random.sample(group_positions, n_sample))

    return df.iloc[sampled_positions]

In [None]:
# STEP 4: Trying out each sampling method
# Let's see what each strategy actually gives us

# random_sample() works on a plain Python list, so convert the chapter table
# into a list with one dictionary per row.
all_chapters = chapter_df.to_dict('records')
print(f"We have {len(all_chapters)} total chapters to sample from")

We have 10 total chapters to sample from


In [None]:
# Random sampling demo: every chapter has the same chance of being picked,
# regardless of which book it belongs to.
banner = "=" * 60
print("\n" + banner)
print("RANDOM SAMPLING: Pick 3 chapters at random")
print(banner)

random_chapters = random_sample(all_chapters, 3)

print("Results:")
for position, chapter in enumerate(random_chapters, start=1):
    preview = chapter['chapter_text'][:50]
    print(f"   {position}. {chapter['title']} - Chapter {chapter['chapter_num']}")
    print(f"      Preview: {preview}...")



RANDOM SAMPLING: Pick 3 chapters at random
Results:
   1. Moby Dick - Chapter 2
      Preview: The Carpet-Bag. I stuffed a shirt or two into my o...
   2. Alice in Wonderland - Chapter 2
      Preview: The Pool of Tears. 'Curiouser and curiouser!' crie...
   3. Alice in Wonderland - Chapter 3
      Preview: A Caucus-Race and a Long Tale. They were indeed a ...


In [None]:
# Stratified sampling demo: one chapter is drawn from each book (grouped by
# the default `text_id` column), so every book is represented.
banner = "=" * 60
print("\n" + banner)
print("STRATIFIED SAMPLING: 1 chapter from each book")
print(banner)

stratified_result = stratified_sample(chapter_df, 1)

print("Results:")
for position, (_, row) in enumerate(stratified_result.iterrows(), start=1):
    preview = row['chapter_text'][:50]
    print(f"   {position}. {row['title']} - Chapter {row['chapter_num']}")
    print(f"      Preview: {preview}...")



STRATIFIED SAMPLING: 1 chapter from each book
Results:
   1. Moby Dick - Chapter 4
      Preview: The Counterpane. Upon waking next morning about da...
   2. Pride and Prejudice - Chapter 1
      Preview: It is a truth universally acknowledged, that a sin...
   3. Alice in Wonderland - Chapter 1
      Preview: Down the Rabbit-Hole. Alice was beginning to get v...


In [None]:
# STEP 5: Saving your work (reproducibility!)
# Create a folder for our output files
output_dir = Path("samples")
output_dir.mkdir(exist_ok=True)  # Create folder if it doesn't exist


def systematic_sample(items, step, start=0):
    """Return every ``step``-th item of ``items``, beginning at position ``start``."""
    return items[start::step]


# Systematic sample: keep every 2nd chapter of the ordered chapter list.
# `systematic_chapters` is used below and again in STEP 6, so it is built here
# to let the notebook run top to bottom on a fresh kernel.
systematic_chapters = systematic_sample(all_chapters, step=2)

# Save each sample to a CSV file
# CSV = Comma Separated Values, readable by Excel, R, etc.

# Convert our random sample back to a DataFrame and save
random_df = pd.DataFrame(random_chapters)
random_file = output_dir / "random_sample.csv"
random_df.to_csv(random_file, index=False)  # index=False means don't save row numbers

# Save systematic sample
systematic_df = pd.DataFrame(systematic_chapters)
systematic_file = output_dir / "systematic_sample.csv"
systematic_df.to_csv(systematic_file, index=False)

# Save stratified sample (already a DataFrame)
stratified_file = output_dir / "stratified_sample.csv"
stratified_result.to_csv(stratified_file, index=False)

In [None]:
# STEP 6: Analyzing our sampling results
# Let's compare what each method gave us: first the sample sizes, then how the
# books/authors are represented in each sample.

print(f"\n Sample sizes:")
print(f" Random sample: {len(random_chapters)} chapters")
print(f" Systematic sample: {len(systematic_chapters)} chapters")
print(f" Stratified sample: {len(stratified_result)} chapters")

print(f"\n Author representation in random sample:")
# Count how many chapters each author got in our random sample.
# value_counts() lists only authors that appear at least once, so a missing
# author means that author was not sampled at all.
random_df = pd.DataFrame(random_chapters)
author_counts = random_df['author'].value_counts()
print(author_counts)
print("Notice: Random sampling might not give equal representation!")

print(f"\n Book representation in systematic sample:")
# Count chapters per book title in the systematic sample
systematic_df = pd.DataFrame(systematic_chapters)
book_counts = systematic_df['title'].value_counts()
print(book_counts)
print("Notice: Systematic sampling depends on how your data is ordered!")

print(f"\n Book representation in stratified sample:")
# stratified_result is already a DataFrame, so it can be counted directly
stratified_counts = stratified_result['title'].value_counts()
print(stratified_counts)
print("Notice: Stratified sampling guarantees equal representation!")



 Sample sizes:
 Random sample: 3 chapters
 Systematic sample: 5 chapters
 Stratified sample: 3 chapters

 Author representation in random sample:
author
Herman Melville    2
Jane Austen        1
Name: count, dtype: int64
Notice: Random sampling might not give equal representation!

 Book representation in systematic sample:
title
Moby Dick              2
Pride and Prejudice    2
Alice in Wonderland    1
Name: count, dtype: int64
Notice: Systematic sampling depends on how your data is ordered!

 Book representation in stratified sample:
title
Moby Dick              1
Pride and Prejudice    1
Alice in Wonderland    1
Name: count, dtype: int64
Notice: Stratified sampling guarantees equal representation!


## Chunk chapters to be comparable units

In [None]:
import re

In [None]:
def simple_word_tokenize(text):
    """Split ``text`` into word tokens with a lightweight regex.

    A token is a run of letters/digits, optionally followed by one apostrophe
    (straight or curly) and another run, so contractions and possessives such
    as "don't" or "Ishmael's" stay in one piece. Everything else (whitespace,
    punctuation, dashes, hyphens, underscores) separates tokens and is dropped.

    Letters are matched Unicode-aware, so accented words are kept whole instead
    of being cut at the first non-ASCII character. For plain ASCII text the
    result is the same as matching only A-Z, a-z and 0-9.

    Returns:
        A list of token strings in order of appearance (empty for empty text).
    """
    # [^\W_] means "any Unicode letter or digit": \w without the underscore.
    # \u2019 is the curly apostrophe common in typeset e-texts.
    return re.findall(r"[^\W_]+(?:['\u2019][^\W_]+)?", text)


In [None]:
def chunk_tokens(tokens, chunk_size=200, drop_last=True):
    """Break a list of tokens into consecutive chunks of ``chunk_size`` tokens.

    Args:
        tokens: Sequence of tokens to split (anything that supports slicing).
        chunk_size: Number of tokens per chunk; must be at least 1.
        drop_last: If True, discard a final chunk shorter than ``chunk_size``
            so every returned chunk has exactly ``chunk_size`` tokens. If
            False, keep the short remainder as the last chunk.

    Returns:
        A list of chunks (slices of ``tokens``). Empty when there are fewer
        than ``chunk_size`` tokens and ``drop_last`` is True.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    end = len(tokens)
    if drop_last:
        # Stop at the last full chunk boundary
        end -= end % chunk_size
    return [tokens[start:start + chunk_size] for start in range(0, end, chunk_size)]


In [None]:
def chunk_chapters_equal_token_length(chapters_data, chunk_size=200, drop_last=True):
    """Tokenize each chapter and cut it into fixed-size token chunks.

    Args:
        chapters_data: Iterable of chapter records (dicts) providing the keys
            'chapter_num', 'chapter_text', 'text_id', 'title', 'author' and
            'text_length'.
        chunk_size: Number of tokens per chunk.
        drop_last: Passed to chunk_tokens(); when True the short trailing
            chunk of each chapter is discarded.

    Returns:
        A flat list with one dict per chunk, chapter by chapter. Note that the
        "chapter_id" field holds the chapter *number* from the record, and
        "chunk_id" counts from 1 within each chapter.
    """
    all_chunks = []
    for record in chapters_data:
        chapter_number = record['chapter_num']
        # Swap in nltk's word_tokenize(...) here if you prefer a fuller tokenizer.
        chapter_tokens = simple_word_tokenize(record['chapter_text'])
        pieces = chunk_tokens(chapter_tokens, chunk_size=chunk_size, drop_last=drop_last)
        all_chunks.extend(
            {
                "chapter_id": chapter_number,
                "chunk_id": chunk_number,
                "token_count": len(piece),  # equals chunk_size when drop_last=True
                "text": " ".join(piece),
                "text_id": record['text_id'],
                "title": record['title'],
                "author": record['author'],
                "original_chapter_length": record['text_length'],
            }
            for chunk_number, piece in enumerate(pieces, start=1)
        )
    return all_chunks

In [None]:
# ---- Example usage with your Moby Dick chapters ----
CHUNK_SIZE = 10  # common token length for all chunks (small, because the demo chapters are only one sentence long)
# Keep only the chapter records whose book title is Moby Dick
moby_dick_chapters = [chap for chap in all_chapters if chap['title'] == 'Moby Dick']
# drop_last=True discards each chapter's short trailing chunk, so every chunk has CHUNK_SIZE tokens
moby_chunks = chunk_chapters_equal_token_length(moby_dick_chapters, chunk_size=CHUNK_SIZE, drop_last=True)

In [None]:
# Optional sanity checks / quick prints
print(f"Total chunks created: {len(moby_chunks)}")
if moby_chunks:
    first_chunk = moby_chunks[0]
    print("First chunk metadata:", {k: v for k, v in first_chunk.items() if k != 'text'})
    # Report the chunk's real token count instead of a hard-coded size
    print(f"First {first_chunk['token_count']}-token chunk preview:", first_chunk["text"][:300], "...")
else:
    # With drop_last=True, chapters shorter than the chunk size produce no chunks
    print("No chunks were created - try a smaller chunk size.")
# You can convert to a DataFrame if you want tabular analysis:
# import pandas as pd
# moby_df = pd.DataFrame(moby_chunks)
# moby_df.head()

Total chunks created: 2
First chunk metadata: {'chapter_id': 1, 'chunk_id': 1, 'token_count': 10, 'text_id': 'pg2701', 'title': 'Moby Dick', 'author': 'Herman Melville', 'original_chapter_length': 64}
First 200-token chunk preview: Call me Ishmael Some years ago never mind how long ...


# Part 2: Handling PDFs

In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.4


In [None]:
#imports

import pymupdf
import re
from pathlib import Path
import pandas as pd
from collections import Counter

In [None]:
# STEP 1: Load and extract text from actual PDFs

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF, with simple statistics.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pymupdf.open``.

    Returns:
        A list with one dict per page, in page order, with the keys 'page'
        (1-based page number), 'text' (raw extracted text), 'char_count'
        (characters including whitespace) and 'word_count' (whitespace-separated
        tokens). Returns ``None`` when the file cannot be opened or read; the
        problem is printed instead of raised.
    """
    print(f"Opening PDF: {pdf_path}")

    try:
        # Open the PDF document using PyMuPDF
        doc = pymupdf.open(pdf_path)
        try:
            pages_data = []

            # Process each page in the document
            for page_num in range(len(doc)):
                # Extract plain text from the current page
                text = doc.load_page(page_num).get_text()

                page_data = {
                    'page': page_num + 1,           # Convert to 1-indexed page number
                    'text': text,                   # Raw extracted text
                    'char_count': len(text),        # Total character count including whitespace
                    'word_count': len(text.split()) # Simple word count by splitting on whitespace
                }
                pages_data.append(page_data)

                # Log progress for current page
                print(f"   Page {page_data['page']}: {page_data['char_count']} characters, {page_data['word_count']} words")
        finally:
            # Close the document even when a page fails part-way through,
            # so the file handle and memory are always released.
            doc.close()

        # Log successful completion
        print(f"✓ Successfully extracted {len(pages_data)} pages")
        return pages_data

    except FileNotFoundError:
        print(f"✗ Error: PDF file not found at path: {pdf_path}")
        return None
    except PermissionError:
        print(f"✗ Error: Permission denied when trying to read: {pdf_path}")
        return None
    except Exception as e:
        print(f"✗ Error reading PDF: {e}")
        return None

In [None]:
def show_text_preview(text, title, max_chars=300):
    """Print a short, single-line preview of ``text`` framed by divider lines.

    Args:
        text: The full text to preview.
        title: Label shown in the heading line.
        max_chars: How many leading characters of ``text`` to show.

    The preview is always followed by "...", and the total length of the full
    text is reported underneath. Nothing is returned.
    """
    divider = "-" * 50
    # Cut first, then flatten line breaks so the preview stays on one line
    flattened = text[:max_chars].replace('\n', ' ')

    print(f"\n {title} Preview:")
    print(divider)
    print(f"{flattened}...")
    print(f"Total length: {len(text)} characters")
    print(divider)

In [None]:
# Load the PDFs (adjust paths as needed)
# UVM Electrical Engineering document
uvm_pdf_path = "/content/electricalengineering.pdf"  # Your academic catalog

# Start from None so the later `if uvm_text:` checks work even when loading fails
uvm_text = None
try:
    # extract_text_from_pdf prints its own error and returns None on failure
    uvm_pages = extract_text_from_pdf(uvm_pdf_path)
    if uvm_pages:
        uvm_text = "\n".join(page['text'] for page in uvm_pages)
        show_text_preview(uvm_text, "UVM Electrical Engineering Catalog")
    else:
        print(f" Could not load {uvm_pdf_path} - make sure the file exists at that path")
except Exception as exc:
    # Anything unexpected while joining/previewing: report it and carry on without text
    print(f" Could not load {uvm_pdf_path}: {exc}")
    uvm_text = None


Opening PDF: /content/electricalengineering.pdf
   Page 1: 5145 characters, 667 words
✓ Successfully extracted 1 pages

 UVM Electrical Engineering Catalog Preview:
--------------------------------------------------
THE UNIVERSITY OF VERMONT ELECTRICAL ENGINEERING ELECTRICAL ENGINEERING http://www.uvm.edu/~cems/soe/ OVERVIEW The Electrical Engineering (EE) program at the University of Vermont is at the forefront of research in the areas of digital signal processing, control systems, power and energy systems, wi...
Total length: 5145 characters
--------------------------------------------------


In [None]:
# STEP 2: Analyze what PyMuPDF extracted

# Let's see what issues PyMuPDF extraction reveals...

def analyze_extraction_quality(text, doc_name):
    """Count and print simple signs of extraction problems in ``text``.

    The checks are line- and regex-based heuristics: empty lines, very short
    lines (1-2 characters), all-uppercase lines, digit-heavy lines (more than
    30% digits), words mixing letters and digits, and lines over 200 characters.

    Args:
        text: Extracted document text.
        doc_name: Label used in the printed report.

    Returns:
        ``None`` when ``text`` is empty. Otherwise a pair ``(issues, content)``:
        ``issues`` maps each check name to a count ('mixed_chars' counts unique
        words), and ``content`` maps check names to the offending lines/words.
    """
    if not text:
        print(f" No text to analyze for {doc_name}")
        return None

    lines = text.split('\n')

    # Issue 1: Empty or very short lines (formatting artifacts)
    empty_lines = [line for line in lines if len(line.strip()) == 0]
    short_lines = [line for line in lines if 0 < len(line.strip()) < 3]

    # Issue 2: Lines that are all uppercase (likely headers)
    uppercase_lines = [line for line in lines if line.isupper() and len(line.strip()) > 3]

    # Issue 3: Lines with mostly numbers (dates, page numbers, etc.)
    number_heavy_lines = [line for line in lines if
                         len(re.findall(r'\d', line)) / max(len(line), 1) > 0.3
                         and len(line.strip()) > 2]

    # Issue 4: Potential OCR errors (mixed numbers and letters)
    mixed_chars = re.findall(r'\w*\d[A-Za-z]\w*|\w*[A-Za-z]\d\w*', text)
    # Sorted so the examples printed below are the same on every run
    # (plain set ordering of strings can change between Python sessions).
    unique_mixed = sorted(set(mixed_chars))

    # Issue 5: Very long lines (might be unwrapped text)
    long_lines = [line for line in lines if len(line) > 200]

    issues = {
        'empty_lines': len(empty_lines),
        'short_lines': len(short_lines),
        'uppercase_lines': len(uppercase_lines),
        'number_heavy_lines': len(number_heavy_lines),
        'mixed_chars': len(unique_mixed),  # Unique instances
        'very_long_lines': len(long_lines),
    }

    print(f"\n {doc_name} Extraction Analysis:")
    for issue, count in issues.items():
        print(f"   • {issue.replace('_', ' ').title()}: {count}")

    # Show actual examples of problematic content
    print(f"\n Examples of issues found in {doc_name}:")

    if short_lines:
        print(f"  Short lines (first 5):")
        for i, line in enumerate(short_lines[:5]):
            print(f"      {i+1}. '{line.strip()}'")

    if uppercase_lines:
        print(f"  Uppercase lines (first 3):")
        for i, line in enumerate(uppercase_lines[:3]):
            print(f"      {i+1}. '{line.strip()}'")

    if number_heavy_lines:
        print(f" Number-heavy lines (first 3):")
        for i, line in enumerate(number_heavy_lines[:3]):
            print(f"      {i+1}. '{line.strip()}'")

    if unique_mixed:
        print(f" Mixed character words (first 5):")
        for i, word in enumerate(unique_mixed[:5]):
            print(f"      {i+1}. '{word}'")

    if long_lines:
        print(f"  Very long lines (first 2, truncated):")
        for i, line in enumerate(long_lines[:2]):
            stripped = line.strip()
            truncated = stripped[:100] + "..." if len(stripped) > 100 else stripped
            print(f"      {i+1}. '{truncated}'")

    # Store the actual problematic content for further inspection
    # (note: 'mixed_chars' keeps every occurrence, and long lines are stored
    # under 'long_lines' while their count is under 'very_long_lines')
    issues_with_content = {
        'empty_lines': empty_lines,
        'short_lines': short_lines,
        'uppercase_lines': uppercase_lines,
        'number_heavy_lines': number_heavy_lines,
        'mixed_chars': mixed_chars,
        'long_lines': long_lines
    }

    return issues, issues_with_content


In [None]:
# Analyze the extracted document, but only if it loaded.
# analyze_extraction_quality returns a pair: the issue counts and the offending lines/words.
if uvm_text:
    uvm_issues, uvm_content = analyze_extraction_quality(uvm_text, "UVM Document")



 UVM Document Extraction Analysis:
   • Empty Lines: 1
   • Short Lines: 1
   • Uppercase Lines: 9
   • Number Heavy Lines: 3
   • Mixed Chars: 0
   • Very Long Lines: 0

 Examples of issues found in UVM Document:
  Short lines (first 5):
      1. '1'
  Uppercase lines (first 3):
      1. 'THE UNIVERSITY OF VERMONT'
      2. 'ELECTRICAL ENGINEERING'
      3. 'ELECTRICAL ENGINEERING'
 Number-heavy lines (first 3):
      1. 'CS 5220.'
      2. 'EE 5410.'
      3. 'EE 5610, CS 5610.'


In [None]:
# Deep dive into specific issues
# `uvm_content` is only assigned when the analysis ran on a loaded document,
# so check that the name exists in the notebook namespace before using it.
if uvm_text and 'uvm_content' in locals():
    print("\n DETAILED INSPECTION:")

    # Look at all upper case lines (not just the first 3 shown in the summary)
    if uvm_content['uppercase_lines']:
        print(f"\n upper case lines:")
        for line in uvm_content['uppercase_lines']:
            print(f"   '{line.strip()}'")


 DETAILED INSPECTION:

 upper case lines:
   'THE UNIVERSITY OF VERMONT'
   'ELECTRICAL ENGINEERING'
   'ELECTRICAL ENGINEERING'
   'OVERVIEW'
   'DEGREES'
   'FACULTY'
   'CS 5220.'
   'EE 5410.'
   'EE 5610, CS 5610.'


In [None]:
# STEP 3: Advanced PyMuPDF extraction with layout info
# PyMuPDF can give us much more than just text...we can also get information about layout
# Layout can give us a lot of clues about text which can help pull out desired data

def extract_with_layout(pdf_path, page_num=0):
    """Extract the text spans of one PDF page together with their formatting.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pymupdf.open``.
        page_num: 0-based index of the page to read.

    Returns:
        A list with one dict per text span, in reading order of the extracted
        structure, with the keys 'text', 'bbox', 'font', 'size' and 'flags'.
        Returns ``None`` when anything goes wrong (the error is printed).
    """
    try:
        doc = pymupdf.open(pdf_path)
        try:
            page = doc.load_page(page_num)

            # Extract text with detailed formatting as dictionary structure
            blocks = page.get_text("dict")

            formatted_elements = []

            # Navigate the hierarchical structure: blocks -> lines -> spans
            for block in blocks["blocks"]:
                if "lines" in block:  # Skip image blocks, process only text blocks
                    for line in block["lines"]:
                        for span in line["spans"]:
                            # Create element with text content and formatting metadata
                            element = {
                                'text': span['text'],
                                'bbox': span['bbox'],    # Bounding box coordinates
                                'font': span['font'],    # Font family name
                                'size': span['size'],    # Font size in points
                                'flags': span['flags']   # Style flags (bold=16, italic=2, etc.)
                            }
                            formatted_elements.append(element)
        finally:
            # Always release the document, e.g. when page_num is out of range
            doc.close()

        return formatted_elements

    except Exception as e:
        print(f" Error extracting layout: {e}")
        return None

In [None]:
def analyze_document_structure(elements):
    """Print a font-based overview of layout elements.

    Args:
        elements: List of span dicts with at least 'text', 'size' and 'font'
            keys. Nothing is printed when the list is empty or None.

    Three summaries are printed: how many elements use each font size (largest
    first), up to five "likely headers" (elements more than 20% larger than
    the mean font size), and up to five font names in alphabetical order.
    Nothing is returned.
    """
    if not elements:
        return

    sizes = [element['size'] for element in elements]

    print(" Font size distribution:")
    for size, occurrences in sorted(Counter(sizes).items(), reverse=True):
        print(f"   Size {size:.1f}: {occurrences} elements")

    # Anything clearly above the average size is treated as a header candidate
    mean_size = sum(sizes) / len(sizes)
    likely_headers = [element for element in elements if element['size'] > mean_size * 1.2]

    print(f"\n Likely headers ({len(likely_headers)} found):")
    for header in likely_headers[:5]:
        snippet = header['text'][:50].replace('\n', ' ')
        print(f"   '{snippet}' (size: {header['size']:.1f})")

    font_names = {element['font'] for element in elements}
    print(f"\n Fonts detected: {len(font_names)}")
    for name in sorted(font_names)[:5]:
        print(f"   {name}")

In [None]:
# Try advanced extraction on first document
# Only the first page (index 0) is inspected; extract_with_layout returns None
# on failure, in which case the structure analysis is skipped.
if uvm_text:
    print("Analyzing UVM document structure...")
    uvm_layout = extract_with_layout(uvm_pdf_path, 0)
    if uvm_layout:
        analyze_document_structure(uvm_layout)


Analyzing UVM document structure...
 Font size distribution:
   Size 14.0: 1 elements
   Size 12.0: 5 elements
   Size 10.0: 103 elements
   Size 8.0: 1 elements

 Likely headers (1 found):
   'ELECTRICAL ENGINEERING' (size: 14.0)

 Fonts detected: 4
   ArnoPro-Bold
   ArnoPro-Regular
   MyriadPro-Regular
   MyriadPro-SemiboldSemiEx


In [None]:
# STEP 4: Smart text cleaning based on document type

def detect_document_type(text):
    """Guess the kind of document from the keywords it contains.

    Each keyword list scores one point per keyword found anywhere in the
    lower-cased text (substring match, counted once per keyword).

    Returns:
        "unknown" for empty/None text, "academic" or "legal" when that list
        scores strictly higher, and "general" on a tie (including 0 vs 0).
    """
    if not text:
        return "unknown"

    lowered = text.lower()

    def keyword_hits(keywords):
        # Number of keywords that occur at least once in the text
        return len([keyword for keyword in keywords if keyword in lowered])

    academic_score = keyword_hits(
        ('university', 'course', 'credit', 'prerequisite', 'professor', 'department')
    )
    legal_score = keyword_hits(
        ('county', 'state of', 'notary', 'acknowledged', 'sworn', 'witness')
    )

    if academic_score == legal_score:
        return "general"
    return "academic" if academic_score > legal_score else "legal"



In [None]:
def clean_academic_document(text):
    """Apply catalog-style regex clean-ups to ``text`` and return the result.

    The substitutions run in the order listed below; the final result is
    stripped of leading and trailing whitespace. A progress line is printed.
    """
    print(" Applying academic document cleaning...")

    substitutions = (
        # Replace URLs with a placeholder
        (r'http[s]?://[^\s]+', '[URL]'),
        # Standardize course codes (CMPE 5220. -> CMPE 5220:)
        (r'([A-Z]{2,4}\s+\d{4})\.\s*', r'\1: '),
        # Clean up credit formatting
        (r'(\d+)\s+Credits?\.\s*', r'\1 Credits. '),
        # Normalize spacing in "Smith, John; Professor" style faculty entries
        (r'([A-Z][a-z]+),\s*([A-Z][a-z]+);\s*', r'\1, \2; '),
        # Collapse runs of 3+ line breaks into one blank line (keeps paragraph structure)
        (r'\n\s*\n\s*\n+', '\n\n'),
        # Collapse repeated spaces
        (r' +', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [None]:
def clean_legal_document(text):
    """Apply legal-form regex clean-ups to ``text`` and return the result.

    The substitutions run in the order listed below; the final result is
    stripped of leading and trailing whitespace. A progress line is printed.
    """
    print(" Applying legal document cleaning...")

    substitutions = (
        # Standardize legal formatting
        (r'STATE\s+OF\s+([A-Z]+)\s*\)', r'STATE OF \1)'),
        (r'\)\s*\n\s*\)\s*ss:', r')\n) ss:'),
        # Fix date formatting (any letters directly after the day number are dropped)
        (r'(\d{1,2})\w*\s+day\s+of\s+([A-Z][a-z]+)\s*,?\s*(\d{4})', r'\1 day of \2, \3'),
        # Clean up signature formatting
        (r'\[Signature\]', '[SIGNATURE]'),
        # Fix page references
        (r'Page\s+(\d+)\s+of\s+(\d+)', r'Page \1 of \2'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()



In [None]:
# Apply document-specific cleaning
# Each entry is a (name, original text, cleaned text) tuple used by the later steps
documents_to_clean = []

if uvm_text:
    uvm_type = detect_document_type(uvm_text)
    print(f"\nUVM document detected as: {uvm_type}")

    # Only an "academic" result triggers cleaning here; any other detected
    # type leaves the text unchanged.
    if uvm_type == "academic":
        cleaned_uvm = clean_academic_document(uvm_text)
    else:
        cleaned_uvm = uvm_text

    documents_to_clean.append(("UVM", uvm_text, cleaned_uvm))




UVM document detected as: academic
 Applying academic document cleaning...


WARNING: This section shows how the cleaning choices you make can introduce new errors. The point is to be thoughtful about generalizing an error you find in one place to the entire document.

In [None]:
# STEP 5: OCR error detection and correction or is this error inducing?

# What is wrong with this approach?

def find_ocr_errors(text):
    """Flag fragments of ``text`` that *look like* common OCR mistakes.

    Every check is a plain regex with no dictionary lookup, so ordinary words
    match too (for example "rn" + letter also matches inside "governs").
    Treat the results as candidates to inspect, not as confirmed errors.

    Returns:
        A 4-tuple ``(errors, char_errors, broken_words, suspicious_numbers)``:
        ``errors`` maps 'character_substitutions', 'broken_words' and
        'numbers_in_words' to their counts; the other three are the lists of
        matched fragments behind those counts.
    """
    errors = {}

    # Common OCR character confusions.
    # The whole matched fragment is collected (no capture group), so the
    # examples shown to the reader are recognisable, e.g. 'rns' rather than 's'.
    char_errors = []
    char_errors.extend(re.findall(r'\bl[A-Z]', text))  # l instead of I
    char_errors.extend(re.findall(r'\b0[A-Za-z]', text))  # 0 instead of O
    char_errors.extend(re.findall(r'rn[a-z]', text))  # rn instead of m
    errors['character_substitutions'] = len(char_errors)

    # Broken words (space in middle)
    broken_words = re.findall(r'\b[A-Za-z]{1,2}\s+[a-z]{2,}\b', text)
    errors['broken_words'] = len(broken_words)

    # Numbers in words where they shouldn't be
    number_in_words = re.findall(r'[A-Za-z]+\d+[A-Za-z]*|\d+[A-Za-z]+', text)
    # Filter out valid cases like "5220" or "2011"
    suspicious_numbers = [w for w in number_in_words if not w.isdigit()]
    errors['numbers_in_words'] = len(suspicious_numbers)

    return errors, char_errors, broken_words, suspicious_numbers

In [None]:
def fix_ocr_errors(text):
    """Apply a fixed list of regex "OCR fixes" to ``text`` and return the result.

    The rules run in order and apply to every match in the text, with no check
    that the match really is an OCR error - the trailing "rn" rule, for
    instance, turns "modern" into "modem". One line is printed for each rule
    that matched, showing how many replacements it made.
    """
    print(" Fixing OCR errors...")

    # (pattern, replacement) pairs, applied top to bottom
    rules = (
        # Protect common abbreviations first
        (r'\bPHD\b', 'PhD'),
        (r'\bDSC\b', 'DSc'),

        # Fix l/I confusion (careful with word boundaries)
        (r'\bl([A-Z][a-z])', r'I\1'),

        # Fix 0/O confusion
        (r'\b0([A-Za-z])', r'O\1'),
        (r'([a-z])0\b', r'\1o'),

        # Fix rn/m confusion
        (r'rn([a-z])', r'm\1'),
        (r'([a-z])rn\b', r'\1m'),

        # Fix obvious broken words
        (r'\bU niversity\b', 'University'),
        (r'\bE ngineering\b', 'Engineering'),
        (r'\bD epartment\b', 'Department'),
    )

    for pattern, replacement in rules:
        # subn returns the new text together with the number of replacements
        text, replaced = re.subn(pattern, replacement, text)
        if replaced > 0:
            print(f"   Fixed {replaced} instances of '{pattern}' pattern")

    return text

In [None]:
# Apply OCR fixes to cleaned documents
# Each entry of documents_to_clean is (name, original text, cleaned text)
for doc_name, original, cleaned in documents_to_clean:
    print(f"\n Processing {doc_name} document...")

    # Find errors in cleaned version
    errors, char_errs, broken, suspicious = find_ocr_errors(cleaned)

    print(f"Errors found in {doc_name}:")
    for error_type, count in errors.items():
        print(f"   • {error_type.replace('_', ' ').title()}: {count}")

    # Show a few concrete examples so the counts can be sanity-checked by eye
    if char_errs:
        print(f"   Character errors examples: {char_errs[:3]}")
    if suspicious:
        print(f"   Suspicious words: {suspicious[:3]}")

    # Apply fixes
    fixed_text = fix_ocr_errors(cleaned)

    # Keep the fixed text under a per-document name; only the "UVM" entry is
    # stored, so `fixed_uvm` stays undefined if no UVM document was processed.
    if doc_name == "UVM":
        fixed_uvm = fixed_text


 Processing UVM document...
Errors found in UVM:
   • Character Substitutions: 3
   • Broken Words: 39
   • Numbers In Words: 0
   Character errors examples: ['e', 's', 'i']
 Fixing OCR errors...
   Fixed 10 instances of '\bPHD\b' pattern
   Fixed 1 instances of '\bDSC\b' pattern
   Fixed 3 instances of 'rn([a-z])' pattern
   Fixed 1 instances of '([a-z])rn\b' pattern


In [None]:
# STEP 6: Quality assessment and comparison


def compare_versions(original, cleaned, fixed, doc_name):
    """Print and return how a document's length changed through the pipeline.

    Args:
        original: Text as extracted.
        cleaned: Text after document-specific cleaning.
        fixed: Text after the OCR fixes.
        doc_name: Label used in the printed report.

    Returns:
        A dict with 'original_len', 'cleaned_len', 'fixed_len' (lengths in
        characters) and 'remaining_errors' (sum of all counts that
        find_ocr_errors still reports on ``fixed``).
    """
    original_len, cleaned_len, fixed_len = len(original), len(cleaned), len(fixed)

    print(f"\n {doc_name} Processing Results:")
    print(f"   Original length: {original_len} characters")
    print(f"   After cleaning:  {cleaned_len} characters ({cleaned_len - original_len:+d})")
    print(f"   After OCR fixes: {fixed_len} characters ({fixed_len - cleaned_len:+d})")

    # Count remaining potential issues (first element holds the per-check counts)
    error_counts = find_ocr_errors(fixed)[0]
    total_remaining = sum(error_counts.values())
    print(f"   Remaining potential errors: {total_remaining}")

    return {
        'original_len': original_len,
        'cleaned_len': cleaned_len,
        'fixed_len': fixed_len,
        'remaining_errors': total_remaining,
    }

In [None]:
# Compare all versions
# `results` maps a document name to the statistics dict from compare_versions
results = {}
for doc_name, original, cleaned in documents_to_clean:
    # Only the UVM document has a fixed version, and only if the OCR-fix cell ran
    if doc_name == "UVM" and 'fixed_uvm' in locals():
        results[doc_name] = compare_versions(original, cleaned, fixed_uvm, doc_name)

In [None]:
# STEP 7: Save processed documents

# Folder that receives the processed text and the summary table
output_dir = Path("processed_pdfs")
output_dir.mkdir(exist_ok=True)

saved_files = []

# Write the fully processed text, if the OCR-fix step produced one
if 'fixed_uvm' in locals():
    uvm_output = output_dir / "uvm_electrical_engineering_processed.txt"
    uvm_output.write_text(fixed_uvm, encoding='utf-8')
    saved_files.append(uvm_output)
    print(f" Saved: {uvm_output}")

# One summary row per processed document
summary_data = [
    {
        'document': doc_name,
        'original_length': stats['original_len'],
        'final_length': stats['fixed_len'],
        'change': stats['fixed_len'] - stats['original_len'],
        'remaining_errors': stats['remaining_errors'],
    }
    for doc_name, stats in results.items()
]

# Skip the CSV entirely when nothing was processed
if summary_data:
    summary_df = pd.DataFrame(summary_data)
    summary_file = output_dir / "processing_summary.csv"
    summary_df.to_csv(summary_file, index=False)
    print(f" Saved processing summary: {summary_file}")


# OCR Example 2: PDF to PNG and comparing OCR tools

In [None]:
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
#import

import pymupdf
from PIL import Image
import pytesseract
from pathlib import Path
import io

In [None]:
# STEP 1: When do you need PDF to PNG conversion?
# A single print call; sep="\n" puts each message on its own line.
print(
    "\n STEP 1: When do you need PDF → PNG conversion?",
    "You need this when:",
    "   1. PDF has NO extractable text (scanned documents)",
    "   2. PyMuPDF extracts gibberish or empty text",
    "   3. Document is an image embedded in PDF",
    "   4. You want better OCR quality than embedded text",
    sep="\n",
)


 STEP 1: When do you need PDF → PNG conversion?
You need this when:
   1. PDF has NO extractable text (scanned documents)
   2. PyMuPDF extracts gibberish or empty text
   3. Document is an image embedded in PDF
   4. You want better OCR quality than embedded text


In [None]:
# STEP 2: Check if PDF has extractable text

def check_pdf_text_extractable(pdf_path, min_chars=50):
    """Report whether the first page of a PDF has an extractable text layer.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        min_chars: The page counts as "extractable" only when its stripped
            text is longer than this many characters. The default of 50 is
            an arbitrary heuristic.

    Returns:
        True when the first page yields more than ``min_chars`` characters.
        False when it yields fewer, or when the PDF could not be opened or
        read (the error is printed, not raised).
    """
    try:
        doc = pymupdf.open(pdf_path)
        try:
            # Only the first page is sampled
            text = doc.load_page(0).get_text().strip()
        finally:
            # Release the document even when the page cannot be loaded or read
            doc.close()
    except Exception as e:
        print(f" Error checking PDF: {e}")
        return False

    if len(text) > min_chars:
        print(f" PDF has extractable text ({len(text)} characters)")
        print(f"   Preview: '{text[:100]}...'")
        return True

    print(f" PDF has little/no extractable text ({len(text)} characters)")
    return False

# Test with your PDF
pdf_path = "/content/electricalengineering.pdf"  # Your scanned/image PDF

print(f"\n Checking {pdf_path}...")

# A missing file must not be mistaken for "scanned PDF without text":
# check_pdf_text_extractable returns False for both cases.
if not Path(pdf_path).is_file():
    has_text = False
    print(f"   → File not found: {pdf_path} (upload it or fix pdf_path before continuing)")
else:
    has_text = check_pdf_text_extractable(pdf_path)

    if has_text:
        print("   → You probably DON'T need PNG conversion")
    else:
        print("   → You NEED PNG conversion for OCR")


 Checking /content/electricalengineering.pdf...
 Error checking PDF: no such file: '/content/electricalengineering.pdf'
   → You NEED PNG conversion for OCR


In [None]:
# STEP 3: Convert PDF page to PNG

def pdf_page_to_png(pdf_path, page_num=0, dpi=300):
    """Render one PDF page to a PIL image via an in-memory PNG.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        page_num: Zero-based index of the page to render.
        dpi: Target resolution. Higher DPI = better quality but larger image.

    Returns:
        The rendered page as a PIL Image, or None when anything fails
        (the error is printed, not raised).
    """
    try:
        doc = pymupdf.open(pdf_path)
        try:
            page = doc.load_page(page_num)

            # 72 is the default DPI, so dpi/72 is the zoom factor on both axes
            zoom = dpi / 72
            pix = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom))
            img_data = pix.tobytes("png")
        finally:
            # Release the document even when loading or rendering the page fails
            doc.close()

        # Decode the PNG bytes into a PIL Image
        img = Image.open(io.BytesIO(img_data))

        print(f" Converted page {page_num} to {img.size[0]}x{img.size[1]} PNG")
        return img

    except Exception as e:
        print(f" Error converting PDF: {e}")
        return None

In [None]:
def save_png(img, output_path):
    """Write a PIL image to ``output_path`` as PNG and print the saved path."""
    img.save(output_path, format="PNG")
    print(f" Saved PNG: {output_path}")

# Convert first page to PNG
print("Converting PDF page to PNG...")

# Note: In live demo, make sure you have a scanned PDF to use
# pdf_page_to_png prints its own errors and returns None instead of raising,
# so failure has to be detected from the return value as well as via `except`.
png_path = None
try:
    # Convert page to image
    img = pdf_page_to_png(pdf_path, page_num=0, dpi=300)

    if img is not None:
        # Save the PNG
        save_png(img, "page_0.png")
        png_path = "page_0.png"

        print(f" Image dimensions: {img.size[0]} x {img.size[1]} pixels")

except Exception as e:
    print(f" Error during conversion: {e}")

# Fall back to a placeholder path so that png_path is always defined afterwards
if png_path is None:
    print(f" Could not convert PDF (make sure {pdf_path} exists)")
    print("For demo purposes, we'll simulate having a PNG image...")
    png_path = "sample_page.png"  # You'd have this from conversion


In [None]:
# STEP 4: OCR the PNG with Tesseract

def ocr_image_with_tesseract(image_path):
    """Run Tesseract on an image file with two configurations and keep the longer text.

    Args:
        image_path: Path of the image to open with PIL.

    Returns:
        The custom-config text when it is strictly longer than the
        default-config text, otherwise the default-config text. None when
        opening the image or running OCR fails (the error is printed, not
        raised).
    """
    try:
        # The context manager releases the image's file handle once OCR is done
        with Image.open(image_path) as img:
            # Default OCR
            text_default = pytesseract.image_to_string(img)

            # OCR Engine Mode 3, Page Segmentation Mode 6
            custom_config = r'--oem 3 --psm 6'
            text_custom = pytesseract.image_to_string(img, config=custom_config)

    except Exception as e:
        print(f" OCR error: {e}")
        print("   Make sure Tesseract is installed: apt-get install tesseract-ocr")
        return None

    print(" OCR completed")
    print(f"   Default config: {len(text_default)} characters")
    print(f"   Custom config:  {len(text_custom)} characters")

    # Length is used as a rough proxy for "better result"; ties keep the default
    return text_custom if len(text_custom) > len(text_default) else text_default

# Run OCR on the PNG
print("Running Tesseract OCR on PNG...")

# Defined up front so the name exists even when OCR cannot be run at all
ocr_text = None
try:
    ocr_text = ocr_image_with_tesseract(png_path)
except Exception as e:
    print(f" Could not run OCR: {e}")

if ocr_text:
    print(f"\n OCR Results:")
    print(f"   Length: {len(ocr_text)} characters")
    print(f"   Preview: '{ocr_text[:200]}...'")
else:
    print("   No OCR text was extracted")



In [None]:
# STEP 5: Compare methods and when to use each

print("\n STEP 5: Comparing extraction methods")

def compare_extraction_methods(pdf_path, ocr_result=None):
    """
    Compare direct PDF text extraction vs OCR

    Args:
        pdf_path: Path of the PDF whose first page is read with PyMuPDF.
        ocr_result: OCR text to compare against. When omitted, the notebook
            global ``ocr_text`` is used if it exists, otherwise None.

    Returns:
        dict with the keys 'direct' and 'ocr'. Each value is a dict holding
        'text', 'length' (character count, 0 when there is no text) and
        'method' (a human-readable label). A failed direct extraction is
        reported as empty text with a "(failed)" label instead of raising.
    """
    results = {}

    # Method 1: Direct PDF extraction
    try:
        doc = pymupdf.open(pdf_path)
        try:
            direct_text = doc[0].get_text()
        finally:
            # Release the document even when the page cannot be read
            doc.close()
        results['direct'] = {
            'text': direct_text,
            'length': len(direct_text),
            'method': 'PyMuPDF direct extraction'
        }
    except Exception:
        results['direct'] = {
            'text': '',
            'length': 0,
            'method': 'PyMuPDF direct extraction (failed)'
        }

    # Method 2: OCR (computed earlier; fall back to the global without a NameError)
    if ocr_result is None:
        ocr_result = globals().get('ocr_text')
    results['ocr'] = {
        'text': ocr_result,
        'length': len(ocr_result) if ocr_result else 0,
        'method': 'Tesseract OCR on PNG'
    }

    return results

# Run the comparison and print one short report per extraction method
print("Comparing extraction methods...")

try:
    comparison = compare_extraction_methods(pdf_path)

    print("\n Method Comparison:")
    for method, data in comparison.items():
        print(f"   {data['method']}:")
        print(f"      Length: {data['length']} characters")

        # Nothing to preview when the method produced no text
        if not data['length'] > 0:
            print("      Result: No text extracted")
            continue

        # Flatten newlines so the preview stays on a single line
        preview = data['text'][:100].replace('\n', ' ')
        print(f"      Preview: '{preview}...'")

except Exception as e:
    print(f" Could not compare methods: {e}")