<a href="https://colab.research.google.com/github/nadakhaledamohamed/AttWeb_API/blob/master/Bookindexing_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pypdf2 pdfplumber spacy
!python -m spacy download en_core_web_sm

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m3.

In [4]:
import pdfplumber
import spacy
from collections import Counter
import re

# Load English model and increase max_length to handle large documents
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2_000_000  # You can increase this further if needed

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
    return text

def analyze_text(text, chunk_size=100_000):
    all_places = []
    all_people = []
    all_words = []

    # Split text into smaller chunks
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i + chunk_size]
        doc = nlp(chunk)

        # Extract entities
        places = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
        people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

        # Extract keywords
        words = [token.text.lower() for token in doc
                 if not token.is_stop and not token.is_punct and token.is_alpha]

        all_places.extend(places)
        all_people.extend(people)
        all_words.extend(words)

    # Count most common keywords
    keywords = Counter(all_words).most_common(50)

    return {
        "places": list(set(all_places)),
        "people": list(set(all_people)),
        "keywords": keywords
    }

# Example usage
pdf_text = extract_text_from_pdf("file.pdf")
results = analyze_text(pdf_text)

# Display results
print("Key Places:", results["places"][:20])
print("\nKey People:", results["people"][:20])
print("\nTop Keywords:", results["keywords"][:20])


Key Places: ['pp.', 'Ytalicas', 'Arzana', 'G. Pissarello', 'Greeks', 'http://comunesarule.it/f_testo_sarule_contu8', 'Marinid', 'Virardu', 'Madbooli', 'Sanna', 'Tripoli', 'Tokyo', 'Mùrsia', 'Priami', 'Şiqilliyya', 'Martin', 'dealing.36', 'Ḥassān', 'Hakkert', 'Studi Medievali']

Key People: ['Zanichelli', 'Ibn Shādhān', 'Ibn Żafar al-Şiqilli', 'Scienze Lettere', 'Al-Sharīf', 'Greeks', 'Dār al-Wafā’ Publishing', 'Ibn Bashkawāl', 'ḥatta al-ghazw al-normandi', 'al-Ṭurūs', 'al-nashāţ', 'Maktabat al-Jāmi‘a al-‘Arabiyya', 'Ingrid Bijarano', 'Gutenberg', 'Sūsa', 'Abu\nAli al-Ḥasan', 'Mùrsia', 'Koder', 'Ulama al-Qayrawān', 'Fatāwa al-Barzali']

Top Keywords: [('al', 6163), ('ibn', 2005), ('sicily', 1870), ('arabic', 878), ('op', 776), ('cit', 776), ('sicilian', 636), ('muslim', 630), ('arab', 614), ('ed', 538), ('pp', 500), ('poetry', 480), ('wa', 452), ('muslims', 418), ('fī', 406), ('islamic', 402), ('abu', 358), ('dār', 356), ('norman', 355), ('idrīsi', 349)]


In [6]:
import pdfplumber
import spacy
from collections import defaultdict, Counter
import re

# Load English model and increase max_length to handle large documents
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2_000_000  # You can increase this further if needed

def extract_text_with_page_numbers(pdf_path):
    """Extract text with page number tracking"""
    pages_content = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                pages_content.append({
                    "page": page_num,
                    "text": page_text
                })
    return pages_content

def analyze_text_with_page_numbers(pages_content, chunk_size=100_000):
    """Analyze text while preserving page number information"""
    word_pages = defaultdict(list)
    place_pages = defaultdict(list)
    person_pages = defaultdict(list)
    all_words = []

    for page in pages_content:
        page_num = page["page"]
        text = page["text"]

        # Process in chunks if needed (though now we're processing page by page)
        for i in range(0, len(text), chunk_size):
            chunk = text[i:i + chunk_size]
            doc = nlp(chunk)

            # Extract entities with page numbers
            for ent in doc.ents:
                if ent.label_ == "GPE":
                    place_pages[ent.text].append(page_num)
                elif ent.label_ == "PERSON":
                    person_pages[ent.text].append(page_num)

            # Extract keywords with page numbers
            for token in doc:
                if not token.is_stop and not token.is_punct and token.is_alpha:
                    word = token.text.lower()
                    word_pages[word].append(page_num)
                    all_words.append(word)

    # Count most common keywords with page numbers
    word_counts = Counter(all_words)
    keywords_with_pages = [
        (word, count, sorted(list(set(word_pages[word]))))
        for word, count in word_counts.most_common(50)
    ]

    # Prepare places and people with page numbers
    places_with_pages = [
        (place, sorted(list(set(pages))))
        for place, pages in place_pages.items()
    ]

    people_with_pages = [
        (person, sorted(list(set(pages))))
        for person, pages in person_pages.items()
    ]

    return {
        "places": sorted(places_with_pages, key=lambda x: len(x[1]), reverse=True),
        "people": sorted(people_with_pages, key=lambda x: len(x[1]), reverse=True),
        "keywords": keywords_with_pages
    }

def print_results(results, max_items=20):
    """Print results with page numbers"""
    print("Key Places (with page numbers):")
    for place, pages in results["places"][:max_items]:
        print(f"- {place} (pages: {', '.join(map(str, pages))})")

    print("\nKey People (with page numbers):")
    for person, pages in results["people"][:max_items]:
        print(f"- {person} (pages: {', '.join(map(str, pages))})")

    print("\nTop Keywords (with page numbers):")
    for word, count, pages in results["keywords"][:max_items]:
        print(f"- {word} (count: {count}, pages: {', '.join(map(str, pages))})")

# Example usage
print("Extracting text with page numbers...")
pages_content = extract_text_with_page_numbers("file.pdf")

print("\nAnalyzing text with page tracking...")
results = analyze_text_with_page_numbers(pages_content)

print("\nResults:")
print_results(results)

Extracting text with page numbers...

Analyzing text with page tracking...

Results:
Key Places (with page numbers):
- Sicily (pages: 7, 8, 9, 10, 11, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 48, 49, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 88, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 119, 121, 122, 124, 125, 126, 127, 129, 131, 136, 137, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 157, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 218, 219, 220, 221, 222, 223, 225, 229, 230, 231, 232, 234, 235, 236, 237, 238, 24

In [3]:
!pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
import pdfplumber
import spacy
from collections import defaultdict, Counter
import re

# Load English model and increase max_length to handle large documents
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2_000_000  # You can increase this further if needed

def extract_text_with_page_numbers(pdf_path):
    """Extract text with page number tracking"""
    pages_content = []
    total_words = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                word_count = len(page_text.split())
                total_words += word_count
                pages_content.append({
                    "page": page_num,
                    "text": page_text,
                    "word_count": word_count
                })
    return pages_content, total_words

def analyze_text_with_page_numbers(pages_content, chunk_size=100_000):
    """Analyze text while preserving page number information"""
    word_pages = defaultdict(list)
    word_counts = Counter()
    place_pages = defaultdict(list)
    place_counts = Counter()
    person_pages = defaultdict(list)
    person_counts = Counter()
    all_words = []

    for page in pages_content:
        page_num = page["page"]
        text = page["text"]

        # Process in chunks if needed
        for i in range(0, len(text), chunk_size):
            chunk = text[i:i + chunk_size]
            doc = nlp(chunk)

            # Extract entities with page numbers and counts
            for ent in doc.ents:
                if ent.label_ == "GPE":
                    place_pages[ent.text].append(page_num)
                    place_counts[ent.text] += 1
                elif ent.label_ == "PERSON":
                    person_pages[ent.text].append(page_num)
                    person_counts[ent.text] += 1

            # Extract keywords with page numbers and counts
            for token in doc:
                if not token.is_stop and not token.is_punct and token.is_alpha:
                    word = token.text.lower()
                    word_pages[word].append(page_num)
                    word_counts[word] += 1
                    all_words.append(word)

    # Prepare keywords with counts and page numbers
    keywords_with_pages = [
        (word, word_counts[word], sorted(list(set(word_pages[word]))))
        for word, _ in word_counts.most_common(50)
    ]

    # Prepare places with counts and page numbers
    places_with_pages = [
        (place, place_counts[place], sorted(list(set(pages))))
        for place, pages in place_pages.items()
    ]

    # Prepare people with counts and page numbers
    people_with_pages = [
        (person, person_counts[person], sorted(list(set(pages))))
        for person, pages in person_pages.items()
    ]

    return {
        "total_words": sum(word_counts.values()),
        "unique_words": len(word_counts),
        "places": sorted(places_with_pages, key=lambda x: x[1], reverse=True),
        "people": sorted(people_with_pages, key=lambda x: x[1], reverse=True),
        "keywords": keywords_with_pages
    }

def print_results(results, max_items=20):
    """Print results with counts and page numbers"""
    # Print summary statistics
    print(f"\nDocument Statistics:")
    print(f"- Total words processed: {results['total_words']:,}")
    print(f"- Unique words found: {results['unique_words']:,}")

    # Print places with counts
    print("\nKey Places (with counts and page numbers):")
    for place, count, pages in results["places"][:max_items]:
        print(f"- {place} (appears {count} times, pages: {', '.join(map(str, pages))})")

    # Print people with counts
    print("\nKey People (with counts and page numbers):")
    for person, count, pages in results["people"][:max_items]:
        print(f"- {person} (appears {count} times, pages: {', '.join(map(str, pages))})")

    # Print keywords with counts
    print("\nTop Keywords (with counts and page numbers):")
    for word, count, pages in results["keywords"][:max_items]:
        print(f"- {word} (appears {count} times, pages: {', '.join(map(str, pages))})")

# Example usage
print("Extracting text with page numbers...")
pages_content, total_doc_words = extract_text_with_page_numbers("file.pdf")

print("\nAnalyzing text with page tracking...")
results = analyze_text_with_page_numbers(pages_content)

print("\nResults:")
print(f"Total words in document: {total_doc_words:,}")
print_results(results)

Extracting text with page numbers...

Analyzing text with page tracking...

Results:
Total words in document: 222,274

Document Statistics:
- Total words processed: 121,577
- Unique words found: 17,467

Key Places (with counts and page numbers):
- Sicily (appears 1157 times, pages: 7, 8, 9, 10, 11, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 48, 49, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 88, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 119, 121, 122, 124, 125, 126, 127, 129, 131, 136, 137, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 157, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203

In [9]:
import pdfplumber
import spacy
from collections import defaultdict, Counter
import re

# Load English model and increase max_length to handle large documents
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2_000_000  # You can increase this further if needed

def extract_text_with_page_numbers(pdf_path):
    """Extract text with page number tracking"""
    pages_content = []
    total_words = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                word_count = len(page_text.split())
                total_words += word_count
                pages_content.append({
                    "page": page_num,
                    "text": page_text,
                    "word_count": word_count
                })
    return pages_content, total_words

def analyze_text_with_page_numbers(pages_content, chunk_size=100_000):
    """Analyze text while preserving page number information"""
    word_pages = defaultdict(list)
    word_counts = Counter()
    place_pages = defaultdict(list)
    place_counts = Counter()
    person_pages = defaultdict(list)
    person_counts = Counter()
    all_words = []

    for page in pages_content:
        page_num = page["page"]
        text = page["text"]

        # Process in chunks if needed
        for i in range(0, len(text), chunk_size):
            chunk = text[i:i + chunk_size]
            doc = nlp(chunk)

            # Extract entities with page numbers and counts
            for ent in doc.ents:
                if ent.label_ == "GPE":
                    place_pages[ent.text].append(page_num)
                    place_counts[ent.text] += 1
                elif ent.label_ == "PERSON":
                    person_pages[ent.text].append(page_num)
                    person_counts[ent.text] += 1

            # Extract keywords with page numbers and counts
            for token in doc:
                if not token.is_stop and not token.is_punct and token.is_alpha:
                    word = token.text.lower()
                    word_pages[word].append(page_num)
                    word_counts[word] += 1
                    all_words.append(word)

    # Calculate total counts for each category
    total_places = sum(place_counts.values()) or 1  # Avoid division by zero
    total_people = sum(person_counts.values()) or 1
    total_keywords = sum(word_counts.values()) or 1

    # Prepare keywords with counts, percentages, and page numbers
    keywords_with_pages = [
        (word, word_counts[word], (word_counts[word]/total_keywords)*100, sorted(list(set(word_pages[word]))))
        for word, _ in word_counts.most_common(50)
    ]

    # Prepare places with counts, percentages, and page numbers
    places_with_pages = [
        (place, place_counts[place], (place_counts[place]/total_places)*100, sorted(list(set(pages))))
        for place, pages in place_pages.items()
    ]

    # Prepare people with counts, percentages, and page numbers
    people_with_pages = [
        (person, person_counts[person], (person_counts[person]/total_people)*100, sorted(list(set(pages))))
        for person, pages in person_pages.items()
    ]

    return {
        "total_words": sum(word_counts.values()),
        "unique_words": len(word_counts),
        "places": sorted(places_with_pages, key=lambda x: x[1], reverse=True),
        "people": sorted(people_with_pages, key=lambda x: x[1], reverse=True),
        "keywords": keywords_with_pages,
        "total_places": total_places,
        "total_people": total_people,
        "total_keywords": total_keywords
    }

def print_results(results, max_items=20):
    """Print results with counts, percentages, and page numbers"""
    # Print summary statistics
    print(f"\nDocument Statistics:")
    print(f"- Total words processed: {results['total_words']:,}")
    print(f"- Unique words found: {results['unique_words']:,}")

    # Print places with counts and percentages
    print(f"\nKey Places (appear {results['total_places']} times total):")
    for place, count, percent, pages in results["places"][:max_items]:
        print(f"- {place} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

    # Print people with counts and percentages
    print(f"\nKey People (appear {results['total_people']} times total):")
    for person, count, percent, pages in results["people"][:max_items]:
        print(f"- {person} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

    # Print keywords with counts and percentages
    print(f"\nTop Keywords (appear {results['total_keywords']} times total):")
    for word, count, percent, pages in results["keywords"][:max_items]:
        print(f"- {word} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

# Example usage
print("Extracting text with page numbers...")
pages_content, total_doc_words = extract_text_with_page_numbers("file.pdf")

print("\nAnalyzing text with page tracking...")
results = analyze_text_with_page_numbers(pages_content)

print("\nResults:")
print(f"Total words in document: {total_doc_words:,}")
print_results(results)

Extracting text with page numbers...

Analyzing text with page tracking...

Results:
Total words in document: 222,274

Document Statistics:
- Total words processed: 121,577
- Unique words found: 17,467

Key Places (appear 5418 times total):
- Sicily (appears 1157 times, 21.4%, pages: 7, 8, 9, 10, 11, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 48, 49, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 88, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 119, 121, 122, 124, 125, 126, 127, 129, 131, 136, 137, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 157, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 2

In [10]:
import pdfplumber
import spacy
from collections import defaultdict, Counter
import re

# Load English model and increase max_length to handle large documents
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2_000_000  # You can increase this further if needed

def extract_text_with_page_numbers(pdf_path):
    """Extract text with page number tracking"""
    pages_content = []
    total_words = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                word_count = len(page_text.split())
                total_words += word_count
                pages_content.append({
                    "page": page_num,
                    "text": page_text,
                    "word_count": word_count
                })
    return pages_content, total_words

def analyze_text_with_page_numbers(pages_content, chunk_size=100_000):
    """Analyze text while preserving page number information"""
    word_pages = defaultdict(list)
    word_counts = Counter()
    place_pages = defaultdict(list)
    place_counts = Counter()
    person_pages = defaultdict(list)
    person_counts = Counter()
    all_words = []

    for page in pages_content:
        page_num = page["page"]
        text = page["text"]

        # Process in chunks if needed
        for i in range(0, len(text), chunk_size):
            chunk = text[i:i + chunk_size]
            doc = nlp(chunk)

            # Extract entities with page numbers and counts
            for ent in doc.ents:
                if ent.label_ == "GPE":
                    place_pages[ent.text].append(page_num)
                    place_counts[ent.text] += 1
                elif ent.label_ == "PERSON":
                    person_pages[ent.text].append(page_num)
                    person_counts[ent.text] += 1

            # Extract keywords with page numbers and counts
            for token in doc:
                if not token.is_stop and not token.is_punct and token.is_alpha:
                    word = token.text.lower()
                    word_pages[word].append(page_num)
                    word_counts[word] += 1
                    all_words.append(word)

    # Calculate total counts for each category
    total_places = sum(place_counts.values()) or 1  # Avoid division by zero
    total_people = sum(person_counts.values()) or 1
    total_keywords = sum(word_counts.values()) or 1

    # Prepare keywords with counts, percentages, and page numbers
    keywords_with_pages = [
        (word, word_counts[word], (word_counts[word]/total_keywords)*100, sorted(list(set(word_pages[word]))))
        for word, _ in word_counts.most_common(50)
    ]

    # Prepare places with counts, percentages, and page numbers
    places_with_pages = [
        (place, place_counts[place], (place_counts[place]/total_places)*100, sorted(list(set(pages))))
        for place, pages in place_pages.items()
    ]

    # Prepare people with counts, percentages, and page numbers
    people_with_pages = [
        (person, person_counts[person], (person_counts[person]/total_people)*100, sorted(list(set(pages))))
        for person, pages in person_pages.items()
    ]

    return {
        "total_words": sum(word_counts.values()),
        "unique_words": len(word_counts),
        "places": sorted(places_with_pages, key=lambda x: x[1], reverse=True),
        "people": sorted(people_with_pages, key=lambda x: x[1], reverse=True),
        "keywords": keywords_with_pages,
        "total_places": total_places,
        "total_people": total_people,
        "total_keywords": total_keywords
    }

def print_results(results, max_items=20):
    """Print results with counts, percentages, and page numbers"""
    # Calculate unique words percentage
    unique_words_percentage = (results['unique_words'] / results['total_words']) * 100

    # Print summary statistics
    print(f"\nDocument Statistics:")
    print(f"- Total words processed: {results['total_words']:,}")
    print(f"- Unique words found: {results['unique_words']:,} ({unique_words_percentage:.1f}% of total words)")

    # Print places with counts and percentages
    print(f"\nKey Places (appear {results['total_places']} times total):")
    for place, count, percent, pages in results["places"][:max_items]:
        print(f"- {place} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

    # Print people with counts and percentages
    print(f"\nKey People (appear {results['total_people']} times total):")
    for person, count, percent, pages in results["people"][:max_items]:
        print(f"- {person} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

    # Print keywords with counts and percentages
    print(f"\nTop Keywords (appear {results['total_keywords']} times total):")
    for word, count, percent, pages in results["keywords"][:max_items]:
        print(f"- {word} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

# Example usage
print("Extracting text with page numbers...")
pages_content, total_doc_words = extract_text_with_page_numbers("file.pdf")

print("\nAnalyzing text with page tracking...")
results = analyze_text_with_page_numbers(pages_content)

print("\nResults:")
print(f"Total words in document: {total_doc_words:,}")
print_results(results)


Extracting text with page numbers...

Analyzing text with page tracking...

Results:
Total words in document: 222,274

Document Statistics:
- Total words processed: 121,577
- Unique words found: 17,467 (14.4% of total words)

Key Places (appear 5418 times total):
- Sicily (appears 1157 times, 21.4%, pages: 7, 8, 9, 10, 11, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 48, 49, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 88, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 119, 121, 122, 124, 125, 126, 127, 129, 131, 136, 137, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 157, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 189, 190, 191, 192, 193, 194, 195, 196, 197

In [11]:
import pdfplumber
import spacy
from collections import defaultdict, Counter
import re

# Load English model and increase max_length to handle large documents
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2_000_000  # You can increase this further if needed

def extract_text_with_page_numbers(pdf_path):
    """Extract text with page number tracking"""
    pages_content = []
    total_words = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                word_count = len(page_text.split())
                total_words += word_count
                pages_content.append({
                    "page": page_num,
                    "text": page_text,
                    "word_count": word_count
                })
    return pages_content, total_words

def analyze_text_with_page_numbers(pages_content, chunk_size=100_000):
    """Analyze text while preserving page number information"""
    word_pages = defaultdict(list)
    word_counts = Counter()
    place_pages = defaultdict(list)
    place_counts = Counter()
    person_pages = defaultdict(list)
    person_counts = Counter()
    all_words = []
    processed_words_count = 0  # Track total words processed by spaCy

    for page in pages_content:
        page_num = page["page"]
        text = page["text"]

        # Process in chunks if needed
        for i in range(0, len(text), chunk_size):
            chunk = text[i:i + chunk_size]
            doc = nlp(chunk)

            # Count all tokens that are words (not punctuation, not spaces)
            processed_words_count += len([token for token in doc if not token.is_punct and not token.is_space])

            # Extract entities with page numbers and counts
            for ent in doc.ents:
                if ent.label_ == "GPE":
                    place_pages[ent.text].append(page_num)
                    place_counts[ent.text] += 1
                elif ent.label_ == "PERSON":
                    person_pages[ent.text].append(page_num)
                    person_counts[ent.text] += 1

            # Extract keywords with page numbers and counts
            for token in doc:
                if not token.is_stop and not token.is_punct and token.is_alpha:
                    word = token.text.lower()
                    word_pages[word].append(page_num)
                    word_counts[word] += 1
                    all_words.append(word)

    # Calculate total counts for each category
    total_places = sum(place_counts.values()) or 1  # Avoid division by zero
    total_people = sum(person_counts.values()) or 1
    total_keywords = sum(word_counts.values()) or 1

    # Prepare keywords with counts, percentages, and page numbers
    keywords_with_pages = [
        (word, word_counts[word], (word_counts[word]/total_keywords)*100, sorted(list(set(word_pages[word]))))
        for word, _ in word_counts.most_common(50)
    ]

    # Prepare places with counts, percentages, and page numbers
    places_with_pages = [
        (place, place_counts[place], (place_counts[place]/total_places)*100, sorted(list(set(pages))))
        for place, pages in place_pages.items()
    ]

    # Prepare people with counts, percentages, and page numbers
    people_with_pages = [
        (person, person_counts[person], (person_counts[person]/total_people)*100, sorted(list(set(pages))))
        for person, pages in person_pages.items()
    ]

    return {
        "total_words_in_document": sum(page["word_count"] for page in pages_content),
        "total_words_processed": processed_words_count,
        "unique_words_in_document": len(word_counts),
        "unique_words_processed": len(set(all_words)),
        "places": sorted(places_with_pages, key=lambda x: x[1], reverse=True),
        "people": sorted(people_with_pages, key=lambda x: x[1], reverse=True),
        "keywords": keywords_with_pages,
        "total_places": total_places,
        "total_people": total_people,
        "total_keywords": total_keywords
    }

def print_results(results, max_items=20):
    """Print results with counts, percentages, and page numbers"""
    # Calculate percentages
    words_processed_percentage = (results['total_words_processed'] / results['total_words_in_document']) * 100
    unique_words_percentage = (results['unique_words_processed'] / results['unique_words_in_document']) * 100 if results['unique_words_in_document'] > 0 else 0

    # Print summary statistics
    print(f"\nDocument Statistics:")
    print(f"- Total words in document: {results['total_words_in_document']:,}")
    print(f"- Total words processed: {results['total_words_processed']:,} ({words_processed_percentage:.1f}% of document)")
    print(f"- Unique words in document: {results['unique_words_in_document']:,}")
    print(f"- Unique words processed: {results['unique_words_processed']:,} ({unique_words_percentage:.1f}% of document unique words)")

    # Print places with counts and percentages
    print(f"\nKey Places (appear {results['total_places']} times total):")
    for place, count, percent, pages in results["places"][:max_items]:
        print(f"- {place} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

    # Print people with counts and percentages
    print(f"\nKey People (appear {results['total_people']} times total):")
    for person, count, percent, pages in results["people"][:max_items]:
        print(f"- {person} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

    # Print keywords with counts and percentages
    print(f"\nTop Keywords (appear {results['total_keywords']} times total):")
    for word, count, percent, pages in results["keywords"][:max_items]:
        print(f"- {word} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

# Example usage
print("Extracting text with page numbers...")
pages_content, total_doc_words = extract_text_with_page_numbers("file.pdf")

print("\nAnalyzing text with page tracking...")
results = analyze_text_with_page_numbers(pages_content)

print("\nResults:")
print_results(results)

Extracting text with page numbers...

Analyzing text with page tracking...

Results:

Document Statistics:
- Total words in document: 222,274
- Total words processed: 231,172 (104.0% of document)
- Unique words in document: 17,467
- Unique words processed: 17,467 (100.0% of document unique words)

Key Places (appear 5418 times total):
- Sicily (appears 1157 times, 21.4%, pages: 7, 8, 9, 10, 11, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 48, 49, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 88, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 119, 121, 122, 124, 125, 126, 127, 129, 131, 136, 137, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 157, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 

In [12]:
import pdfplumber
import spacy
from collections import defaultdict, Counter
import re

# Load English model and increase max_length to handle large documents
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2_000_000  # You can increase this further if needed

def extract_text_with_page_numbers(pdf_path):
    """Extract text with page number tracking"""
    pages_content = []
    total_words = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                word_count = len(page_text.split())
                total_words += word_count
                pages_content.append({
                    "page": page_num,
                    "text": page_text,
                    "word_count": word_count
                })
    return pages_content, total_words

def analyze_text_with_page_numbers(pages_content, chunk_size=100_000):
    """Analyze text while preserving page number information"""
    word_pages = defaultdict(list)
    word_counts = Counter()
    place_pages = defaultdict(list)
    place_counts = Counter()
    person_pages = defaultdict(list)
    person_counts = Counter()
    all_words = []
    processed_words_count = 0  # Track total words processed by spaCy

    for page in pages_content:
        page_num = page["page"]
        text = page["text"]

        # Process in chunks if needed
        for i in range(0, len(text), chunk_size):
            chunk = text[i:i + chunk_size]
            doc = nlp(chunk)

            # Count all tokens that are words (not punctuation, not spaces)
            processed_words_count += len([token for token in doc if not token.is_punct and not token.is_space])

            # Extract entities with page numbers and counts
            for ent in doc.ents:
                if ent.label_ == "GPE":
                    place_pages[ent.text].append(page_num)
                    place_counts[ent.text] += 1
                elif ent.label_ == "PERSON":
                    person_pages[ent.text].append(page_num)
                    person_counts[ent.text] += 1

            # Extract keywords with page numbers and counts
            for token in doc:
                if not token.is_stop and not token.is_punct and token.is_alpha:
                    word = token.text.lower()
                    word_pages[word].append(page_num)
                    word_counts[word] += 1
                    all_words.append(word)

    # Calculate total counts for each category
    total_places = sum(place_counts.values()) or 1  # Avoid division by zero
    total_people = sum(person_counts.values()) or 1
    total_keywords = sum(word_counts.values()) or 1

    # Prepare keywords with counts, percentages, and page numbers
    keywords_with_pages = [
        (word, word_counts[word], (word_counts[word]/total_keywords)*100, sorted(list(set(word_pages[word]))))
        for word, _ in word_counts.most_common(50)
    ]

    # Prepare places with counts, percentages, and page numbers
    places_with_pages = [
        (place, place_counts[place], (place_counts[place]/total_places)*100, sorted(list(set(pages))))
        for place, pages in place_pages.items()
    ]

    # Prepare people with counts, percentages, and page numbers
    people_with_pages = [
        (person, person_counts[person], (person_counts[person]/total_people)*100, sorted(list(set(pages))))
        for person, pages in person_pages.items()
    ]

    return {
        "total_words_in_document": sum(page["word_count"] for page in pages_content),
        "total_words_processed": processed_words_count,
        "unique_words_in_document": len(word_counts),
        "unique_words_processed": len(set(all_words)),
        "places": sorted(places_with_pages, key=lambda x: x[1], reverse=True),
        "people": sorted(people_with_pages, key=lambda x: x[1], reverse=True),
        "keywords": keywords_with_pages,
        "total_places": total_places,
        "total_people": total_people,
        "total_keywords": total_keywords
    }

def print_results(results, max_items=20):
    """Print results with counts, percentages, and page numbers"""
    # Calculate percentages
    words_processed_percentage = (results['total_words_processed'] / results['total_words_in_document']) * 100
    unique_words_percentage = (results['unique_words_processed'] / results['unique_words_in_document']) * 100 if results['unique_words_in_document'] > 0 else 0

    # Print summary statistics
    print(f"\nDocument Statistics:")
    print(f"- Total words in document: {results['total_words_in_document']:,}")
    print(f"- Total words processed: {results['total_words_processed']:,} ({words_processed_percentage:.1f}% of document)")
    print(f"- Unique words in document: {results['unique_words_in_document']:,}")
    print(f"- Unique words processed: {results['unique_words_processed']:,} ({unique_words_percentage:.1f}% of document unique words)")

    # Print places with counts and percentages
    print(f"\nKey Places (appear {results['total_places']} times total):")
    for place, count, percent, pages in results["places"][:max_items]:
        print(f"- {place} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

    # Print people with counts and percentages
    print(f"\nKey People (appear {results['total_people']} times total):")
    for person, count, percent, pages in results["people"][:max_items]:
        print(f"- {person} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

    # Print keywords with counts and percentages
    print(f"\nTop Keywords (appear {results['total_keywords']} times total):")
    for word, count, percent, pages in results["keywords"][:max_items]:
        print(f"- {word} (appears {count} times, {percent:.1f}%, pages: {', '.join(map(str, pages))})")

# Example usage
print("Extracting text with page numbers...")
pages_content, total_doc_words = extract_text_with_page_numbers("file.pdf")

print("\nAnalyzing text with page tracking...")
results = analyze_text_with_page_numbers(pages_content)

print("\nResults:")
print_results(results)

Extracting text with page numbers...

Analyzing text with page tracking...

Results:

Document Statistics:
- Total words in document: 222,274
- Total words processed: 231,172 (104.0% of document)
- Unique words in document: 17,467
- Unique words processed: 17,467 (100.0% of document unique words)

Key Places (appear 5418 times total):
- Sicily (appears 1157 times, 21.4%, pages: 7, 8, 9, 10, 11, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 48, 49, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 88, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 119, 121, 122, 124, 125, 126, 127, 129, 131, 136, 137, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 157, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 

In [20]:
import pdfplumber
import spacy
from collections import defaultdict, Counter
import string
import time
from typing import List, Dict, Tuple, Set

# Load English model with optimized pipeline
nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp.max_length = 3_000_000  # Increased max length for large documents

def extract_text_with_page_numbers(pdf_path: str) -> Tuple[List[Dict], int]:
    """Extract text with page number tracking with progress updates"""
    pages_content = []
    total_words = 0
    print("Extracting text from PDF...")
    start_time = time.time()

    try:
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                if page_num % 10 == 0 or page_num == total_pages:
                    print(f"Processing page {page_num}/{total_pages}...")

                page_text = page.extract_text()
                if page_text:
                    word_count = len(page_text.split())
                    total_words += word_count
                    pages_content.append({
                        "page": page_num,
                        "text": page_text,
                        "word_count": word_count
                    })

        elapsed = time.time() - start_time
        print(f"\nExtracted {total_words:,} words from {len(pages_content)} pages in {elapsed:.1f} seconds")
        return pages_content, total_words

    except Exception as e:
        print(f"\nError processing PDF: {str(e)}")
        return [], 0

def is_valid_keyword(word: str) -> bool:
    """Check if a word is valid for keyword inclusion"""
    if len(word) <= 2:
        return False

    invalid_conditions = [
        word.lower() in nlp.Defaults.stop_words,
        any(char.isdigit() for char in word),
        any(char in string.punctuation for char in word),
        not word.isascii()
    ]

    return not any(invalid_conditions)

def analyze_content(pages_content: List[Dict]) -> Dict:
    """Analyze content and categorize into places, people, and keywords"""
    print("\nAnalyzing and categorizing content...")
    start_time = time.time()

    results = {
        'places': defaultdict(list),
        'people': defaultdict(list),
        'keywords': defaultdict(list),
        'word_counts': Counter(),
        'all_words': set(),
        'processed_words': 0
    }

    try:
        for i, page in enumerate(pages_content, 1):
            page_num = page["page"]
            text = page["text"]

            if i % 10 == 0 or i == len(pages_content):
                print(f"Analyzing page {i}/{len(pages_content)}...")

            doc = nlp(text)
            results['processed_words'] += len([t for t in doc if t.is_alpha])

            # Process entities and keywords
            for ent in doc.ents:
                if ent.label_ == "GPE":  # Geographical locations
                    results['places'][ent.text.lower()].append(page_num)
                elif ent.label_ == "PERSON":  # People names
                    results['people'][ent.text.lower()].append(page_num)

            # Process keywords
            for token in doc:
                if token.is_alpha and not token.is_stop:
                    word = token.text.lower()
                    results['all_words'].add(word)
                    results['word_counts'][word] += 1
                    if is_valid_keyword(word):
                        results['keywords'][word].append(page_num)

        # Prepare final categorized results
        def prepare_category(data: Dict, min_count: int = 2) -> List[Tuple]:
            return sorted(
                [(word.capitalize(), len(pages), sorted(set(pages)))
                 for word, pages in data.items()
                 if len(pages) >= min_count],
                key=lambda x: (-x[1], x[0])
            )

        categorized = {
            'places': prepare_category(results['places']),
            'people': prepare_category(results['people']),
            'keywords': prepare_category(results['keywords'], min_count=3)[:1000],  # Top 1000 keywords
            'unique_word_count': len(results['all_words']),
            'processed_word_count': results['processed_words'],
            'total_page_count': len(pages_content)
        }

        elapsed = time.time() - start_time
        print(f"\nAnalysis completed in {elapsed:.1f} seconds")
        print(f"Found {len(categorized['places'])} places, {len(categorized['people'])} people, and {len(categorized['keywords'])} keywords")

        return categorized

    except Exception as e:
        print(f"\nError during analysis: {str(e)}")
        return {}

def print_categorized_results(results: Dict):
    """Print categorized results to the cell"""
    if not results:
        print("No results to display")
        return

    print("\n" + "="*50)
    print("CATEGORIZED RESULTS".center(50))
    print("="*50)

    print(f"\nTotal Unique Words: {results['unique_word_count']:,}")
    print(f"Total Processed Words: {results['processed_word_count']:,}")
    print(f"Total Pages: {results['total_page_count']}\n")

    def print_category(name: str, items: List[Tuple], max_items: int = 20):
        print(f"\n{name.upper()} ({len(items)} total)")
        print("-"*50)
        for i, (item, count, pages) in enumerate(items[:max_items], 1):
            print(f"{i:3}. {item:<20} (appears {count:>3} times) - Pages: {', '.join(map(str, pages))}")
        if len(items) > max_items:
            print(f"... and {len(items)-max_items} more {name.lower()}")

    print_category("PLACES", results['places'])
    print_category("PEOPLE", results['people'])
    print_category("KEYWORDS", results['keywords'])

def save_full_results(results: Dict, filename: str = "categorized_index.txt"):
    """Save complete categorized results to a text file"""
    if not results:
        print("No results to save")
        return

    print(f"\nSaving complete results to {filename}...")
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("="*50 + "\n")
            f.write("CATEGORIZED BOOK INDEX\n".center(50) + "\n")
            f.write("="*50 + "\n\n")

            f.write(f"Total Unique Words: {results['unique_word_count']:,}\n")
            f.write(f"Total Processed Words: {results['processed_word_count']:,}\n")
            f.write(f"Total Pages Analyzed: {results['total_page_count']}\n\n")

            def write_category(f, name: str, items: List[Tuple]):
                f.write(f"\n{'='*50}\n{name.upper()} ({len(items)} total)\n{'='*50}\n")
                for i, (item, count, pages) in enumerate(items, 1):
                    f.write(f"{i:4}. {item:<25} (appears {count:>3} times) - Pages: {', '.join(map(str, pages))}\n")

            write_category(f, "PLACES", results['places'])
            write_category(f, "PEOPLE", results['people'])
            write_category(f, "KEYWORDS", results['keywords'])

        print("Results saved successfully")
    except Exception as e:
        print(f"Error saving file: {str(e)}")

if __name__ == "__main__":
    try:
        # Step 1: Extract text
        pages_content, total_doc_words = extract_text_with_page_numbers("file.pdf")
        if not pages_content:
            print("No content extracted - exiting")
            exit()

        # Step 2: Analyze and categorize content
        results = analyze_content(pages_content)
        if not results:
            print("No analysis results - exiting")
            exit()

        # Step 3: Display and save results
        print_categorized_results(results)
        save_full_results(results)

    except KeyboardInterrupt:
        print("\nProcess interrupted by user")
    except Exception as e:
        print(f"\nError in main execution: {str(e)}")

Extracting text from PDF...
Processing page 10/580...
Processing page 20/580...
Processing page 30/580...
Processing page 40/580...
Processing page 50/580...
Processing page 60/580...
Processing page 70/580...
Processing page 80/580...
Processing page 90/580...
Processing page 100/580...
Processing page 110/580...
Processing page 120/580...
Processing page 130/580...
Processing page 140/580...
Processing page 150/580...
Processing page 160/580...
Processing page 170/580...
Processing page 180/580...
Processing page 190/580...
Processing page 200/580...
Processing page 210/580...
Processing page 220/580...
Processing page 230/580...
Processing page 240/580...
Processing page 250/580...
Processing page 260/580...
Processing page 270/580...
Processing page 280/580...
Processing page 290/580...
Processing page 300/580...
Processing page 310/580...
Processing page 320/580...
Processing page 330/580...
Processing page 340/580...
Processing page 350/580...
Processing page 360/580...
Processin

In [21]:
import pdfplumber
import spacy
from collections import defaultdict, Counter
import string
import time
from typing import List, Dict, Tuple, Set

# Load English model with optimized pipeline
nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp.max_length = 3_000_000  # Increased max length for large documents

def extract_text_with_page_numbers(pdf_path: str) -> Tuple[List[Dict], int]:
    """Extract text with page number tracking with progress updates"""
    pages_content = []
    total_words = 0
    print("Extracting text from PDF...")
    start_time = time.time()

    try:
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                if page_num % 10 == 0 or page_num == total_pages:
                    print(f"Processing page {page_num}/{total_pages}...")

                page_text = page.extract_text()
                if page_text:
                    word_count = len(page_text.split())
                    total_words += word_count
                    pages_content.append({
                        "page": page_num,
                        "text": page_text,
                        "word_count": word_count
                    })

        elapsed = time.time() - start_time
        print(f"\nExtracted {total_words:,} words from {len(pages_content)} pages in {elapsed:.1f} seconds")
        return pages_content, total_words

    except Exception as e:
        print(f"\nError processing PDF: {str(e)}")
        return [], 0

def is_valid_keyword(word: str) -> bool:
    """Check if a word is valid for keyword inclusion"""
    if len(word) <= 2:
        return False

    invalid_conditions = [
        word.lower() in nlp.Defaults.stop_words,
        any(char.isdigit() for char in word),
        any(char in string.punctuation for char in word),
        not word.isascii()
    ]

    return not any(invalid_conditions)

def analyze_content(pages_content: List[Dict]) -> Dict:
    """Analyze content and categorize into places, people, and keywords with enhanced extraction"""
    print("\nAnalyzing and categorizing content...")
    start_time = time.time()

    results = {
        'places': defaultdict(list),
        'people': defaultdict(list),
        'keywords': defaultdict(list),
        'word_counts': Counter(),
        'all_words': set(),
        'processed_words': 0
    }

    try:
        for i, page in enumerate(pages_content, 1):
            page_num = page["page"]
            text = page["text"]

            if i % 10 == 0 or i == len(pages_content):
                print(f"Analyzing page {i}/{len(pages_content)}...")

            doc = nlp(text)
            results['processed_words'] += len([t for t in doc if t.is_alpha])

            # Enhanced entity processing
            for ent in doc.ents:
                normalized_text = ent.text.lower().strip()
                if not normalized_text:
                    continue

                if ent.label_ in ["GPE", "LOC"]:  # Geographical locations
                    results['places'][normalized_text].append(page_num)
                elif ent.label_ == "PERSON":  # People names
                    # Split multi-word names and filter out titles
                    parts = [p for p in normalized_text.split()
                            if p not in ['mr', 'mrs', 'ms', 'dr', 'prof']]
                    if parts:
                        name = ' '.join(parts)
                        results['people'][name].append(page_num)

            # Enhanced keyword processing
            for token in doc:
                if token.is_alpha and not token.is_stop:
                    word = token.lemma_.lower().strip()
                    if word:
                        results['all_words'].add(word)
                        results['word_counts'][word] += 1
                        if is_valid_keyword(word):
                            results['keywords'][word].append(page_num)

        # Prepare final categorized results with enhanced filtering
        def prepare_category(data: Dict, min_count: int = 2, max_items: int = None) -> List[Tuple]:
            # Sort by frequency then alphabetically
            sorted_items = sorted(
                [(word, len(pages), sorted(set(pages)))
                 for word, pages in data.items()
                 if len(pages) >= min_count],
                key=lambda x: (-x[1], x[0])
            )
            return sorted_items[:max_items] if max_items else sorted_items

        categorized = {
            'places': prepare_category(results['places'], min_count=2),  # All places meeting min count
            'people': prepare_category(results['people'], min_count=2),  # All people meeting min count
            'keywords': prepare_category(results['keywords'], min_count=3, max_items=1000),  # Top 1000 keywords
            'unique_word_count': len(results['all_words']),
            'processed_word_count': results['processed_words'],
            'total_page_count': len(pages_content)
        }

        elapsed = time.time() - start_time
        print(f"\nAnalysis completed in {elapsed:.1f} seconds")
        print(f"Found {len(categorized['places'])} places, {len(categorized['people'])} people, and {len(categorized['keywords'])} keywords")

        return categorized

    except Exception as e:
        print(f"\nError during analysis: {str(e)}")
        return {}

def print_categorized_results(results: Dict):
    """Print categorized results to the cell with enhanced display"""
    if not results:
        print("No results to display")
        return

    print("\n" + "="*50)
    print("ENHANCED CATEGORIZED RESULTS".center(50))
    print("="*50)

    print(f"\nTotal Unique Words: {results['unique_word_count']:,}")
    print(f"Total Processed Words: {results['processed_word_count']:,}")
    print(f"Total Pages: {results['total_page_count']}\n")

    def print_category(name: str, items: List[Tuple], max_items: int = None):
        max_display = max_items if max_items else len(items)
        print(f"\n{name.upper()} ({len(items)} total)")
        print("-"*50)
        for i, (item, count, pages) in enumerate(items[:max_display], 1):
            print(f"{i:4}. {item:<25} (appears {count:>3} times) - Pages: {', '.join(map(str, pages))}")
        if len(items) > max_display:
            print(f"... and {len(items)-max_display} more {name.lower()}")

    print_category("PLACES", results['places'])
    print_category("PEOPLE", results['people'])
    print_category("KEYWORDS", results['keywords'], max_items=50)  # Show top 50 keywords in console

def save_full_results(results: Dict, filename: str = "enhanced_categorized_index.txt"):
    """Save complete categorized results to a text file with all items"""
    if not results:
        print("No results to save")
        return

    print(f"\nSaving complete results to {filename}...")
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("="*50 + "\n")
            f.write("ENHANCED CATEGORIZED BOOK INDEX\n".center(50) + "\n")
            f.write("="*50 + "\n\n")

            f.write(f"Total Unique Words: {results['unique_word_count']:,}\n")
            f.write(f"Total Processed Words: {results['processed_word_count']:,}\n")
            f.write(f"Total Pages Analyzed: {results['total_page_count']}\n\n")

            def write_category(f, name: str, items: List[Tuple]):
                f.write(f"\n{'='*50}\n{name.upper()} ({len(items)} total)\n{'='*50}\n")
                for i, (item, count, pages) in enumerate(items, 1):
                    f.write(f"{i:5}. {item:<30} (appears {count:>3} times) - Pages: {', '.join(map(str, pages))}\n")

            write_category(f, "PLACES", results['places'])
            write_category(f, "PEOPLE", results['people'])
            write_category(f, "KEYWORDS", results['keywords'])

        print(f"Results saved successfully to {filename}")
    except Exception as e:
        print(f"Error saving file: {str(e)}")

if __name__ == "__main__":
    try:
        # Step 1: Extract text
        pages_content, total_doc_words = extract_text_with_page_numbers("file.pdf")
        if not pages_content:
            print("No content extracted - exiting")
            exit()

        # Step 2: Analyze and categorize content
        results = analyze_content(pages_content)
        if not results:
            print("No analysis results - exiting")
            exit()

        # Step 3: Display and save results
        print_categorized_results(results)
        save_full_results(results)

    except KeyboardInterrupt:
        print("\nProcess interrupted by user")
    except Exception as e:
        print(f"\nError in main execution: {str(e)}")

Extracting text from PDF...
Processing page 10/580...
Processing page 20/580...
Processing page 30/580...
Processing page 40/580...
Processing page 50/580...
Processing page 60/580...
Processing page 70/580...
Processing page 80/580...
Processing page 90/580...
Processing page 100/580...
Processing page 110/580...
Processing page 120/580...
Processing page 130/580...
Processing page 140/580...
Processing page 150/580...
Processing page 160/580...
Processing page 170/580...
Processing page 180/580...
Processing page 190/580...
Processing page 200/580...
Processing page 210/580...
Processing page 220/580...
Processing page 230/580...
Processing page 240/580...
Processing page 250/580...
Processing page 260/580...
Processing page 270/580...
Processing page 280/580...
Processing page 290/580...
Processing page 300/580...
Processing page 310/580...
Processing page 320/580...
Processing page 330/580...
Processing page 340/580...
Processing page 350/580...
Processing page 360/580...
Processin