# Importing the Necessary Libraries

The first step is to import the libraries this extractor needs. The tool extracts text from PDFs, image-only PDFs, and web pages reached by URL, so a variety of libraries is required to accommodate all of these sources. Each of these packages must be installed before running the code for it to function properly.

In [1]:
import fitz  # PyMuPDF
import spacy
from docx import Document
import easyocr
import requests
from bs4 import BeautifulSoup
import numpy as np
import warnings
from bs4 import Tag
from PIL import Image, ImageEnhance
from io import BytesIO
import cv2

# Disabling User Warnings
The second step is to disable user warnings, though this is done out of personal preference. There are benefits to keeping them enabled as well, so if you wish to keep them active, simply skip this step.

In [2]:
# Ignore every warning of category UserWarning (optional; skip this cell to keep them visible).
warnings.simplefilter(action='ignore', category=UserWarning)

# Defining Our Functions
The third step is to define our functions. Ideally, we want our code to execute the same way every time. Therefore, the best way to do this is to wrap bits of our code in such a way that it does the same thing each time it is needed/called upon. This, in essence, is what defining our functions allows us to do.

### Function that Checks Pages for Text-Only Content

In [3]:
def is_text_only_page(page):
    """Return True when the page reports no embedded images.

    `page` must provide ``get_images(full=True)``; an empty result means the
    page is treated as text-only.
    """
    embedded_images = page.get_images(full=True)
    if embedded_images:
        return False
    return True

### Function that Checks Pages for Image Content

In [4]:
def is_image_page(page):
    """Return True when the page reports at least one embedded image.

    `page` must provide ``get_images(full=True)``; any non-empty result
    counts as "has images".
    """
    embedded_images = page.get_images(full=True)
    return True if embedded_images else False

### Function that Converts PDF Content to Text

In [5]:
def pdf_to_text(pdf_path):
    """Extract the text layer of a PDF and report whether it embeds images.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A ``(text, has_images)`` tuple: the text of all pages concatenated
        in page order, and True if at least one page contains an image.
    """
    page_texts = []
    has_images = False

    # The context manager closes the document even if a page fails to parse,
    # so the file handle is never left open.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_texts.append(page.get_text())

            # Stop probing for images once the first one has been found.
            if not has_images and is_image_page(page):
                has_images = True

    return "".join(page_texts), has_images

### Function that Extracts Images from the PDF if Detected

In [6]:
def extract_images_from_pdf(pdf_path):
    """Collect the image entries reported by every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list holding, in page order, every entry returned by
        ``page.get_images(full=True)``. These are the entries as PyMuPDF
        reports them (image references, not decoded pixel data).
    """
    images = []

    # The context manager guarantees the document is closed on any error.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            images.extend(page.get_images(full=True))

    return images

### Function that Preprocesses Extracted Images Using OpenCV

In [7]:
def preprocess_image_opencv(image, resize_factor=10, desired_dpi=300):
    """Prepare an image for OCR: upscale, binarize, close small gaps, invert.

    Args:
        image: Image array accepted by OpenCV. Multi-channel input is
            converted to grayscale with ``cv2.COLOR_BGR2GRAY``; input that
            is already single-channel is used as is.
        resize_factor: Scale factor applied to both axes before processing.
        desired_dpi: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        A single-channel image that has been Otsu-thresholded,
        morphologically closed and finally inverted.
    """
    # Upscale first so small glyphs cover more pixels.
    resized_image = cv2.resize(image, None, fx=resize_factor, fy=resize_factor, interpolation=cv2.INTER_CUBIC)

    # Convert to grayscale. cvtColor rejects single-channel input, so an
    # image that is already grayscale skips the conversion.
    if resized_image.ndim == 3 and resized_image.shape[2] > 1:
        gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)
    else:
        gray_image = resized_image.reshape(resized_image.shape[:2])

    # Gaussian blur to reduce noise before thresholding.
    blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)

    # Global binarization; THRESH_OTSU chooses the threshold automatically,
    # so the 0 passed as threshold value is only a placeholder.
    _, threshold_image = cv2.threshold(blurred_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Morphological closing with a 3x3 kernel to remove small dark specks.
    kernel = np.ones((3, 3), np.uint8)
    processed_image = cv2.morphologyEx(threshold_image, cv2.MORPH_CLOSE, kernel, iterations=2)

    # Invert the binary image (255 - pixel).
    processed_image = cv2.subtract(255, processed_image)

    return processed_image

### Function that Extracts Text from these Images Using EasyOCR

In [8]:
# EasyOCR readers already created, keyed by their language tuple. Building a
# reader loads the recognition models, so it is done once and then reused.
_OCR_READERS = {}


def _get_ocr_reader(languages=('en',)):
    """Return the cached EasyOCR reader for *languages*, creating it on first use."""
    key = tuple(languages)
    if key not in _OCR_READERS:
        _OCR_READERS[key] = easyocr.Reader(list(key))
    return _OCR_READERS[key]


def extract_text_from_image(image):
    """Run OCR on an image and return the recognized text as one string.

    Args:
        image: Image array; it is passed through ``preprocess_image_opencv``
            before recognition.

    Returns:
        The recognized text fragments joined by single spaces, in the order
        EasyOCR reports them (empty string when nothing is recognized).
    """
    preprocessed_image = preprocess_image_opencv(image)

    # Reuse one English reader across calls instead of reloading the model
    # for every image.
    result = _get_ocr_reader().readtext(preprocessed_image)

    # Each result entry carries the recognized text at index 1.
    return ' '.join(entry[1] for entry in result)

### Function that Extracts Text from Websites Via URL

In [9]:
def extract_text_from_website(url, timeout=30):
    """Download a web page and return its headings, paragraphs and lists as text.

    Headings (h1-h6), paragraphs and list items are emitted one per line in
    the order they appear in the document. Items of ``ul`` lists are
    prefixed with a bullet, items of ``ol`` lists with their 1-based number,
    and nested lists are indented by two spaces per nesting level.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The extracted lines joined with newlines; an empty string when the
        page has no headings, paragraphs or lists.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    list_names = ['ul', 'ol']
    heading_names = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    # A timeout keeps an unresponsive host from blocking the run forever.
    # The HTTP status is not checked: an error page is parsed like any other
    # response.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    lines = []
    emitted_lists = set()  # id() of every list already written out
    emitted_items = set()  # id() of every list item already written out

    def item_text(item, list_tag):
        # Text of one list item without the text of lists nested inside it:
        # a string belongs to this item only if its nearest enclosing list
        # is the item's own list.
        pieces = []
        for string in item.strings:
            piece = string.strip()
            if piece and string.find_parent(list_names) is list_tag:
                pieces.append(piece)
        return ''.join(pieces)

    def emit_list(list_tag, level=0):
        # Write the items of one list, then recurse into lists nested
        # directly below each item with one more level of indentation.
        emitted_lists.add(id(list_tag))
        for index, item in enumerate(list_tag.find_all('li', recursive=False), start=1):
            emitted_items.add(id(item))
            marker = '\u2022 ' if list_tag.name == 'ul' else f'{index}. '
            lines.append(f"{'  ' * level}{marker}{item_text(item, list_tag)}")
            for nested in item.find_all(list_names):
                if nested.find_parent(list_names) is list_tag:
                    emit_list(nested, level + 1)

    # find_all returns the matches in document order, so walking them once
    # keeps the output in the order of appearance on the page.
    for element in soup.find_all(heading_names + ['p'] + list_names):
        if element.name in list_names:
            # Nested lists were already written while handling their parent.
            if id(element) not in emitted_lists:
                emit_list(element)
            continue

        # A heading or paragraph inside an emitted list item is already part
        # of that item's line; skip it to avoid duplicating its text.
        if any(id(parent) in emitted_items for parent in element.parents):
            continue

        text = element.get_text(strip=True)
        if text:
            lines.append(text)

    return '\n'.join(lines)

### Function that Processes Text Using spaCy

In [10]:
def process_text_with_spacy(text):
    """Run *text* through spaCy's ``en_core_web_sm`` pipeline and return the result.

    The pipeline is loaded on every call with its named entity recognition
    ("ner") component disabled.
    """
    pipeline = spacy.load("en_core_web_sm", disable=["ner"])
    return pipeline(text)

### Function that Extracts Text Processed Using spaCy

In [11]:
def extract_text_spacy(doc):
    """Return the plain text stored on a processed document (its ``text`` attribute)."""
    plain_text = doc.text
    return plain_text

### Function that Converts the Extracted Text to DOCX format

In [12]:
def text_to_docx(text, output_path="output.docx"):
    """Write *text* to a DOCX file, one paragraph per non-blank line.

    Args:
        text: Text to store; it is split on newlines and blank lines are
            skipped.
        output_path: Destination of the generated DOCX file.

    The first paragraph is created with the 'Body Text' style; every later
    paragraph uses the document's default paragraph style.
    """
    # Control characters other than tab, newline and carriage return cannot
    # be stored in the document XML and would make add_paragraph raise, so
    # they are dropped from each line.
    illegal_chars = dict.fromkeys(
        code for code in range(32) if code not in (9, 10, 13)
    )

    doc = Document()
    is_first_paragraph = True

    for raw_line in text.split('\n'):
        line = raw_line.translate(illegal_chars)
        if not line.strip():
            continue

        if is_first_paragraph:
            # 'Body Text' is the style's name; looking a style up by its id
            # ('BodyText') is deprecated in python-docx and emits a warning.
            doc.add_paragraph(line, style='Body Text')
            # A flag avoids rebuilding doc.paragraphs for every line.
            is_first_paragraph = False
        else:
            doc.add_paragraph(line)

    doc.save(output_path)

# Text to DOCX Converter
Finally, we have the code that allows text from PDFs, image-only PDFs, and websites to be extracted and written to a Word document in DOCX format. In this notebook, I will demonstrate how to extract text from a PDF simply by running this code. An everyday use case is when information from a massive multi-page PDF must be copied in order to generate reports in an application like Microsoft Word. Normally, copying text this way involves highlighting the text and then copying and pasting it into a Word document of your own creation. However, for larger documents with 100+ pages, for example, there is a risk of one's grip on the left mouse button slipping and thus losing the entire selection. This part of the code was created to save the time spent copying and pasting content from highlightable PDFs and to eliminate that frustration. Running it on a local machine is also helpful when dealing with confidential documents.

Using this converter is simple, as all one has to do is load the path to their PDF in the "pdf_path" variable. For this demonstration, I have selected the textdemo.pdf file via the path it exists in on my local machine. Next, one simply must define the "output_path" as the location they want their outputted DOCX file to be in. In my case, I named it "textdemo.docx" and decided to store it in the same folder as my original PDF. Also worth noting, when not using the Website extraction feature, it is best to just keep the path to that assigned as 'https://none.com'.

With all of that in mind, simply run the code and the text from the PDF is extracted to a DOCX file. In this case, the original PDF contains text that reads, "This PDF contains text for demonstration."

In [13]:
if __name__ == "__main__":
    # NOTE: these are raw strings, so the doubled backslashes are kept
    # literally as part of the path.
    pdf_path = r'C:\\projects\\textdemo.pdf'  # Change this to your PDF file path
    website_url = 'https://none.com'  # Change this to the desired website URL in https://example.com format if you wish to extract text from websites

    # Start with the text layer of the PDF; every further source is appended
    # as its own part and the parts are joined with newlines at the end.
    pdf_text, has_images = pdf_to_text(pdf_path)
    text_parts = [pdf_text]

    # Add the text of the website (skipped when website_url is empty).
    if website_url:
        website_text = extract_text_from_website(website_url)
        text_parts.append(website_text)

    if has_images:
        # OCR every page that contains an image. The context manager closes
        # the PDF even if rendering or OCR fails part-way through.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                if not is_image_page(page):
                    continue
                # Render the page and view its raw samples as a
                # (height, width, channels) uint8 array for OpenCV.
                image = page.get_pixmap()
                pixels = np.frombuffer(image.samples, dtype=np.uint8).reshape((image.h, image.w, image.n))
                image_text = extract_text_from_image(pixels)
                text_parts.append(image_text)

    extracted_text = "\n".join(text_parts)

    # Process the text with spaCy
    spacy_doc = process_text_with_spacy(extracted_text)
    final_extracted_text = extract_text_spacy(spacy_doc)

    # Output to Word document
    output_path = r'C:\\projects\\textdemo.docx'  # Change this to your desired output path
    text_to_docx(final_extracted_text, output_path)

    print(f"Text extracted and saved to {output_path}")

Text extracted and saved to C:\\projects\\textdemo.docx


And now so does the DOCX file.

# Reading the Extracted Text Without Opening the Document
As a bonus, I wanted to write some code that may be used to print the extracted text for exemplary purposes. Good reasons to do this may be that you simply want to examine the contents of the extracted DOCX file without the need to actually open it. This is useful if you happen to be running this on a machine that does not have Microsoft Word installed. By that logic, you could use this code to validate its contents and then email the DOCX file to a machine that has the necessary software to edit it. For a scenario like this, I recommend using Google Docs as DOCX files can be uploaded there if needed.

In [14]:
# Print the in-memory `extracted_text` built by the converter cell above
# (the saved DOCX file itself is not opened or read here).
print("Extracted Text:", extracted_text)

Extracted Text: This PDF contains text for demonstration. 
 


