# Code

In [7]:
import fitz
import os
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One ``str`` holding each page's ``get_text()`` output joined with no
        separator. A document with no pages yields an empty string.

    Raises:
        Nothing is caught here: any error raised by ``fitz.open`` or
        ``get_text`` propagates to the caller.
    """
    # The context manager closes the document (and its underlying file handle)
    # even when a page fails to extract; without it every processed PDF would
    # leave a handle open for the lifetime of the kernel.
    with fitz.open(pdf_path) as document:
        # join() assembles the result once instead of re-copying a growing
        # string for every page.
        return "".join(page.get_text() for page in document)

def extract_text_from_pdfs_in_directory(directory_path):
    """Extract the text of every PDF that sits directly inside a directory.

    Sub-directories are not searched. Files are recognised by a ``.pdf``
    extension, compared case-insensitively.

    Args:
        directory_path: Directory to scan.

    Returns:
        A list of dicts, one per PDF, sorted by file name. Each dict has the
        keys ``"File Name"`` (the bare file name) and ``"Extracted Text"``
        (the result of ``extract_text_from_pdf`` for that file).

    Raises:
        Errors from ``os.listdir`` (e.g. a missing directory) and from
        ``extract_text_from_pdf`` propagate unchanged.
    """
    pdf_texts = []
    # os.listdir returns names in arbitrary order; sorting keeps the output
    # rows in the same order from one run to the next.
    for filename in sorted(os.listdir(directory_path)):
        pdf_path = os.path.join(directory_path, filename)
        # lower(): "REPORT.PDF" must not be silently skipped.
        # isfile(): a folder whose name happens to end in ".pdf" must not be
        # handed to the PDF reader.
        if filename.lower().endswith(".pdf") and os.path.isfile(pdf_path):
            text = extract_text_from_pdf(pdf_path)
            pdf_texts.append({"File Name": filename, "Extracted Text": text})
    return pdf_texts

def save_texts_to_excel(pdf_texts, excel_path):
    """Write the extracted records to an Excel file.

    Args:
        pdf_texts: Records to tabulate; handed as-is to ``pd.DataFrame``.
            Callers in this notebook pass a list of dicts.
        excel_path: Destination path of the workbook.

    Returns:
        None. The only effect is the file written at ``excel_path``.
    """
    # index=False keeps the DataFrame's row index out of the sheet.
    pd.DataFrame(pdf_texts).to_excel(excel_path, index=False)


# Result

In [8]:
# --- where to read from and where to write to ---
# folder holding the PDF reports (absolute path on the author's machine)
directory_path = "/Users/loowenwen/Downloads/2024 Reports"

# the workbook lands in the directory the kernel is currently running in
current_directory = os.getcwd()
excel_path = os.path.join(current_directory, "extracted_texts.xlsx")

# --- run the pipeline ---
# pull the text out of the PDFs in directory_path, then dump it to Excel
extracted_texts = extract_text_from_pdfs_in_directory(directory_path)
save_texts_to_excel(extracted_texts, excel_path)

print(f"Extracted texts have been saved to {excel_path}")

Extracted texts have been saved to /Users/loowenwen/Desktop/Visual Code Studio/jtc-internship/9. consultancy reports/extracted_texts.xlsx


# Further Refining the Code

In [14]:
import fitz  # PyMuPDF
import os
import pandas as pd
import re
from unidecode import unidecode

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One ``str`` holding each page's ``get_text()`` output joined with no
        separator. A document with no pages yields an empty string.

    Raises:
        Nothing is caught here: any error raised by ``fitz.open`` or
        ``get_text`` propagates to the caller.
    """
    # Walking a whole directory tree opens many PDFs; the context manager
    # guarantees each document (and its file handle) is closed before the next
    # one is opened, even if extraction of a page raises.
    with fitz.open(pdf_path) as document:
        # join() assembles the result once instead of re-copying a growing
        # string for every page.
        return "".join(page.get_text() for page in document)

def clean_text(text):
    """Reduce extracted text to printable ASCII so it can be written to Excel.

    Steps, in order:
      1. ``unidecode`` transliterates the text.
      2. Every run of characters still outside the ASCII range becomes a
         single space.
      3. Every control character (``\\x00``-``\\x1F``) and DEL (``\\x7F``)
         becomes a space, one for one. Note that this range includes tab,
         newline and carriage return, so line breaks are flattened to spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    result = unidecode(text)
    # Applied in sequence: first leftover non-ASCII runs, then control chars.
    for pattern in (r'[^\x00-\x7F]+', r'[\x00-\x1F\x7F]'):
        result = re.sub(pattern, ' ', result)
    return result

def extract_text_from_pdfs_in_directory(directory_path):
    """Extract and clean the text of every PDF under a directory tree.

    The tree is walked recursively with ``os.walk``. Files are recognised by
    a ``.pdf`` extension, compared case-insensitively.

    Args:
        directory_path: Root directory to scan.

    Returns:
        A list of dicts, one per PDF, with the keys ``"File Name"`` (bare
        file name), ``"Extracted Text"`` (``clean_text`` applied to the
        output of ``extract_text_from_pdf``) and ``"Path"`` (the file's path
        relative to ``directory_path``). Within each folder the files are in
        sorted name order, and sub-folders are visited in sorted name order.

    Raises:
        Errors from ``extract_text_from_pdf`` and ``clean_text`` propagate
        unchanged; one unreadable PDF stops the whole scan.
    """
    pdf_texts = []
    for root, dirs, files in os.walk(directory_path):
        # os.walk descends into `dirs` in whatever order the list is left in;
        # sorting it in place makes the traversal order reproducible.
        dirs.sort()
        for filename in sorted(files):
            # lower(): "REPORT.PDF" must not be silently skipped.
            if not filename.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(root, filename)
            pdf_texts.append({
                "File Name": filename,
                "Extracted Text": clean_text(extract_text_from_pdf(pdf_path)),
                "Path": os.path.relpath(pdf_path, directory_path),
            })
    return pdf_texts

def save_texts_to_excel(pdf_texts, excel_path):
    """Write the extracted records to an Excel file.

    Args:
        pdf_texts: Records to tabulate; handed as-is to ``pd.DataFrame``.
            Callers in this notebook pass a list of dicts.
        excel_path: Destination path of the workbook.

    Returns:
        None. The only effect is the file written at ``excel_path``.
    """
    table = pd.DataFrame(pdf_texts)
    # index=False keeps the DataFrame's row index out of the sheet.
    table.to_excel(excel_path, index=False)

# --- where to read from and where to write to ---
# root folder of the PDF reports (absolute path on the author's machine)
directory_path = "/Users/loowenwen/Desktop/JTC Internship/Industrial Reports"

# the workbook lands in the directory the kernel is currently running in
current_directory = os.getcwd()
excel_path = os.path.join(current_directory, "extracted_texts.xlsx")

# --- run the pipeline ---
# pull and clean the text of the PDFs under directory_path (sub-folders
# included), then dump it to Excel
extracted_texts = extract_text_from_pdfs_in_directory(directory_path)
save_texts_to_excel(extracted_texts, excel_path)

print(f"Extracted texts have been saved to {excel_path}")

Extracted texts have been saved to /Users/loowenwen/Desktop/Visual Code Studio/jtc-internship/9. consultancy reports/extracted_texts.xlsx
