In [1]:
import requests
import pdfplumber

In [2]:
def download_pdf(url, output_path, timeout=30):
    """Download a PDF file from the given URL.

    Args:
        url: Address of the PDF to fetch with an HTTP GET request.
        output_path: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on a stalled connection.

    Returns:
        True if the server answered with status 200 and the file was written,
        False otherwise (no file is created or modified in that case).
    """
    response = requests.get(url, timeout=timeout)

    # Guard clause: report the failure and let the caller react to it.
    if response.status_code != 200:
        print(f"Failed to download PDF. Status code: {response.status_code}")
        return False

    with open(output_path, 'wb') as file:
        file.write(response.content)
    print(f"PDF downloaded successfully to {output_path}")
    return True

In [3]:
def extract_text_and_tables(pdf_path, output_txt_path):
    """Extract text and tables from a PDF file and save them into a text file.

    For every page the output receives a "Page N" header, the page text (or a
    placeholder when none is found), each detected table as tab-separated
    rows, and finally a separator line of 50 "=" characters.

    Args:
        pdf_path: Path of the PDF to read; handed to ``pdfplumber.open``.
        output_txt_path: Path of the UTF-8 text file to create or overwrite.
    """
    with pdfplumber.open(pdf_path) as pdf, open(output_txt_path, 'w', encoding='utf-8') as output_file:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            tables = page.extract_tables()

            # Write extracted text
            output_file.write(f"Page {page_num}\n")
            output_file.write("Text:\n")
            output_file.write(text if text else "No text found on this page.\n")
            output_file.write("\n")

            # Write extracted tables
            if tables:
                output_file.write("Tables:\n")
                for table in tables:
                    for row in table:
                        # Empty table cells come back as None, which str.join
                        # rejects with a TypeError; write them as empty fields.
                        cells = ("" if cell is None else str(cell) for cell in row)
                        output_file.write("\t".join(cells) + "\n")
                    output_file.write("\n")
            else:
                output_file.write("No tables found on this page.\n")
            output_file.write("\n" + "="*50 + "\n")
    print(f"Text and tables saved to {output_txt_path}")

In [4]:
# Notebook configuration: the remote document and the local files it produces.
pdf_url = (
    "https://www.vistajet.com/globalassets/documents/jettravelerreport.pdf"
)  # Point this at the PDF you want to process
pdf_path = "sample.pdf"  # Local copy of the downloaded PDF
output_txt_path = "pdfplumber-test.txt"  # Destination for the extracted text and tables

In [None]:
# Fetch the remote PDF and store it at the configured local path
download_pdf(url=pdf_url, output_path=pdf_path)

In [None]:
# Run the extraction over the downloaded PDF and write the results to the text file
extract_text_and_tables(pdf_path=pdf_path, output_txt_path=output_txt_path)