In [1]:
from openai import OpenAI
from IPython.display import display
from IPython.display import Image as ip_image
import requests
import base64
import requests
import re
import os
import concurrent.futures

import fitz  # PyMuPDF
from glob import glob
import pytesseract
from PIL import Image

In [2]:
def display_image_from_url(url, timeout=30):
    """Download an image and render it inline with IPython's ``display``.

    :param url: address of the image to fetch
    :param timeout: seconds to wait for the server before giving up;
        without a timeout ``requests`` can block indefinitely
    :return: None. Prints a failure message when the server does not
        answer with HTTP 200.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is treated as a failed download.
    if response.status_code != 200:
        print("Failed to retrieve the image.")
        return

    display(ip_image(response.content))

def save_image(url, save_path, timeout=30):
    """Download an image and write its bytes to ``save_path``.

    :param url: address of the image to fetch
    :param save_path: destination file; overwritten if it already exists
    :param timeout: seconds to wait for the server before giving up;
        without a timeout ``requests`` can block indefinitely
    :return: None. Nothing is written when the server does not answer
        with HTTP 200; a failure message is printed instead.
    """
    response = requests.get(url, timeout=timeout)

    # Report the failure the same way display_image_from_url does instead
    # of returning silently with no file written.
    if response.status_code != 200:
        print("Failed to retrieve the image.")
        return

    with open(save_path, 'wb') as file:
        file.write(response.content)


def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at ``image_path`` and return the text.

    The image file is opened with Pillow and released again as soon as
    recognition has finished.
    """
    with Image.open(image_path) as page_image:
        return pytesseract.image_to_string(page_image)

def cost(response):
    """Print the estimated dollar cost of a chat completion.

    Reads ``completion_tokens`` and ``prompt_tokens`` from
    ``response.usage`` and applies the hard-coded rates of $0.03 per 1000
    completion tokens and $0.01 per 1000 prompt tokens. Returns None.
    """
    usage = response.usage
    completion_cost = usage.completion_tokens*0.03/1000
    prompt_cost = usage.prompt_tokens*.01/1000
    total = completion_cost + prompt_cost
    print(f'Cost: ${total:.2f}')

# Anchored, case-insensitive pattern for an http(s)/ftp(s) URL.
_URL_PATTERN = re.compile(
    r'^(?:http|ftp)s?://' # scheme: http, https, ftp or ftps
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # dotted host name
    r'localhost|' # literal localhost
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|' # dotted-quad IPv4
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # IPv6, optionally bracketed
    r'(?::\d+)?' # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def identify_url(s):
    """Classify ``s`` as a URL or as an existing local file.

    :return: ``True`` when ``s`` looks like a URL, ``False`` when it is
        the path of an existing file, and ``None`` when it is neither.
    """
    if _URL_PATTERN.match(s):
        return True
    if os.path.isfile(s):
        return False
    # Neither a URL nor an existing file.
    return None


def encode_image(image_path):
    """Return the contents of ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


# Prose version of the system prompt for the vision model.
# NOTE: this value is never used at runtime -- `instructions` is reassigned
# by the very next statement, before `ocr_image` reads it.
instructions = '''
Your role is to take images of text from books, convert them into markdown format enclosed within a markdown code block, and list any corrections made during the process.
Include all relevant text.
Line breaks within paragraphs should be replaced with spaces, combining words split by hyphens. 
Ensure that all tables are recreated accurately in markdown.
Include all photo captions.
When presenting the markdown-formatted text, it should always use triple backticks (```) at the start and end to encase the markdown content in a code block.
It should use '*' for italics, '#' and '##' for headers, avoid smart quotes, and use '---' with a space on each side for em dashes.
Combine texts if split across multiple columns or images, especially if it splits a word, sentence or paragarph.
You should also correct any likely errors in the text, emphasizing accuracy in text recognition and markdown formatting, and being cautious about altering the original language of the text. 
If you encounter any words that are not clear due to the quality of the image, make a best guess and annotate it with a question mark in brackets [?]
When converting headlines in all caps, it should replace them with title case. 
Review your work carefully. This work will be widely-reproduced so accuracy is incredibly important. 
You should communicate in a helpful and precise manner, effectively meeting the user's needs for text conversion, error correction, and providing a summary of any corrections made.
'''

# System prompt passed as the 'system' message of every request made by
# `ocr_image`. It is written as pseudocode rather than prose and asks for
# the transcription inside a fenced code block, which `ocr_image` later
# extracts from the reply.
instructions = '''
// Define the role of GPT in processing images of text from books

GPT_Process_Text_Images:
    // Take images containing text or texts  from books
    input: sequence_of_text_images

    // Operations to perform on the sequence of text images
    for each image in sequence_of_text_images:
        convert image to text // OCR (Optical Character Recognition)
        append converted text to combined_text // Continuation of text

    // Operations to perform on the text images
    for each image in images_of_text:
        convert image to text // OCR (Optical Character Recognition)
        format text in markdown:
            encase text in triple backticks (```) for code block
            use '*' for italics
            replace smart quotes with straight quotes
            use '---' with space on each side for em dashes
            use blank line between paragraphs
            correct any likely errors in text // emphasize accuracy
            for headlines in all caps:
                convert to title case
                Use  # for header or ## for subheader
            for text segments in all caps:
                format as bold in markdown
            if text contains table-like data:
                format as markdown table:
                    use '|' to separate columns
                    use '-' for header separators
                    align text as per original table layout
        // If text clarity or markdown formatting is uncertain
        if text_is_unclear or formatting_requirements_ambiguous:
            request clarification from user
        // If text is the running header on a page or the page number
              do not output.

        // Present the formatted markdown text
        output formatted_text in markdown code block

        // List any corrections or commissions  made during the process
        list corrections_made

    // Ensure communication is helpful, precise, and meets user needs
    communicate_effectively

// End of GPT processing function
'''

def _extract_markdown_block(md):
    """Return the contents of the first ``` fenced block in ``md``.

    Falls back to the whole reply when it contains no fence at all, and
    tolerates a missing closing fence (e.g. a reply cut off by the token
    limit). An opening ``markdown``/``md`` language tag is dropped.
    """
    parts = md.split('```')
    if len(parts) < 2:
        # No code fence: keep everything rather than failing.
        return md

    body = parts[1]
    first_line, _, rest = body.partition('\n')
    # Only strip the fence's own language tag, never text further down.
    if first_line.strip().lower() in ('', 'markdown', 'md'):
        body = rest
    return body


def ocr_image(fn_url_or_urls, message='Transcribe this article into Markdown.', out_fn='text.md'):
    """Transcribe one or more page images to Markdown with the OpenAI vision model.

    Every image location is either a URL, which is passed to the API as is,
    or a local file path, which is sent inline as a base64 data URL. The
    complete model reply is written next to ``out_fn`` with a ``.txt``
    extension; the contents of the first fenced code block of the reply
    are written to ``out_fn``.

    The API key is taken from the ``OPENAI_API_KEY`` environment variable.

    :param fn_url_or_urls: a single path/URL string, or an iterable of them
    :param message: user prompt sent together with the images
    :param out_fn: path of the Markdown file to write
    :return: the complete, unmodified model reply
    """
    import mimetypes  # only needed here; kept local so the block is self-contained

    if isinstance(fn_url_or_urls, str):
        print("Processing one image.")
        image_locations = [fn_url_or_urls]
    else:
        # Materialize so generators and other one-shot iterables work too.
        image_locations = list(fn_url_or_urls)
        print(f"Processing {len(image_locations)} images.")

    user_message = [{"type": "text", "text": message}]
    for image_location in image_locations:
        if identify_url(image_location):
            image_url = image_location
        else:
            # Declare the real media type (the pipeline produces PNGs);
            # fall back to JPEG when the extension is unknown.
            mime_type = mimetypes.guess_type(image_location)[0] or "image/jpeg"
            image_url = f"data:{mime_type};base64,{encode_image(image_location)}"
        user_message.append({"type": "image_url", "image_url": {"url": image_url}})

    # SECURITY: the secret key that used to be hard-coded here was removed
    # and must be revoked. The client reads OPENAI_API_KEY from the
    # environment when no api_key argument is given.
    client = OpenAI(max_retries=5, timeout=50)
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": user_message},
        ],
        max_tokens=4000,
    )
    cost(response)

    md = response.choices[0].message.content or ''

    # Keep the raw reply (it also lists the corrections) beside the Markdown.
    raw_fn = os.path.splitext(out_fn)[0] + '.txt'
    with open(raw_fn, 'w') as outfile:
        outfile.write(md)

    with open(out_fn, 'w') as outfile:
        outfile.write(_extract_markdown_block(md))
    return md


In [3]:
def pdf_to_png_high_quality(pdf_path, dpi=1200):
    """
    Converts each page of a given PDF into separate high-quality PNG files.
    Saves the PNG files in a folder named after the PDF file, created next
    to the PDF, and returns the path of that folder.

    NOTE: a later definition with the same name in this file replaces this
    one when the whole file is executed.

    :param pdf_path: path of the PDF to render
    :param dpi: rendering resolution; applied through a zoom matrix
    :return: path of the directory containing the PNG files
    """
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = os.path.join(os.path.dirname(pdf_path), pdf_name)
    os.makedirs(output_dir, exist_ok=True)

    # PyMuPDF renders at 72 DPI by default, so scale by dpi / 72.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            # Do not pass dpi= here as well: when get_pixmap receives a dpi
            # value it ignores the matrix, which made the requested
            # resolution have no effect.
            pix = page.get_pixmap(matrix=mat, alpha=False)

            output_path = os.path.join(output_dir, f"{pdf_name}_page_{page_num + 1}.png")
            pix.save(output_path)
    finally:
        # Release the document even when rendering or saving fails.
        doc.close()

    return output_dir

In [4]:
import os
import fitz  # PyMuPDF
import concurrent.futures

def convert_page_to_png(pdf_path, page_num, output_dir, dpi=1200):
    """
    Converts a single page of a PDF to a high-quality grayscale PNG.

    The PDF is opened and closed inside this function.

    :param pdf_path: path of the PDF to read
    :param page_num: zero-based index of the page to render
    :param output_dir: existing directory that receives the PNG
    :param dpi: rendering resolution; applied through a zoom matrix
    """
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # PyMuPDF renders at 72 DPI by default, so scale by dpi / 72.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    doc = fitz.open(pdf_path)
    try:
        page = doc.load_page(page_num)

        # get_pixmap has no `grayscale` keyword; grayscale output is
        # requested through the colorspace argument.
        pix = page.get_pixmap(matrix=mat, alpha=False, colorspace=fitz.csGRAY)

        output_path = os.path.join(output_dir, f"{pdf_name}_page_{page_num + 1}.png")
        pix.save(output_path)
    finally:
        # Release the document even when rendering or saving fails.
        doc.close()

def pdf_to_png_high_quality(pdf_path, dpi=1200):
    """
    Converts each page of a given PDF into separate high-quality grayscale
    PNG files, rendering the pages concurrently in a thread pool.
    Saves the PNG files in a folder named after the PDF file, created next
    to the PDF.

    :param pdf_path: path of the PDF to render
    :param dpi: rendering resolution handed to ``convert_page_to_png``
    :return: path of the directory containing the PNG files
    :raises Exception: re-raises the first error raised while rendering
        a page
    """
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = os.path.join(os.path.dirname(pdf_path), pdf_name)
    os.makedirs(output_dir, exist_ok=True)

    # Open the document once just to learn the page count.
    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
    finally:
        doc.close()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(convert_page_to_png, pdf_path, page_num, output_dir, dpi)
                   for page_num in range(total_pages)]

        # result() re-raises a worker's exception; merely waiting on the
        # futures would leave failed pages unnoticed.
        for future in futures:
            future.result()

    return output_dir


In [5]:
from PIL import Image, ImageEnhance
import os

def convert_images_to_black_and_white(directory):
    """
    Converts all PNG/JPEG images in the specified directory to grayscale,
    overwriting each file in place.

    Files are selected by extension (.png, .jpg, .jpeg, case-insensitive);
    everything else in the directory is left untouched.
    """
    for filename in os.listdir(directory):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(directory, filename)
        with Image.open(image_path) as img:
            # "L" is Pillow's single-channel 8-bit grayscale mode. The
            # contrast-enhanced copy that used to be computed here was
            # never saved, so that work has been dropped.
            grayscale_img = img.convert("L")
            grayscale_img.save(image_path)
                #high_contrast_img.save(image_path)




In [6]:
from pdf2image import convert_from_path
import os
from PIL import Image

def pdf_to_png_alternative(pdf_path, dpi=144):
    """
    Render every page of a PDF to its own PNG using pdf2image and Pillow.

    The PNGs are written to a folder named after the PDF (created next to
    it when missing), numbered with three-digit, one-based page numbers,
    and then passed through ``convert_images_to_black_and_white``.
    Returns the path of that folder.
    """
    stem, _ext = os.path.splitext(os.path.basename(pdf_path))
    target_dir = os.path.join(os.path.dirname(pdf_path), stem)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # One Pillow image per page, in page order.
    pages = convert_from_path(pdf_path, dpi=dpi, fmt='png')

    for number, page_image in enumerate(pages, start=1):
        # Zero-padded page numbers keep the files in page order when sorted.
        png_name = f"{stem}_page_{number:03d}.png"
        page_image.save(os.path.join(target_dir, png_name), 'PNG')

    convert_images_to_black_and_white(target_dir)
    return target_dir


In [7]:
def process_image(fn):
    """OCR one image unless its Markdown output already exists.

    The output path is the image path with its extension replaced by
    ``.md``. Only the final extension is swapped, so a ``.png`` elsewhere
    in the path is left alone and non-PNG images get a distinct output
    path instead of being skipped.

    :param fn: path of the image to transcribe
    :return: the model reply from ``ocr_image``, or ``None`` when the
        Markdown file was already present and the image was skipped
    """
    out_fn = os.path.splitext(fn)[0] + '.md'

    if os.path.exists(out_fn):
        # Already transcribed on an earlier run.
        return None

    return ocr_image(fn, out_fn=out_fn)

In [8]:
def read_the_md(fn):
    """Return the full text of the file at ``fn``."""
    with open(fn, 'r') as handle:
        return handle.read()

In [14]:
def pdf_to_md(pdf_fn):
    """Convert a PDF to a single Markdown file via per-page OCR.

    Steps: render the pages to PNG, transcribe the pages concurrently
    (pages that already have a ``.md`` file are skipped by
    ``process_image``), then join all per-page Markdown files in sorted
    order and write the result next to the PDF, with the PDF's extension
    replaced by ``.md``.

    :param pdf_fn: path of the PDF to convert
    """
    from glob import escape as glob_escape  # the file imports only glob.glob

    # Use the directory the renderer actually wrote to instead of
    # re-deriving it from the file name.
    folder = pdf_to_png_alternative(pdf_fn)
    # Escape so that glob metacharacters in the folder name match literally.
    pattern_dir = glob_escape(folder)
    image_fns = sorted(glob(os.path.join(pattern_dir, '*.png')))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Draining the iterator waits for every page and re-raises the
        # first worker error, if any.
        list(executor.map(process_image, image_fns))

    md_fns = sorted(glob(os.path.join(pattern_dir, '*.md')))
    md = '\n'.join(read_the_md(md_fn) for md_fn in md_fns)

    # Swap only the final extension: a plain '.pdf' -> '.md' string
    # replacement is a no-op for names such as 'scan.PDF', which made the
    # output path equal the input and overwrote the source PDF.
    md_fn = os.path.splitext(pdf_fn)[0] + '.md'

    with open(md_fn, 'w') as outfile:
        outfile.write(md)

In [16]:
# Convert the sample PDF; the lines below this call are the captured cell
# output (one "Processing"/"Cost" line per page image).
pdf_to_md('test.pdf')

Processing one image.Processing one image.

Processing one image.
Processing one image.
Processing one image.
Processing one image.
Processing one image.
Processing one image.
Processing one image.
Processing one image.
Cost: $0.02
Cost: $0.03
Cost: $0.03
Cost: $0.03
Cost: $0.03
Cost: $0.03
Cost: $0.03
Cost: $0.04
Cost: $0.03
Cost: $0.03


In [None]:
import argparse

def main(argv=None):
    """Command-line entry point: convert the PDF named in the arguments to Markdown.

    :param argv: list of argument strings to parse. ``None`` (the default)
        makes argparse read ``sys.argv[1:]``, as before; passing a list
        lets the function be called from a notebook or another script,
        where ``sys.argv`` belongs to the host process.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown")
    parser.add_argument('pdf_fn', type=str, help="Path to the PDF file")
    args = parser.parse_args(argv)

    pdf_to_md(args.pdf_fn)


if __name__ == "__main__":
    main()

In [None]:
import subprocess
import sys

def clip_content(url=None, input_file=None, output_format='markdown', output_file='output.md'):
    """
    Uses the ``clipper`` command-line tool to clip content from a URL or a
    file and save it in the specified format.

    On any failure an error message is printed to stderr and the process
    exits with status 1 (``SystemExit``).

    :param url: URL to clip content from (takes precedence over input_file)
    :param input_file: Input file (HTML) to clip content from
    :param output_format: Output format (markdown, json)
    :param output_file: Output file for clipped content
    """
    if url:
        source_args = ['-u', url]
    elif input_file:
        source_args = ['-i', input_file]
    else:
        print("Either URL or input file must be provided.", file=sys.stderr)
        sys.exit(1)

    # Argument list, not a shell string: nothing here is shell-interpreted.
    command = ['clipper', 'clip', *source_args, '-f', output_format, '-o', output_file]

    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except FileNotFoundError:
        # The executable is missing: fail like the other error paths
        # instead of leaking a traceback.
        print("Error occurred: the 'clipper' executable was not found.", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred: {e.stderr}", file=sys.stderr)
        sys.exit(1)
    except ValueError as e:
        # subprocess rejects malformed arguments with ValueError.
        print(e, file=sys.stderr)
        sys.exit(1)

    print(f"Content clipped successfully. Output saved in {output_file}")

# Example usage: clip a web page with the default format and output file
# (markdown, output.md).
# NOTE: this call sits at top level, so it runs every time this cell/file
# is executed.
clip_content(url="https://example.com")
# Or to clip content from a file: clip_content(input_file="path/to/file.html")
