# Import libraries

In [1]:
import base64
import getpass
import os
import time
from datetime import datetime
from io import BytesIO

import requests
from PIL import Image

# Functions
The following cell includes functions to digitize image scans using the multi-modal capabilities of OpenAI's GPT-4. Further down below, you can run the LLM digitization pipeline.

In [14]:
# Function to encode the PNG-image to base64 in order to send it to the OpenAI API
def encode_image(image_path):
    """Return the image at `image_path` as a base64 text string.

    The image is opened with PIL, re-serialised as PNG into an in-memory
    buffer, and the PNG bytes are base64-encoded.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        str: Base64 representation of the PNG bytes, decoded as UTF-8.
    """
    with BytesIO() as png_buffer:
        with Image.open(image_path) as source_image:
            source_image.save(png_buffer, format='PNG')
        # getvalue() returns the whole buffer regardless of the stream position
        png_bytes = png_buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode('utf-8')

# Function to send image based on image path to the OpenAI API
# Modify task prompt below
# Valid OpenAI API key is needed
_DEFAULT_OCR_PROMPT = (
    "Transcribe the text from the attached image of an old Spanish craft guild ordinance using these guidelines: "
    "1. ALWAYS transcribe as written in the textual image. Be aware of archaic spellings and special characters. "
    "2. Only provide the Spanish transcription, nothing else. UNDER NO CIRCUMSTANCES should you provide anything else than the exact Spanish transcription. "
    "3. The guild ordinance often contain enumerations. Provide those enumerations as plain numbers. For example, '2\n' "
    "4. Always exclude foot and page numbers."
)


def ocr_openai_api(image_path, api_key, prompt=_DEFAULT_OCR_PROMPT,
                   model="gpt-4-vision-preview", max_tokens=4000, timeout=300):
    """Send one image to the OpenAI chat completions endpoint and return the reply text.

    The image is base64-encoded with `encode_image` and posted together with
    the task prompt. On success the token usage and the current time are
    printed as a progress log.

    Args:
        image_path: Path of the image to transcribe.
        api_key: OpenAI API key, sent as a Bearer token.
        prompt: Task prompt sent alongside the image. Defaults to the
            guild-ordinance transcription prompt.
        model: Name of the OpenAI model to call.
        max_tokens: Upper bound on the number of completion tokens.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the pipeline indefinitely.

    Returns:
        str | None: The message content of the first choice, or None when the
        request fails (network error, timeout, non-200 status, or a response
        body that does not have the expected structure). Failures are printed,
        not raised, so that the caller can retry.
    """
    base64_image = encode_image(image_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": max_tokens
    }

    try:
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
    except requests.exceptions.RequestException as exc:
        # Connection errors and timeouts are reported like HTTP errors so the
        # caller's retry logic handles them the same way.
        print(f"Error: request failed - {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    try:
        # Parse the body once and reuse it for both usage and content
        body = response.json()
        usage = body['usage']
        content = body['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        print(f"Error: unexpected response format - {exc}")
        return None

    # Print token usage
    print(usage)

    # Print time
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))

    return content

# Function to iterate through every folder and automate the LLM-OCR pipeline
# Each folder contains a series of scans from an ordinance, organised chronologically
def _ocr_with_retry(image_path, api_key, retry_wait, max_retries):
    """Call `ocr_openai_api` until it returns a non-empty transcription.

    Raises RuntimeError once `max_retries` retries have failed; with
    `max_retries=None` it keeps retrying indefinitely.
    """
    ocr_text = ocr_openai_api(image_path, api_key)
    retries = 0
    while not ocr_text:
        if max_retries is not None and retries >= max_retries:
            raise RuntimeError(
                f"OCR failed for {image_path} after {retries} retries"
            )
        print(f"Error or rate limit reached. Waiting {retry_wait} seconds before retrying.")
        time.sleep(retry_wait)
        retries += 1
        ocr_text = ocr_openai_api(image_path, api_key)
        if ocr_text:
            print("Retry successful.")
    return ocr_text


def process_folders(folders, api_key, retry_wait=10, max_retries=None):
    """Transcribe every TIFF scan below each folder and write one text file per subfolder.

    For each directory found by walking a top-level folder (the top-level
    folder itself is skipped), the TIFF files are processed in sorted order:
    each one is saved as a PNG next to the original, sent to
    `ocr_openai_api`, and the returned texts are joined with newlines into
    ``<cwd>/../llm_digitization_data/full_text_data/<folder name>/<subfolder name>.txt``.

    Args:
        folders: Iterable of top-level folder paths.
        api_key: OpenAI API key passed through to `ocr_openai_api`.
        retry_wait: Seconds to sleep between attempts after a failed call.
        max_retries: Maximum number of retries per image. The default, None,
            retries indefinitely; note that a permanent error (for example an
            invalid API key) then never terminates, so set a bound when
            running unattended.

    Raises:
        RuntimeError: If `max_retries` is set and an image still fails after
            that many retries.
    """
    # The output root does not depend on the folder being processed
    output_root = os.path.join(os.getcwd(), "..", "llm_digitization_data", "full_text_data")

    for folder in folders:
        # normpath drops a trailing separator, which would otherwise make basename() empty
        folder_name = os.path.basename(os.path.normpath(folder))

        for subdir, dirs, files in os.walk(folder):
            # Sort the dirs list in place so os.walk descends in alphabetical order
            dirs.sort()

            # Skip processing if the current directory is the root folder
            if subdir == folder:
                continue

            # Keep only the scans (any case, .tif or .tiff) so that other files,
            # such as the PNGs written by a previous run, do not distort the progress count
            tif_files = sorted(
                name for name in files if name.lower().endswith(('.tif', '.tiff'))
            )

            ocr_texts = []
            for i, file in enumerate(tif_files):
                tif_path = os.path.join(subdir, file)
                print(f"Processing image {i} of {len(tif_files)-1} {tif_path}")

                # Convert the TIFF to a PNG stored next to it; the context manager
                # releases the file handle as soon as the copy is written
                png_path = os.path.splitext(tif_path)[0] + '.png'
                with Image.open(tif_path) as img:
                    img.save(png_path, 'PNG')

                ocr_texts.append(
                    _ocr_with_retry(png_path, api_key, retry_wait, max_retries)
                )

            # Output folder
            output_folder = os.path.join(output_root, folder_name)
            print(f'Output folder: {output_folder}')
            # Create the folder on first use so open() below cannot fail on a fresh checkout
            os.makedirs(output_folder, exist_ok=True)

            # Output file
            output_file_path = os.path.join(output_folder, f"{os.path.basename(subdir)}.txt")
            print(f'Save txt-file as: {output_file_path}')

            # Save output file; explicit UTF-8 keeps special characters intact on every platform
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(ocr_texts))
            print(f"Completed processing for {subdir}. Output saved to {output_file_path}")

# Your OpenAI API Key
You can generate a project API key on https://platform.openai.com/api-keys

In [None]:
# OpenAI API key
# Never store the key in the notebook itself: it is read from the OPENAI_API_KEY
# environment variable, and if that is unset you are prompted for it (input is not echoed).
# NOTE(review): an earlier version of this cell contained a literal key; if it was
# ever valid it should be revoked on the OpenAI dashboard.
api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Run LLM-OCR pipeline by processing Mexico and Peru folders

In [9]:
# Getting path of current working directory
# (the data folders below are resolved relative to it)
current_directory = os.getcwd()
print(f"Current directory: {current_directory}")

# Path to top-level folders containing subfolders of images
# NOTE(review): the Mexico path uses 'archival_image_scans' while the Peru path uses
# 'archival_images_scans' — presumably one of the two is a typo; confirm against the
# actual directory layout. os.walk does not raise for a missing directory by default,
# so a wrong name would most likely be skipped without any error.
mexico_folder_path = os.path.normpath(os.path.join(current_directory, '../llm_digitization_data/archival_image_scans/mexico'))
peru_folder_path = os.path.normpath(os.path.join(current_directory, '../llm_digitization_data/archival_images_scans/peru'))

# Print paths to check if they are set up correctly on your machine
print(f"Mexico folder path: {mexico_folder_path}")
print(f"Peru folder path: {peru_folder_path}")

Current directory: /Users/niclasgriesshaber/Desktop/guilds-llm/01_llm_digitization/llm_digitization_code
Mexico folder path: /Users/niclasgriesshaber/Desktop/guilds-llm/01_llm_digitization/llm_digitization_data/archival_image_scans/mexico
Peru folder path: /Users/niclasgriesshaber/Desktop/guilds-llm/01_llm_digitization/llm_digitization_data/archival_images_scans/peru


In [16]:
# Process both folders
# This sends every scan found below the two folders to the OpenAI API and
# writes one text file per subfolder; 'Done!' is printed once both have finished.
process_folders([mexico_folder_path, peru_folder_path], api_key)
print('Done!')

Processing image 0 of 4 /Users/niclasgriesshaber/Desktop/guilds-nlp/data/image_data/original_scans/mexico/1561_hatters/20230415160311164_0002.tif
{'prompt_tokens': 1221, 'completion_tokens': 728, 'total_tokens': 1949}
01/03/2024 00:20:29
Processing image 1 of 4 /Users/niclasgriesshaber/Desktop/guilds-nlp/data/image_data/original_scans/mexico/1561_hatters/20230415160311164_0003.tif
{'prompt_tokens': 1221, 'completion_tokens': 1100, 'total_tokens': 2321}
01/03/2024 00:21:51
Processing image 2 of 4 /Users/niclasgriesshaber/Desktop/guilds-nlp/data/image_data/original_scans/mexico/1561_hatters/20230415160311164_0004.tif
{'prompt_tokens': 1221, 'completion_tokens': 1208, 'total_tokens': 2429}
01/03/2024 00:22:37
Processing image 3 of 4 /Users/niclasgriesshaber/Desktop/guilds-nlp/data/image_data/original_scans/mexico/1561_hatters/20230415160311164_0005.tif
{'prompt_tokens': 1221, 'completion_tokens': 1120, 'total_tokens': 2341}
01/03/2024 00:24:16
Processing image 4 of 4 /Users/niclasgriessha