# Create a dictionary using an LLM (for new datasets)

### Use an LLM to classify keywords extracted by OCR

##### 1. Extract keywords from sample masked images (focused on specified image regions)

##### 2. Classify the keywords using an LLM and save the result to a .txt file

In [1]:
# Mount Google Drive at /content/drive; later cells read and write under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Define the target directory (the root of the mounted Drive)
target_directory = '/content/drive/MyDrive'

# Create the directory if it doesn't exist; exist_ok=True makes this a no-op when it already does
os.makedirs(target_directory, exist_ok=True)

In [9]:
# System dependency: the Tesseract OCR engine
!apt-get install -y tesseract-ocr
# Python dependencies; %pip (rather than !pip) installs into the running kernel's environment
# NOTE(review): no visible cell imports cv2 — confirm opencv-python-headless is still needed
%pip install opencv-python-headless
%pip install pytesseract
%pip install openai

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [5]:
# Step 1: Verify and download the language data file if necessary
!ls /usr/share/tesseract-ocr/4.00/tessdata
!wget -P /usr/share/tesseract-ocr/4.00/tessdata https://github.com/tesseract-ocr/tessdata/raw/main/spa.traineddata # replace with dataset main language
!wget -P /usr/share/tesseract-ocr/4.00/tessdata https://github.com/tesseract-ocr/tessdata/raw/main/cat.traineddata # replace with dataset secondary language(s)

# Step 2: Set the TESSDATA_PREFIX environment variable
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'

# Step 3: Verify the installation
!tesseract --list-langs

configs  eng.traineddata  osd.traineddata  pdf.ttf  tessconfigs
--2025-08-13 14:21:58--  https://github.com/tesseract-ocr/tessdata/raw/main/spa.traineddata
Resolving github.com (github.com)... 140.82.116.4
Connecting to github.com (github.com)|140.82.116.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/spa.traineddata [following]
--2025-08-13 14:21:58--  https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/spa.traineddata
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18256019 (17M) [application/octet-stream]
Saving to: ‘/usr/share/tesseract-ocr/4.00/tessdata/spa.traineddata’


2025-08-13 14:21:58 (187 MB/s) - ‘/usr/share/tesseract-ocr/4.00/tessdata/spa.tr

In [10]:
import os
import pytesseract
from pytesseract import Output
from PIL import Image
import csv
import pandas as pd
from openai import OpenAI

In [8]:
# Function to perform OCR with a given PSM mode
def perform_ocr(image_path, psm_mode, lang='spa+cat'):
    """Run Tesseract on one image and return its OCR data as a dictionary.

    Args:
        image_path: Path of the image file to read.
        psm_mode: Tesseract page segmentation mode, passed through as ``--psm``.
        lang: Tesseract language string. Defaults to ``'spa+cat'``, the value
            that used to be hard-coded, so existing two-argument calls are unchanged.

    Returns:
        The dictionary produced by ``pytesseract.image_to_data`` with
        ``output_type=Output.DICT``.
    """
    custom_config = f'--psm {psm_mode}'
    # Open through a context manager so the image's file handle is released
    # as soon as OCR finishes instead of lingering until garbage collection.
    with Image.open(image_path) as image:
        return pytesseract.image_to_data(image, config=custom_config, lang=lang, output_type=Output.DICT)

# Function to clean text by removing excess spaces
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    words = text.split()
    return ' '.join(words)

# Function to merge OCR results from different PSM modes and remove duplicates
def merge_ocr_results(ocr_data_list):
    """Merge the text of several OCR result dictionaries into one de-duplicated list.

    Each entry of every ``ocr_data['text']`` is passed through ``clean_text``.
    Entries shorter than 3 characters after cleaning are discarded, and only the
    first occurrence of each cleaned string is kept, in encounter order.

    Returns:
        A dictionary ``{'text': [...]}`` holding the kept strings.
    """
    kept = []
    already_seen = set()

    for result in ocr_data_list:
        for raw_text in result['text']:
            token = clean_text(raw_text)
            # Skip fragments that are too short as well as repeats from earlier passes
            if len(token) < 3 or token in already_seen:
                continue
            already_seen.add(token)
            kept.append(token)

    return {'text': kept}

# Function to save OCR data to a CSV file
def save_to_csv(ocr_data, csv_filename):
    """Write the OCR text entries to a one-column CSV file with a ``text`` header.

    Args:
        ocr_data: Dictionary whose ``'text'`` value is an iterable of strings.
        csv_filename: Path of the CSV file to create (overwritten if it exists).
    """
    # Pin the encoding: the OCR text is Spanish/Catalan and contains accented
    # characters, so relying on the platform default could fail or produce a file
    # that a UTF-8 reader cannot decode.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['text'])
        writer.writerows([text] for text in ocr_data['text'])

# Input folder of masked images and output folder for the per-image OCR CSV files
image_dir = '/content/drive/MyDrive/ICDAR_workshop/Github_AnonED/mask_applied' # (REPLACE FILE PATH)
output_dir = '/content/drive/MyDrive/ICDAR_workshop/Github_AnonED/ocr_outputs' # (REPLACE FILE PATH)

# Make sure the output folder is there before anything is written to it
os.makedirs(output_dir, exist_ok=True)

# File extensions treated as images, and the Tesseract PSM modes whose results are combined
image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')
psm_modes = (3, 5, 12)

for filename in os.listdir(image_dir):
    # Anything that is not an image file is ignored
    if not filename.lower().endswith(image_extensions):
        continue

    image_path = os.path.join(image_dir, filename)
    print(f"Processing: {filename}")

    # One OCR pass per PSM mode, then merge the passes and drop duplicates
    merged_ocr_data = merge_ocr_results([perform_ocr(image_path, mode) for mode in psm_modes])

    # The CSV is named after the image: <image name without extension>_ocr.csv
    stem = os.path.splitext(filename)[0]
    csv_filename = os.path.join(output_dir, f"{stem}_ocr.csv")
    save_to_csv(merged_ocr_data, csv_filename)

print("OCR processing completed for all images.")

Processing: a_16_page_1.png
Processing: a_9_page_1.png
Processing: a_14_page_1.png
Processing: a_2_page_1.png
Processing: a_41_page_1.png
OCR processing completed for all images.


In [11]:
# Combine the text from every per-image OCR CSV into one de-duplicated, sorted list

# Directory containing CSV files
csv_dir = '/content/drive/MyDrive/ICDAR_workshop/Github_AnonED/ocr_outputs'  # Replace with your actual folder path

# Set to store unique text entries
unique_texts = set()

# Loop through all CSV files in the directory
for filename in os.listdir(csv_dir):
    if filename.endswith('.csv'):
        file_path = os.path.join(csv_dir, filename)
        try:
            # Read every cell as a literal string. With the pandas defaults, tokens such
            # as "N/A", "null" or "None" are parsed as missing values and silently
            # dropped, and a file whose tokens are all numeric is converted to numbers
            # (so "007" would come back as "7").
            df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
            if 'text' in df.columns:
                for text in df['text'].dropna():
                    cleaned_text = str(text).strip()
                    if cleaned_text:
                        unique_texts.add(cleaned_text)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest
            print(f"Error reading {filename}: {e}")

# Sort so the list (and everything derived from it) has the same order on every run;
# plain set iteration order for strings is not stable between Python sessions.
unique_text_list = sorted(unique_texts)

# Print result
print(f"Found {len(unique_text_list)} unique text entries.")


Found 335 unique text entries.


In [14]:
# Set OpenAI API key
# The key is taken from the OPENAI_API_KEY environment variable, or asked for with a
# non-echoing prompt, so the secret is never typed into (and saved with) the notebook.
import getpass

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: "))

# Define a prompt template
def generate_prompt(text):
    """Build the classification prompt for one OCR token.

    The prompt lists the criteria for the "title block" label, states that anything
    else is "other", and embeds ``text`` after the ``Text:`` marker.
    """
    template = """
You are a highly intelligent assistant tasked with classifying text into two groups: "title block" and "other".
Given the following text in Castilian Spanish, Catalan or Galician, classify it based on the following criteria:
A token is classified as "title block" if it is:
  - a profession or title (e.g., engineer, architect, etc.)
  - a title block term (e.g., title, date, scale, etc.)
  - a drawing role (preparer, reviewer, sign off, etc.)
  - a project descriptor (renovation, upgrade, installation)
  - a location descriptor or name of a place (municipality, city, province, etc.) - use Named Entity Recognition
  - a name (such as a first name, middle name or surname) - use Named Entity Recognition
  - a company name (especially energy distribution companies), contact details (email, phone number, website, link)
  - a code number (alphanumeric code greater than 5 characters)
  - an acronym of any chapter of the Oficial Colleges of Technical Engineers of Spain (for example, any branch of COGITI)
  - a brand name (such as an OEM manufacturer such as Philips, ABB, etc.)
  - a date, month or year
Otherwise, it is classified as "other".
There may be minor spelling errors in the OCR text output.

Text: {text}

Output:
"""
    return template.format(text=text)

# Define the function to call the OpenAI API
def classify_text(text):
    """Classify one OCR token as "title block" or "other" through the OpenAI chat API.

    Args:
        text: The token to classify; it is embedded in the prompt built by
            ``generate_prompt``.

    Returns:
        ``'title block'`` or ``'other'`` when the reply equals one of those labels
        after trimming surrounding whitespace, quotes and punctuation and ignoring
        case; otherwise the stripped raw reply. ``None`` when the request raised an
        exception (the error is printed).
    """
    prompt = generate_prompt(text)
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # Specify the model
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=300,
            temperature=0  # Deterministic output
        )
        # Extract response content
        result = response.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: report the failure and let the caller see a missing label
        print(f"Error: {e}")
        return None

    # The prompt does not pin an exact output format, so the model may answer
    # '"Title block"' or 'title block.'; callers compare against the bare lowercase
    # label, so normalise the two known labels and pass anything else through as-is.
    label = result.strip(' \t\r\n"\'`*.:').lower()
    if label in ('title block', 'other'):
        return label
    return result


# Put the unique OCR tokens into a one-column DataFrame
df = pd.DataFrame({'text': unique_text_list})

# Ask the LLM for a label for every token (one API call per row)
df['Classification'] = df['text'].apply(classify_text)

# Keep only the rows labelled exactly 'title block'; every other row is left out
is_title_block = df['Classification'] == 'title block'
df_tb = df[is_title_block]

# Write the kept tokens, one per row, without index or header (REPLACE FILE PATH)
dict_path = '/content/drive/MyDrive/ICDAR_workshop/Github_AnonED/4o_mini_dict.txt'
df_tb['text'].to_csv(dict_path, index=False, header=False)

print("Text classification completed and filtered data saved to .txt file.")

Text classification completed and filtered data saved to .txt file.
