<a href="https://colab.research.google.com/github/akhil27/5731_project/blob/main/INFO_5731_GROUP4_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install requests



In [1]:
!apt-get install tesseract-ocr
!pip install pytesseract


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [3]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Pamphlet.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Pamphlet")):
    base_folder = os.path.join(extract_path, "Pamphlet")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: 17th Annual Country Club Golf Tournament [Flyer]
- Creator: Lopez, J. Lewis (Tournament Chairman)
- Subject: Golf tournaments, country clubs, leisure activities, Galveston
- Description: A flyer providing a schedule for the 17th annual Country Club Golf Tournament held in Galveston, Texas. It includes dates, social events, and organizational information associated with the event.
- Publisher: Rosenberg Library
- Contributor: Kempner, Harris Leon (Correspondent)
- Date: 1960-08-11/1960-08-14
- Type: Text
- Format: Pamphlet; [1] page, 26 x 21 cm; JPEG
- Identifier: ark:/67531/metapth1420610
- Source: Harris and Eliza Kempner Collection, MS 80-0002
- Language: English
- Relation: Part of the Personal Papers (MS 80-0002) series
- Coverage: Galveston, Texas, United States
- Rights: Public Domain

Example 2:
Metadata:
- Title: ABC Bulb and Seed Specials for Spring Planting
- Creator: American Bulb Company
- Subject: Cannas, lilies, seedlings, flowers, gardening, spring planting
- Description: A single-page pamphlet promoting seasonal flower seeds and bulbs from the American Bulb Company. It features prices, illustrations of flowers, and is addressed to D. W. Kempner.
- Publisher: Rosenberg Library
- Contributor: Kempner, Daniel W. (Daniel Webster), 1877–1956
- Date: 1950
- Type: Text
- Format: Pamphlet; 1 page; 16 x 23 cm; JPEG
- Identifier: ark:/67531/metapth1349947
- Source: Harris and Eliza Kempner Collection, MS 80-0002
- Language: English
- Relation: Part of the Personal Papers (MS 80-0002) series
- Coverage: United States
- Rights: Public Domain
"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical pamphlets following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: pamphlet title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-maverick:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "pamphlets_metadata_llama.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'pamphlets_metadata_llama.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Pamphlet1

📂 Processing: Pamphlet10

📂 Processing: Pamphlet11

📂 Processing: Pamphlet12

📂 Processing: Pamphlet13

📂 Processing: Pamphlet14

📂 Processing: Pamphlet15

📂 Processing: Pamphlet16

📂 Processing: Pamphlet17

📂 Processing: Pamphlet18

📂 Processing: Pamphlet19

📂 Processing: Pamphlet2

📂 Processing: Pamphlet20

📂 Processing: Pamphlet21

📂 Processing: Pamphlet22

📂 Processing: Pamphlet23

📂 Processing: Pamphlet24

📂 Processing: Pamphlet25

📂 Processing: Pamphlet26

📂 Processing: Pamphlet27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'pamphlets_metadata_llama.csv'


In [2]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Letter.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Letter")):
    base_folder = os.path.join(extract_path, "Letter")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Letter from Will Clayton to H. Kempner, July 15, 1943
- Creator: Clayton, Will L. (Author)
- Contributor: Kempner, Harris L. (Recipient)
- Date: 1943-07-15
- Content Description: A letter discussing cotton transactions and financial arrangements addressed to Harris Kempner.
- Subject and Keywords: business correspondence, cotton industry, financial matters
- Coverage: United States - Texas - Galveston County - Galveston

Example 2:
Metadata:
- Title: Letter from D. W. Kempner to Harris Kempner, March 25, 1940
- Creator: Kempner, Daniel Webster (Author)
- Contributor: Kempner, Harris L. (Recipient)
- Date: 1940-03-25
- Content Description: A letter about business operations and family financial matters sent from Daniel Webster Kempner to Harris Kempner.
- Subject and Keywords: business affairs, family correspondence, financial management
- Coverage: United States - Texas - Galveston County - Galveston

Example 3:
Metadata:
- Title: Letter from Moore and McKinley to Harris Kempner, May 5, 1941
- Creator: Moore and McKinley (Author)
- Contributor: Kempner, Harris L. (Recipient)
- Date: 1941-05-05
- Content Description: A letter from Moore and McKinley firm regarding banking operations and cotton trade issues directed to Harris Kempner.
- Subject and Keywords: business letters, banking, cotton trade
- Coverage: United States - Texas - Galveston County - Galveston
"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical letters following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Letter from A to B, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-maverick:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "letters_metadata_llama.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'letters_metadata_llama.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Letter 1

📂 Processing: Letter 10

📂 Processing: Letter 11

📂 Processing: Letter 12

📂 Processing: Letter 13

📂 Processing: Letter 14

📂 Processing: Letter 15

📂 Processing: Letter 16

📂 Processing: Letter 17

📂 Processing: Letter 18

📂 Processing: Letter 19

📂 Processing: Letter 2

📂 Processing: Letter 20

📂 Processing: Letter 21

📂 Processing: Letter 22

📂 Processing: Letter 23

📂 Processing: Letter 24

📂 Processing: Letter 25

📂 Processing: Letter 26

📂 Processing: Letter 27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'letters_metadata_llama.csv'


In [6]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Photographs.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Photographs")):
    base_folder = os.path.join(extract_path, "Photographs")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Photograph of a Child Sitting and Holding a Dog
- Creator: Unknown
- Subject and Keywords: children, pets, outdoor activities
- Content Description: A black-and-white photograph showing a young child seated on the ground, smiling while holding a small dog, with a rustic background.
- Date: 19XX
- Coverage: United States
- Format: 1 photograph: ; 8 x 8 cm

Example 2:
Metadata:
- Title: Photograph of a Family Gathering Outdoors
- Creator: Unknown
- Subject and Keywords: family, group portraits, leisure activities
- Content Description: A photograph capturing a family of five posing together in a grassy field, dressed in early 20th-century attire.
- Date: 19XX
- Coverage: Germany - Berlin - Berlin
- Format: 1 photograph: 9 x 14 cm

Example 3:
Metadata:
- Title: Homestead Buildings with Barn and Yard
- Creator: Unknown
- Subject and Keywords: agriculture, homesteads, barns
- Content Description: A rural homestead photograph showing a central muddy ground surrounded by several wooden buildings, indicative of farming activity.
- Date: 19XX
- Coverage: United States
- Format: 1 photograph: 13 x 18 cm
"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical photohraphs following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Photograph title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "photographs_metadata_llama.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'photographs_metadata_llama.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Photograph1

📂 Processing: Photograph10

📂 Processing: Photograph11

📂 Processing: Photograph12

📂 Processing: Photograph13

📂 Processing: Photograph14

📂 Processing: Photograph15

📂 Processing: Photograph16

📂 Processing: Photograph17

📂 Processing: Photograph18

📂 Processing: Photograph19

📂 Processing: Photograph2

📂 Processing: Photograph20

📂 Processing: Photograph21

📂 Processing: Photograph22

📂 Processing: Photograph23

📂 Processing: Photograph24

📂 Processing: Photograph25

📂 Processing: Photograph26

📂 Processing: Photograph27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'photographs_metadata_llama.csv'


In [8]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Image.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Image")):
    base_folder = os.path.join(extract_path, "Image")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Advertisement for Amana Air Conditioner
- Creator: Amana Refrigeration Incorporated
- Subject and Keywords: air conditioning, advertisements, consumer products
- Content Description: A promotional pamphlet detailing technical specifications and product features for Amana brand residential air conditioners.
- Date: 1955~
- Coverage: United States - Iowa - Iowa County - Middle Amana
- Format: 1 page; 28 x 22 cm

Example 2:
Metadata:
- Title: Sugar Chart for the Year 1952
- Creator: H. H. Pike & Son
- Subject and Keywords: sugar trade, historical charts, commerce
- Content Description: A commercial sugar chart showing commodity prices and trends for the year 1952, intended for businesses and traders.
- Date: 1952
- Coverage: United States
- Format: 1 page; 36 x 49 cm

Example 3:
Metadata:
- Title: Membership Notice from Galveston Chamber of Commerce
- Creator: Galveston Chamber of Commerce
- Subject and Keywords: business communications, chamber of commerce, local trade
- Content Description: A printed notice sent to members of the Galveston Chamber of Commerce regarding upcoming meetings and events.
- Date: 1950-09
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 2 pages; 28 cm; 8 x 15 cm
"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Images following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Image title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "images_metadata_llama.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'images_metadata_llama.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Image1

📂 Processing: Image10

📂 Processing: Image2

📂 Processing: Image3

📂 Processing: Image4

📂 Processing: Image5

📂 Processing: Image6

📂 Processing: Image7

📂 Processing: Image8

📂 Processing: Image9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'images_metadata_llama.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Article.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Article")):
    base_folder = os.path.join(extract_path, "Article")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Newspaper Article on Cotton Trade Policies, April 12, 1943
- Creator: The Galveston Daily News (Publisher)
- Subject and Keywords: cotton trade, economic policies, world war II
- Content Description: A newspaper article analyzing shifts in cotton trade regulations during wartime, focusing on Texas and Gulf Coast ports.
- Date: 1943-04-12
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 1 page; 28 x 22 cm

Example 2:
Metadata:
- Title: Editorial on Agricultural Advances, May 7, 1950
- Creator: Houston Chronicle (Publisher)
- Subject and Keywords: agriculture, innovation, editorial opinions
- Content Description: An editorial piece celebrating innovations in farming machinery and their impact on Texas agricultural productivity.
- Date: 1950-05-07
- Coverage: United States - Texas - Harris County - Houston
- Format: 1 page; 30 x 24 cm

"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical article following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Article title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "articles_metadata_llama.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'articles_metadata_llama.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: A1

📂 Processing: a10

📂 Processing: a11

📂 Processing: a2

📂 Processing: a3

📂 Processing: a4

📂 Processing: a5

📂 Processing: a6

📂 Processing: a7

📂 Processing: a8

📂 Processing: a9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'articles_metadata_llama.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Map.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Map")):
    base_folder = os.path.join(extract_path, "Map")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: 1900 Map of Galveston Island Before the Hurricane
- Creator: Texas State Cartographic Survey (Creator)
- Subject and Keywords: maps, galveston, geographic surveys
- Content Description: A historical map illustrating the geography and city structures of Galveston Island before the devastation of the 1900 hurricane.
- Date: 1900
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 1 map; 45 x 60 cm

Example 2:
Metadata:
- Title: Trade Routes in the Gulf of Mexico, 1895
- Creator: United States Department of Commerce (Creator)
- Subject and Keywords: shipping routes, gulf of mexico, maritime trade
- Content Description: A printed map highlighting major shipping and trade routes across the Gulf of Mexico region.
- Date: 1895
- Coverage: Gulf of Mexico
- Format: 1 map; 60 x 75 cm


"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical map following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Map title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "map_metadata_llama.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'map_metadata_llama.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: map1

📂 Processing: map10

📂 Processing: map11

📂 Processing: map12

📂 Processing: map13

📂 Processing: map14

📂 Processing: map15

📂 Processing: map16

📂 Processing: map17

📂 Processing: map18

📂 Processing: map19

📂 Processing: map2

📂 Processing: map20

📂 Processing: map3

📂 Processing: map4

📂 Processing: map5

📂 Processing: map6

📂 Processing: map7

📂 Processing: map8

📂 Processing: map9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'map_metadata_llama.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Paper.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Paper")):
    base_folder = os.path.join(extract_path, "Paper")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Research Paper on Agricultural Export Strategies, 1947
- Creator: Kempner, Harris L. (Author)
- Subject and Keywords: agriculture exports, business strategies, post-war economy
- Content Description: A typewritten research paper discussing approaches to expanding Texas agricultural exports after World War II.
- Date: 1947-09-15
- Coverage: United States - Texas
- Format: 15 pages; 21 x 28 cm

Example 2:
Metadata:
- Title: Personal Letter on Business Challenges, June 2, 1955
- Creator: Kempner, Daniel W. (Author)
- Subject and Keywords: personal correspondence, business hardships, family letters
- Content Description: A personal paper discussing financial challenges and family matters in the cotton industry.
- Date: 1955-06-02
- Coverage: United States - Texas - Galveston County
- Format: 3 pages; 20 x 25 cm


"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical papers following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Paper title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Paper_metadata_llama.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Paper_metadata_llama.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: p1

📂 Processing: p10

📂 Processing: p11

📂 Processing: p12

📂 Processing: p13

📂 Processing: p14

📂 Processing: p2

📂 Processing: p3

📂 Processing: p4

📂 Processing: p5

📂 Processing: p6

📂 Processing: p7

📂 Processing: p8

📂 Processing: p9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Paper_metadata_llama.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Legislative document.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Legislative document")):
    base_folder = os.path.join(extract_path, "Legislative document")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Texas Senate Bill on Cotton Regulation, 1939
- Creator: Texas Legislature (Author)
- Subject and Keywords: legislation, cotton regulation, agriculture policy
- Content Description: A legislative document outlining regulations for cotton production limits and export incentives within Texas.
- Date: 1939-03-21
- Coverage: United States - Texas
- Format: 8 pages; 21 x 30 cm

Example 2:
Metadata:
- Title: Congressional Report on Southern Agriculture, 1945
- Creator: United States Congress (Author)
- Subject and Keywords: congressional reports, southern agriculture, federal support
- Content Description: A report prepared for the U.S. Congress discussing the state of agriculture in the southern states and recommending financial aid.
- Date: 1945-07-10
- Coverage: United States - Southern States
- Format: 20 pages; 22 x 28 cm

"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Legislative document following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Legislative document title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Legislative document_metadata_llama.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Legislative document_metadata_llama.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: ld1

📂 Processing: ld10

📂 Processing: ld11

📂 Processing: ld12

📂 Processing: ld13

📂 Processing: ld14

📂 Processing: ld15

📂 Processing: ld2

📂 Processing: ld3

📂 Processing: ld4

📂 Processing: ld5

📂 Processing: ld6

📂 Processing: ld7

📂 Processing: ld8

📂 Processing: ld9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Legislative document_metadata_llama.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "CLIPPING.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "CLIPPING")):
    base_folder = os.path.join(extract_path, "CLIPPING")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Newspaper Clipping on Cotton Price Surge, March 15, 1951
- Creator: The Dallas Morning News (Publisher)
- Subject and Keywords: cotton prices, market trends, news clippings
- Content Description: A newspaper clipping reporting a significant rise in cotton prices across southern states due to increased international demand.
- Date: 1951-03-15
- Coverage: United States - Texas - Dallas County
- Format: 1 clipping; 28 x 22 cm

Example 2:
Metadata:
- Title: Clipping About Agricultural Technology Adoption, 1950
- Creator: Houston Chronicle (Publisher)
- Subject and Keywords: agricultural innovations, technology adoption, texas farming
- Content Description: A clipped newspaper article showcasing how Texas farmers were adopting modern farming technologies after World War II.
- Date: 1950-06-18
- Coverage: United States - Texas
- Format: 1 clipping; 30 x 24 cm
"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical CLIPPINGS following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: CLIPPING title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "clipping_metadata_llama.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'clipping_metadata_llama.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: clipping1

📂 Processing: clipping10

📂 Processing: clipping11

📂 Processing: clipping12

📂 Processing: clipping13

📂 Processing: clipping14

📂 Processing: clipping15

📂 Processing: clipping16

📂 Processing: clipping17

📂 Processing: clipping18

📂 Processing: clipping19

📂 Processing: clipping2

📂 Processing: clipping20

📂 Processing: clipping21

📂 Processing: clipping22

📂 Processing: clipping23

📂 Processing: clipping24

📂 Processing: clipping25

📂 Processing: clipping26

📂 Processing: clipping27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'clipping_metadata_llama.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Legal Document.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Legal Document")):
    base_folder = os.path.join(extract_path, "Legal Document")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Legal Agreement on Land Sale between Kempner and Associates, 1942
- Creator: Galveston County Court (Author)
- Subject and Keywords: land agreements, property sales, legal documentation
- Content Description: A notarized legal document recording the terms and conditions of a land sale transaction involving members of the Kempner family.
- Date: 1942-11-03
- Coverage: United States - Texas - Galveston County
- Format: 5 pages; 22 x 28 cm

Example 2:
Metadata:
- Title: Court Order Regarding Agricultural Subsidy Dispute, 1947
- Creator: Texas Supreme Court (Author)
- Subject and Keywords: court rulings, agriculture subsidies, legal disputes
- Content Description: A court order resolving a subsidy dispute related to cotton farming in Texas.
- Date: 1947-04-21
- Coverage: United States - Texas
- Format: 7 pages; 21 x 28 cm

"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Legal Documents following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Legal Document title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Legal Document.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Legal Document.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Legal Document1

📂 Processing: Legal Document10

📂 Processing: Legal Document11

📂 Processing: Legal Document12

📂 Processing: Legal Document13

📂 Processing: Legal Document14

📂 Processing: Legal Document15

📂 Processing: Legal Document16

📂 Processing: Legal Document17

📂 Processing: Legal Document18

📂 Processing: Legal Document19

📂 Processing: Legal Document2

📂 Processing: Legal Document20

📂 Processing: Legal Document21

📂 Processing: Legal Document22

📂 Processing: Legal Document23

📂 Processing: Legal Document24

📂 Processing: Legal Document25

📂 Processing: Legal Document26

📂 Processing: Legal Document27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Legal Document.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Newspaper.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Newspaper")):
    base_folder = os.path.join(extract_path, "Newspaper")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Front Page of The Galveston News, October 10, 1940
- Creator: The Galveston News (Publisher)
- Subject and Keywords: world war II news, local news, texas reporting
- Content Description: A full-page newspaper featuring news articles about World War II updates, local events, and market conditions in Galveston.
- Date: 1940-10-10
- Coverage: United States - Texas - Galveston County
- Format: 1 newspaper; 58 x 45 cm

Example 2:
Metadata:
- Title: Sunday Business Section on Cotton Trade, 1949
- Creator: Houston Chronicle (Publisher)
- Subject and Keywords: business news, cotton industry, export trade
- Content Description: Business section articles focused on the cotton trade industry boom post-World War II.
- Date: 1949-05-22
- Coverage: United States - Texas - Harris County
- Format: 1 newspaper; 60 x 48 cm

"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Newspapers following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Newspaper title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Newspaper.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Newspaper.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Newspaper1

📂 Processing: Newspaper10

📂 Processing: Newspaper2

📂 Processing: Newspaper3

📂 Processing: Newspaper4

📂 Processing: Newspaper5

📂 Processing: Newspaper6

📂 Processing: Newspaper7

📂 Processing: Newspaper8

📂 Processing: Newspaper9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Newspaper.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Report.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Report")):
    base_folder = os.path.join(extract_path, "Report")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Annual Report on Agricultural Exports, 1945
- Creator: United States Department of Agriculture (Author)
- Subject and Keywords: agricultural exports, annual reports, postwar recovery
- Content Description: A detailed report summarizing agricultural export trends and trade policies affecting Texas during the postwar period.
- Date: 1945-12-31
- Coverage: United States
- Format: 45 pages; 21 x 28 cm

Example 2:
Metadata:
- Title: Galveston Cotton Shipping Statistics Report, 1948
- Creator: Galveston Cotton Exchange (Author)
- Subject and Keywords: shipping statistics, cotton trade, port records
- Content Description: A statistical report documenting the number of cotton bales shipped from the Port of Galveston during the year 1948.
- Date: 1948-11-15
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 25 pages; 21 x 28 cm

"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Reports following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Report title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Report.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Report.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Report 1

📂 Processing: Report 10

📂 Processing: Report 11

📂 Processing: Report 12

📂 Processing: Report 13

📂 Processing: Report 14

📂 Processing: Report 15

📂 Processing: Report 16

📂 Processing: Report 17

📂 Processing: Report 18

📂 Processing: Report 19

📂 Processing: Report 2

📂 Processing: Report 20

📂 Processing: Report 21

📂 Processing: Report 22

📂 Processing: Report 23

📂 Processing: Report 24

📂 Processing: Report 25

📂 Processing: Report 26

📂 Processing: Report 27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Report.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Script.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Script")):
    base_folder = os.path.join(extract_path, "Script")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Script for Cotton Industry Promotional Film, 1948
- Creator: Galveston Cotton Council (Author)
- Subject and Keywords: cotton trade, promotional films, texas economy
- Content Description: A scripted narration intended for a short promotional film highlighting the importance of the cotton industry in Texas during the late 1940s.
- Date: 1948-05-01
- Coverage: United States - Texas
- Format: 12 pages; 21 x 28 cm

Example 2:
Metadata:
- Title: Radio Broadcast Script on Postwar Trade, 1946
- Creator: United States Department of Commerce (Author)
- Subject and Keywords: radio scripts, postwar economy, trade relations
- Content Description: A typed script for a scheduled radio broadcast discussing postwar trade expansion policies and opportunities in the cotton sector.
- Date: 1946-08-12
- Coverage: United States - Washington, D.C.
- Format: 8 pages; 21 x 28 cm


"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Script following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Script title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Script.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Script.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: script1

📂 Processing: script10

📂 Processing: script11

📂 Processing: script12

📂 Processing: script13

📂 Processing: script14

📂 Processing: script2

📂 Processing: script3

📂 Processing: script4

📂 Processing: script5

📂 Processing: script6

📂 Processing: script7

📂 Processing: script8

📂 Processing: script9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Script.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Poem.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Poem")):
    base_folder = os.path.join(extract_path, "Poem")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Poem "The Cotton Fields of Galveston", 1938
- Creator: Kempner, Harris L. (Poet)
- Subject and Keywords: poetry, cotton fields, galveston
- Content Description: A lyrical poem celebrating the beauty and toil of workers in the cotton fields surrounding Galveston during the early 20th century.
- Date: 1938-04-10
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 1 page; 21 x 28 cm

Example 2:
Metadata:
- Title: "Songs of the Harvest", Collection of Poems, 1942
- Creator: Unknown (Poet)
- Subject and Keywords: harvest poetry, seasonal change, rural life
- Content Description: A collection of short poems reflecting on harvest time, the changing seasons, and agricultural life.
- Date: 1942
- Coverage: United States
- Format: 10 pages; 21 x 28 cm


"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical poems following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: poem title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Poem.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Poem.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Poem 1


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Poem.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Text.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Text")):
    base_folder = os.path.join(extract_path, "Text")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Notes on Cotton Prices in the 1940s
- Creator: Kempner, Daniel W. (Author)
- Subject and Keywords: cotton pricing, handwritten notes, agricultural economy
- Content Description: Handwritten notes analyzing cotton price trends and trading volumes from 1940 to 1949, based on personal observations.
- Date: 1949-08-20
- Coverage: United States - Texas
- Format: 3 pages; 20 x 25 cm

Example 2:
Metadata:
- Title: Typed Notes on Business Strategy
- Creator: Unknown (Author)
- Subject and Keywords: business strategies, operational notes, historical planning
- Content Description: Typed text outlining strategies for managing agricultural export business operations, intended for internal use.
- Date: 1950-05-15
- Coverage: United States
- Format: 5 pages; 21 x 28 cm


"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical texts following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Text title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Text.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Text.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Text 1

📂 Processing: Text 10

📂 Processing: Text 11

📂 Processing: Text 12

📂 Processing: Text 13

📂 Processing: Text 14

📂 Processing: Text 15

📂 Processing: Text 16

📂 Processing: Text 17

📂 Processing: Text 18

📂 Processing: Text 19

📂 Processing: Text 2

📂 Processing: Text 20

📂 Processing: Text 21

📂 Processing: Text 22

📂 Processing: Text 23

📂 Processing: Text 24

📂 Processing: Text 25

📂 Processing: Text 26

📂 Processing: Text 27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Text.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Technical Drawing.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Technical Drawing")):
    base_folder = os.path.join(extract_path, "Technical Drawing")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Technical Blueprint of Cotton Storage Facility, 1943
- Creator: Galveston Cotton Exchange (Engineer)
- Subject and Keywords: cotton storage, technical drawings, architecture
- Content Description: A technical drawing showing architectural specifications for a cotton storage warehouse built in Galveston.
- Date: 1943-11-15
- Coverage: United States - Texas - Galveston County
- Format: 1 blueprint; 60 x 90 cm

Example 2:
Metadata:
- Title: Schematic Plan for Industrial Cotton Gin, 1947
- Creator: Texas Agricultural Engineering Department (Engineer)
- Subject and Keywords: cotton gins, schematics, industrial design
- Content Description: A detailed schematic of machinery layout for a modern cotton gin facility.
- Date: 1947-06-01
- Coverage: United States - Texas
- Format: 1 schematic; 55 x 70 cm


"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Technical Drawings following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Technical Drawingtitle, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Technical Drawing.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Technical Drawing.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Technical Drawing 1

📂 Processing: Technical Drawing 2

📂 Processing: Technical Drawing 3

📂 Processing: Technical Drawing 4

📂 Processing: Technical Drawing 5

📂 Processing: Technical Drawing 6

📂 Processing: Technical Drawing 7


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Technical Drawing.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Artwork.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Artwork")):
    base_folder = os.path.join(extract_path, "Artwork")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Painting "Harvest Day at the Plantation", 1935
- Creator: Johnson, Emily S. (Artist)
- Subject and Keywords: harvest scenes, plantation life, agricultural artwork
- Content Description: An oil painting depicting a lively cotton harvest scene on a Texas plantation.
- Date: 1935
- Coverage: United States - Texas
- Format: 1 painting; 60 x 75 cm

Example 2:
Metadata:
- Title: Sketches of Rural Texas Life, circa 1940
- Creator: Unknown (Artist)
- Subject and Keywords: rural texas, sketchbook, everyday life
- Content Description: A collection of pencil sketches illustrating daily rural life and work activities across Texas towns.
- Date: 1940~
- Coverage: United States - Texas
- Format: 12 sketches; 21 x 28 cm each


"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Artworks following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Artwork title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Artwork.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Artwork.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Artwork1

📂 Processing: Artwork10

📂 Processing: Artwork11

📂 Processing: Artwork12

📂 Processing: Artwork13

📂 Processing: Artwork2

📂 Processing: Artwork3

📂 Processing: Artwork4

📂 Processing: Artwork5

📂 Processing: Artwork6

📂 Processing: Artwork7

📂 Processing: Artwork8

📂 Processing: Artwork9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Artwork.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Collection.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Collection")):
    base_folder = os.path.join(extract_path, "Collection")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Kempner Family Cotton Trade Correspondence Collection, 1920–1950
- Creator: Kempner Family (Collector)
- Subject and Keywords: business correspondence, cotton trade, family collections
- Content Description: A curated collection of letters, contracts, and trade documents related to the cotton trade business handled by the Kempner family.
- Date: 1920/1950
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 200 documents; various sizes

Example 2:
Metadata:
- Title: Collection of Galveston Agricultural Reports, 1930–1945
- Creator: Rosenberg Library Archives (Collector)
- Subject and Keywords: agricultural reports, galveston records, archival collections
- Content Description: A collection of annual agricultural reports documenting the cotton, grain, and livestock industries in Galveston County.
- Date: 1930/1945
- Coverage: United States - Texas - Galveston County
- Format: 50 reports; various formats

"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Collection following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Collection title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Collection.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Collection.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Collection 1

📂 Processing: Collection 2

📂 Processing: Collection 3


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Collection.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "journal.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Journal_Newsletter_Magazine")):
    base_folder = os.path.join(extract_path, "Journal_Newsletter_Magazine")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Kempner News Bulletin, Vol. 1, No. 3, March 1947
- Creator: Harris and Eliza Kempner Fund (Publisher)
- Subject and Keywords: philanthropy news, community projects, galveston
- Content Description: A monthly bulletin featuring updates on community donations, scholarship funds, and philanthropic events sponsored by the Kempner family.
- Date: 1947-03-15
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 8 pages; 22 x 28 cm

Example 2:
Metadata:
- Title: Cotton Trade Monthly Review, May 1949
- Creator: Kempner Cotton Exchange (Publisher)
- Subject and Keywords: cotton market updates, business news, agricultural trade
- Content Description: A periodical reviewing monthly cotton market statistics, pricing fluctuations, and trade news relevant to the Galveston business community.
- Date: 1949-05-01
- Coverage: United States - Texas - Galveston
- Format: 12 pages; 21 x 27 cm

Example 3:
Metadata:
- Title: Kempner Family Quarterly Newsletter, Winter 1950
- Creator: Kempner Family Archives (Publisher)
- Subject and Keywords: family history, community events, heritage preservation
- Content Description: A quarterly newsletter highlighting major events, historical notes, and family gatherings related to the Kempner family.
- Date: 1950-12-01
- Coverage: United States - Texas
- Format: 10 pages; 22 x 28 cm

"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Journal, Newsletter, Magazine following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Journal_Newsletter_Magazine title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Journal_Newsletter_Magazine.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Journal_Newsletter_Magazine.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Letter 1

📂 Processing: Letter 10

📂 Processing: Letter 11

📂 Processing: Letter 12

📂 Processing: Letter 13

📂 Processing: Letter 16

📂 Processing: Letter 17

📂 Processing: Letter 18

📂 Processing: Letter 2

📂 Processing: Letter 20

📂 Processing: Letter 21

📂 Processing: Letter 22


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Journal_Newsletter_Magazine.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "postcards.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "postcards")):
    base_folder = os.path.join(extract_path, "postcards")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Postcard of Broadway Street in Galveston, 1925
- Creator: Unknown (Photographer)
- Subject and Keywords: galveston, street scenes, postcards
- Content Description: A colorized postcard showing a bustling view of Broadway Street lined with historic homes and businesses in Galveston, Texas.
- Date: 1925-07-04
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 1 postcard; 14 x 9 cm

Example 2:
Metadata:
- Title: Postcard Depicting Cotton Shipping Docks, 1930
- Creator: Unknown (Photographer)
- Subject and Keywords: cotton docks, harbor scenes, galveston shipping
- Content Description: A vintage postcard featuring ships docked at Galveston's cotton wharves, busy with workers and cargo.
- Date: 1930
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 1 postcard; 14 x 9 cm

Example 3:
Metadata:
- Title: Greeting Postcard from Galveston Beach, 1942
- Creator: Coastal Views Publishing (Photographer)
- Subject and Keywords: beaches, tourism, postcards
- Content Description: A cheerful beach-themed postcard sent as a summer greeting from Galveston Beach.
- Date: 1942-06-10
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 1 postcard; 14 x 9 cm

"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Postcard following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Postcard title, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "postcards.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'postcards.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Postcard1

📂 Processing: Postcard10

📂 Processing: Postcard11

📂 Processing: Postcard12

📂 Processing: Postcard13

📂 Processing: Postcard14

📂 Processing: Postcard15

📂 Processing: Postcard16

📂 Processing: Postcard17

📂 Processing: Postcard18

📂 Processing: Postcard19

📂 Processing: Postcard2

📂 Processing: Postcard20

📂 Processing: Postcard21

📂 Processing: Postcard22

📂 Processing: Postcard23

📂 Processing: Postcard24

📂 Processing: Postcard25

📂 Processing: Postcard26

📂 Processing: Postcard27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'postcards.csv'


In [None]:
import requests
import zipfile
import os
import base64
import pandas as pd
import re
import json
from PIL import Image
import pytesseract
from google.colab import files

# ===============================
# 🔑 ENTER YOUR OPENROUTER CREDENTIALS
# ===============================
OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ").strip()
REFERER = "https://your-site-url.com"  # Optional
TITLE = "UNTL Metadata Generator"       # Optional for ranking


zip_path = "Physical Object.zip"
extract_path = "/content/documents"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# 🛠 Dynamically detect if 'Pamphlet' folder exists inside or not
if os.path.isdir(os.path.join(extract_path, "Physical Object")):
    base_folder = os.path.join(extract_path, "Physical Object")
else:
    base_folder = extract_path

# ===============================
# 🛠️ UTILITY FUNCTIONS
# ===============================
def resize_image(path, max_width=1024):
    img = Image.open(path)
    if img.width > max_width:
        img.thumbnail((max_width, max_width))
        img.save(path, format="JPEG", quality=85)

def extract_text_from_images(image_paths):
    full_text = ""
    for path in image_paths:
        try:
            text = pytesseract.image_to_string(Image.open(path))
            full_text += text + "\n"
        except Exception as e:
            print(f"❌ OCR failed for {path}: {e}")
    return full_text.strip()

def parse_dublin_core_block(text_block):
    field_map = {}
    pattern = r"[-•*]\s*(?:\*\*|__)?([\w\s]+?)(?:\*\*|__)?:\s*(.+)"
    for line in text_block.splitlines():
        match = re.match(pattern, line.strip())
        if match:
            key, value = match.groups()
            field_map[key.strip().lower().replace(" ", "_")] = value.strip()
    return field_map

few_shot_text = """
Example 1:
Metadata:
- Title: Kempner Family Silver Trophy Cup, 1935
- Creator: Unknown (Silversmith)
- Subject and Keywords: trophies, silverware, family memorabilia
- Content Description: A silver trophy cup awarded to the Kempner family for contributions to local agricultural fairs, engraved with names and dates.
- Date: 1935
- Coverage: United States - Texas - Galveston County
- Format: 1 trophy; 30 cm height

Example 2:
Metadata:
- Title: Commemorative Cotton Bale, 1940
- Creator: Texas Cotton Growers Association (Maker)
- Subject and Keywords: cotton samples, agricultural exhibits, memorabilia
- Content Description: A miniature cotton bale produced for display at the Texas State Fair to showcase cotton production techniques.
- Date: 1940
- Coverage: United States - Texas
- Format: 1 object; 10 x 15 x 20 cm

Example 3:
Metadata:
- Title: Wooden Crate Used for Cotton Shipping, 1928
- Creator: Unknown (Craftsman)
- Subject and Keywords: shipping crates, cotton trade, physical objects
- Content Description: A large wooden crate used for transporting cotton bales from Galveston docks to export destinations.
- Date: 1928
- Coverage: United States - Texas - Galveston County - Galveston
- Format: 1 crate; 60 x 80 x 90 cm

"""

system_prompt = """
You are a metadata generator tasked with producing 7 fields for historical Physical Objects following University of North Texas Libraries Metadata Input Guidelines.

FIELD OUTPUT RULES:
- Output only the 7 Dublin Core fields in plain text (no markdown).
- Format each field like: - Title: [value]
- Output the fields in this exact order: Title, Creator, Contributor, Date, Content Description, Subject and Keywords, Coverage

🛠 FORMATTING INSTRUCTIONS:
- Title must include starting from resource type followed by the name sender to receiver followed by date in "Month Day, Year" format (Example: Physical Object, July 2025).
- Creator and Contributor names must be in "Last, First (Role)" style.
- Dates must be ISO format YYYY-MM-DD.
- Subjects must be 3–6 keywords, lowercase unless proper nouns.
- Coverage should be full geographic hierarchy, e.g., "United States - Texas - Galveston County - Galveston" and include coverage range of dates like 2025-01-02/2025-02-03.
- If unknown, write "Not provided."
- No markdown, no extra explanation — only plain text following example structure.
"""

# ===============================
# 🤖 METADATA GENERATOR USING OPENROUTER
# ===============================
def generate_metadata_from_text(extracted_text):
    prompt = few_shot_text + "\n\nHere is the extracted text from the document:\n" + extracted_text + "\n\nNow write all 15 metadata fields."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": REFERER,
        "X-Title": TITLE
    }

    payload = {
        "model": "meta-llama/llama-4-scout:free",
        "messages": messages,
        "temperature": 0.4,
        "max_tokens": 1500
    }

    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ OpenRouter API Error: {e}")
        return "[GPT ERROR]"

# ===============================
# 📦 MAIN LOOP TO PROCESS DOCUMENTS
fields = ["title", "creator", "contributor", "date", "content_description", "subject_and_keywords", "coverage"]

rows = []

doc_folders = sorted([
    os.path.join(base_folder, d)
    for d in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, d))
])[:20]

for folder in doc_folders:
    doc_id = os.path.basename(folder)
    image_files = sorted([
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])[:4]

    if not image_files:
        continue

    print(f"\n📂 Processing: {doc_id}")
    for img in image_files:
        resize_image(img)

    extracted_text = extract_text_from_images(image_files)
    raw_output = generate_metadata_from_text(extracted_text)

    if "[GPT ERROR]" in raw_output:
        row = {f: "" for f in fields}
        row["document_id"] = doc_id
        row["description"] = "[Error occurred]"
        row["raw_output"] = raw_output
        rows.append(row)
        continue

    parsed = parse_dublin_core_block(raw_output)
    row = {f: parsed.get(f, "") for f in fields}
    row["document_id"] = doc_id
    row["raw_output"] = raw_output
    rows.append(row)

# ===============================
# 💾 EXPORT CSV
# ===============================
df = pd.DataFrame(rows)
output_csv = "Physical Object.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

print("\n✅ Metadata generation completed and saved to 'Physical Object.csv'")

Enter your OpenRouter API Key: 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

📂 Processing: Physicalobject1

📂 Processing: Physicalobject10

📂 Processing: Physicalobject11

📂 Processing: Physicalobject2

📂 Processing: Physicalobject3

📂 Processing: Physicalobject4

📂 Processing: Physicalobject5

📂 Processing: Physicalobject6

📂 Processing: Physicalobject7

📂 Processing: Physicalobject8

📂 Processing: Physicalobject9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Metadata generation completed and saved to 'Physical Object.csv'
