<a href="https://colab.research.google.com/github/pelagios/llm-lod-enriching-heritage/blob/main/notebooks/data_preparation/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare Cultural Heritage Data for Named Entity Recognition

1. Install required software libraries
2. Download text from museum website
3. Preprocess text for named entity recognition
4. Save results

## 1. Install required software libraries

In [1]:
import importlib
import subprocess
import sys

def safe_import(package_name):
    """Import a module by name, installing it with pip first if it is missing.

    Args:
        package_name: Name passed both to ``importlib.import_module`` and to
            ``pip install``.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
        ImportError: If the module still cannot be imported after installation.
    """
    try:
        return importlib.import_module(package_name)
    except ImportError:
        print(f"📦 {package_name} not found. Installing...")
        # Run pip through the current interpreter so the package is installed
        # into the environment this notebook is executing in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        # Make the import system re-scan for the files pip just wrote.
        importlib.invalidate_caches()
        return importlib.import_module(package_name)

# Third-party dependencies, installed on first use if they are missing:
# spaCy is used below for sentence splitting and tokenization,
# langid for language identification.
spacy = safe_import("spacy")
langid = safe_import("langid")

In [3]:
import re, unicodedata, json
from typing import List, Dict, Any, Tuple, Optional
import hashlib

import requests

# Detect whether the notebook runs inside Google Colab. Only a failed import
# is treated as "not in Colab"; any other error is allowed to surface.
try:
    from google.colab import files
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

## 2. Download text from museum website

We use the description of a painting by Claude Monet, retrieved from the Cleveland Museum of Art (CMA) Open Access API.

In [15]:
# Endpoint queried by fetch_cma and the label stored in the metadata of each text.
base_url = "https://openaccess-api.clevelandart.org/api/artworks"
data_source = "CMA"


def fetch_cma(query_string: str) -> List[Dict[str, Any]]:
    """Query the artworks endpoint at ``base_url`` and return the first hit.

    Args:
        query_string: Search term sent as the ``q`` request parameter.

    Returns:
        An empty list when the response contains no artworks; otherwise a
        one-element list holding a dict with the first artwork's ``id`` and
        its stripped ``description`` under the key ``text`` (an empty string
        when the description is missing or null).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        base_url,
        params={"q": query_string, "skip": 0, "limit": 100},
        timeout=30,
    )
    response.raise_for_status()
    # Treat a missing or null "data" field the same as an empty result list.
    artworks = response.json().get("data") or []
    if not artworks:
        return []
    first_hit = artworks[0]
    return [{"id": first_hit.get("id"),
             "text": (first_hit.get("description") or "").strip()}]

In [5]:
# Fetch the example record and show it, or hint at the likely cause when
# nothing came back.
texts = fetch_cma("monet")
if texts:
    print("Text found:", texts[0])
else:
    print("No texts were found. Are you connected to the internet?")

Text found: {'id': 135382, 'text': "This painting depicts Monet's first wife, Camille, outside on a snowy day passing by the French doors of their home at Argenteuil. Her face is rendered in a radically bold Impressionist technique of mere daubs of paint quickly applied, just as the snow and trees are defined by broad, broken strokes of pure white and green."}


## 3. Preprocess text for named entity recognition

1. Remove non-text characters, URLs and email addresses
2. Detect the language of the text
3. Split the text into sentences and tokens

In [6]:
def cleanup_text(text: str) -> str:
    """Normalize *text* and mask URLs and email addresses.

    The text is first normalized to Unicode NFC. Then, in order: control
    characters other than tab, line feed and carriage return are removed;
    runs of spaces, tabs and no-break spaces are collapsed to one space;
    http(s) URLs become ``<URL>``; email addresses become ``<EMAIL>``.
    """
    # (pattern, replacement) pairs, applied one after another in this order.
    substitutions = (
        (r"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]", ""),
        (r"[ \t\u00A0]+", " "),
        (r"https?://\S+", "<URL>"),
        (r"\b[\w\.-]+@[\w\.-]+\.\w+\b", "<EMAIL>"),
    )
    cleaned = unicodedata.normalize("NFC", text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [7]:
def detect_text_language(text: str) -> str:
    """Return the language identifier that langid assigns to *text*.

    Only the first element of the ``langid.classify`` result is returned;
    for the English example in this notebook that is the string ``'en'``.
    """
    return langid.classify(text)[0]

In [18]:
def preprocess_texts(texts: List[Tuple[str, Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Clean, language-tag, sentence-split and tokenize (text, metadata) pairs.

    A ``spacy.blank("xx")`` pipeline with a ``sentencizer`` component is built
    on every call and run over the texts exactly as they were passed in.

    Args:
        texts: Pairs of a text and a metadata dict for that text.

    Returns:
        One dict per input pair with the keys ``text_original``,
        ``text_clean``, ``language_id``, ``sentences``, ``tokens`` and
        ``meta``. ``meta`` is the input metadata extended with
        ``char_count``, ``token_count`` and ``sentence_count``.

    NOTE(review): sentence/token offsets, ``char_count`` and the detected
    language are all based on the original text, not on ``text_clean`` --
    confirm that this is intended.
    """
    pipeline = spacy.blank("xx")
    pipeline.add_pipe("sentencizer")
    raw_texts = [raw for raw, _ in texts]

    records: List[Dict[str, Any]] = []
    for (raw, meta), doc in zip(texts, pipeline.pipe(raw_texts)):
        cleaned = cleanup_text(raw)
        language = detect_text_language(raw)

        # Walk the sentences once, collecting both the sentence records and
        # the token-index -> sentence-id lookup.
        sentences = []
        sentence_of_token = {}
        for sent_id, sent in enumerate(doc.sents):
            sentences.append({"id": sent_id,
                              "start": sent.start_char,
                              "end": sent.end_char,
                              "text": sent.text})
            for tok in sent:
                sentence_of_token[tok.i] = sent_id

        tokens = []
        for tok in doc:
            tokens.append({"id": tok.i,
                           "text": tok.text,
                           "start": tok.idx,
                           "end": tok.idx + len(tok.text),
                           # True when the token is followed by whitespace.
                           "ws": tok.whitespace_ != "",
                           "is_punct": tok.is_punct,
                           "sent_id": sentence_of_token.get(tok.i)})

        records.append({"text_original": raw,
                        "text_clean": cleaned,
                        "language_id": language,
                        "sentences": sentences,
                        "tokens": tokens,
                        "meta": {**meta,
                                 "char_count": len(raw),
                                 "token_count": len(tokens),
                                 "sentence_count": len(sentences)}})
    return records

In [8]:
# Step 3.1: run the cleanup on every fetched text, keeping its id.
cleaned_texts = [
    {"id": record["id"], "text": cleanup_text(record["text"])}
    for record in texts
]
first_cleaned = cleaned_texts[0]
print(first_cleaned)

{'id': 135382, 'text': "This painting depicts Monet's first wife, Camille, outside on a snowy day passing by the French doors of their home at Argenteuil. Her face is rendered in a radically bold Impressionist technique of mere daubs of paint quickly applied, just as the snow and trees are defined by broad, broken strokes of pure white and green."}


In [9]:
# Step 3.2: attach the detected language to every cleaned text.
identified_texts = [
    {
        "id": record["id"],
        "text": record["text"],
        "language_id": detect_text_language(record["text"]),
    }
    for record in cleaned_texts
]
first_identified = identified_texts[0]
print(first_identified)

{'id': 135382, 'text': "This painting depicts Monet's first wife, Camille, outside on a snowy day passing by the French doors of their home at Argenteuil. Her face is rendered in a radically bold Impressionist technique of mere daubs of paint quickly applied, just as the snow and trees are defined by broad, broken strokes of pure white and green.", 'language_id': 'en'}


In [19]:
# Step 3.3: build (text, metadata) pairs and run the full preprocessing.
pairs = [
    (record["text"], {"data source": data_source, "id": record["id"]})
    for record in texts
]
preprocessed_texts = preprocess_texts(pairs)
# Bare expression: the notebook displays the first result.
preprocessed_texts[0]

{'text_original': "This painting depicts Monet's first wife, Camille, outside on a snowy day passing by the French doors of their home at Argenteuil. Her face is rendered in a radically bold Impressionist technique of mere daubs of paint quickly applied, just as the snow and trees are defined by broad, broken strokes of pure white and green.",
 'text_clean': "This painting depicts Monet's first wife, Camille, outside on a snowy day passing by the French doors of their home at Argenteuil. Her face is rendered in a radically bold Impressionist technique of mere daubs of paint quickly applied, just as the snow and trees are defined by broad, broken strokes of pure white and green.",
 'language_id': 'en',
 'sentences': [{'id': 0,
   'start': 0,
   'end': 130,
   'text': "This painting depicts Monet's first wife, Camille, outside on a snowy day passing by the French doors of their home at Argenteuil."},
  {'id': 1,
   'start': 131,
   'end': 324,
   'text': 'Her face is rendered in a radica

## 4. Save results

In [26]:
def save_results(texts):
    """Write *texts* as JSON to ``output_<sha1>.json`` in the working directory.

    The SHA-1 digest of the serialized JSON is part of the file name, so
    different results are written to different files.

    Args:
        texts: JSON-serializable data, e.g. the list returned by
            ``preprocess_texts``.
    """
    json_string = json.dumps(texts, ensure_ascii=False, indent=2)
    digest = hashlib.sha1(json_string.encode("utf-8")).hexdigest()
    output_file_name = f"output_{digest}.json"
    # Write exactly the string that was hashed, so the file name matches the
    # file content. The ``with`` block closes the file.
    with open(output_file_name, "w", encoding="utf-8") as output_file:
        output_file.write(json_string)
    print(f"Saved preprocessed files to {output_file_name}")

In [27]:
save_results(preprocessed_texts)

Saved preprocessed files to output_81ebf3de92885303b69457a8089aee73b1fa3a8d.json


## 5. Put it all together

Call the functions. The code below provides three different options, depending on the types of calls, and it also includes an option that processes local examples.

Each option can be run independently and saves the output in a file within the Colab notebook.

In [None]:
# ---------------------------
# 4) MAIN (robust demos)
# ---------------------------
def save_records(records: List[Dict[str, Any]], source: str = "CMA") -> None:
    """Preprocess *records* and save the result as ``output_<sha1>.json``.

    Args:
        records: Dicts with at least the keys ``text`` and ``id``.
        source: Label stored under ``source`` in each text's metadata.

    Raises:
        ValueError: If *records* is empty.
    """
    if not records:
        raise ValueError("No records found (likely 404 if using the CMA API).")

    pairs = [(r["text"], {"source": source, "id": r["id"]}) for r in records]
    processed_texts = preprocess_texts(pairs)
    json_string = json.dumps(processed_texts, ensure_ascii=False, indent=2)
    # Use a content hash in the file name to prevent overwriting previous work.
    digest = hashlib.sha1(json_string.encode("utf-8")).hexdigest()
    print(f"CMA by id: {json_string}")

    output_file = f"output_{digest}.json"
    # Write exactly the string that was hashed and printed above.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(json_string)
    print(f"Saved preprocessed files to {output_file}")

    # Offer the file as a browser download when running in Colab.
    if IN_COLAB:
        files.download(output_file)

# Preprocess and save the record(s) fetched in step 2 (`texts`).
save_records(texts, source=data_source)

In [None]:
    # OPTION B: CMA search (take a couple of hits, skip empty descriptions)
if __name__ == "__main__":
    try:
        # fetch_cma accepts only the query string and returns at most one
        # record; passing extra keyword arguments raises a TypeError.
        records = fetch_cma("monet")
        # Skip records whose description is empty and keep at most two.
        pairs = [(r["text"], {"source": "CMA", "id": r["id"]}) for r in records if r["text"]]
        pairs = pairs[:2]
        if pairs:
            out_api_search = preprocess_texts(pairs)
            print("\nCMA search:")
            print(json.dumps(out_api_search, ensure_ascii=False, indent=2))
            # Save the results in a local file.
            output_file = "output_api_search.json"
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(out_api_search, f)
            print(f"Saved preprocessed files to {output_file}")
        else:
            print("\nCMA search: no descriptions returned for this query.")
    except Exception as e:
        # Demo cell: report the failure instead of aborting the notebook run.
        print("API search failed:", e)

In [None]:
    # OPTION C: Local examples
if __name__ == "__main__":
    # Two hard-coded example sentences with minimal metadata.
    examples = [
        ("Rome is the capital of Italy.", {"source": "local"}),
        ("Mark Rutte bezocht gisteren Groningen.", {"source": "local"}),
    ]
    out_local = preprocess_texts(examples)
    print("\nLocal examples:")
    print(json.dumps(out_local, ensure_ascii=False, indent=2))

    # Save the results in a local file. This happens inside the guard so that
    # out_local is always defined when it is written.
    output_file = "output_local.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(out_local, f)
    print(f"Results saved in {output_file}")

In [None]:
# OPTION D: Local examples from a txt file (make sure to upload the txt file in the content folder of the Colab Notebook)
if __name__ == "__main__":
    # Open a local .txt file as input. Remember to change the path and
    # filename to your file. The file is read as UTF-8, matching the encoding
    # used for the output files.
    with open("input.txt", "r", encoding="utf-8") as f:
        text = f.read()

    # Then process the text of the input file.
    examples = [(text, {"source": "local"})]
    out_local = preprocess_texts(examples)
    print("\nLocal examples:")
    print(json.dumps(out_local, ensure_ascii=False, indent=2))

    # Save the results in a local file. This happens inside the guard so that
    # out_local is always defined when it is written.
    output_file = "output_file.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(out_local, f)
    print(f"Results saved in {output_file}")

## Save the results in a local .zip file

The output files are only saved locally in the Google Colab notebook, and will be deleted after the notebook is closed.

Two options are available:
1. If you are only interested in one of the files you generated, you can simply download the individual output file.
2. If you ran all options and want to save all output files, you can download all of them as a zip folder.


In [None]:
# Take the output file generated by your chosen option and download it to your machine.
# This cell downloads "output_file.json" (the file written by Option D);
# change the file name to download the output of another option.
# The import only succeeds inside Google Colab.
from google.colab import files
files.download("output_file.json")

In [None]:
# Bundle every output JSON file in the working directory into one .zip
# archive and download it. Matching on the "output_*.json" pattern also picks
# up the hash-named files (output_<sha1>.json) and skips files that were
# never generated.
import glob
import zipfile

from google.colab import files

output_paths = sorted(glob.glob("output_*.json"))
if output_paths:
    with zipfile.ZipFile("output_files.zip", "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for path in output_paths:
            archive.write(path)
    files.download("output_files.zip")
else:
    print("No output files found. Run at least one of the options above first.")