<a href="https://colab.research.google.com/github/olexandr7/numismatic_data_tallinn_city_museum/blob/main/numismatic_data_muis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ---------------------------------------------------------------------------------
# Data dictionary for rows / DataFrame columns
#
# RecID
#   Local museum record identifier.
#   From: <lido:lidoRecID lido:type="local">
#
# Link
#   Human-readable MUIS URL constructed from RecID.
#   Format: f"https://www.muis.ee/museaalview/{RecID}"
#
# Muuseumikogu
#   Museum collection / department (e.g. "numismaatika").
#   From: <lido:classification lido:type="muuseumikogu">/<lido:term>
#
# Number
#   Full inventory number of the object.
#   From: <lido:workID lido:type="museaali täisnumber">
#
# Nimetus
#   Object title / designation (e.g. "killing", "mängumärk").
#   From: <lido:titleSet lido:type="museaali nimetus">/<lido:appellationValue>
#
# Olemus
#   "Nature" / type of the object (usually "münt" in this subset).
#   From: <lido:objectWorkType lido:type="olemus">/<lido:term>
#
# Mintimise_dateering
#   Minting date or date range of the object.
#   Source event: first <lido:event> whose <lido:eventType>/<lido:term>
#     is "valmistamine" or "<valmistamine/tekkimine>".
#   Formatting (via extract_date_range):
#     earliest = .//lido:eventDate/lido:date/lido:earliestDate
#     latest   = .//lido:eventDate/lido:date/lido:latestDate
#     - if both present and equal: "earliest"
#     - if both present and differ: "earliest–latest"
#     - if only one present: that one
#     - else: ""
#
# Kogumise_dateering
#   Date or date range of collecting / finding the object.
#   Source event: first <lido:event> whose <lido:eventType>/<lido:term>
#     is "kogumine/leidmine" or "<kogumistegevus>".
#   Uses the same extract_date_range() logic as Mintimise_dateering.
#
# Arheoloogia_dateering
#   Archaeological date as a single year (e.g. "1975").
#   Source event: first <lido:event> whose <lido:eventType>/<lido:term>
#     is "arheoloogilised kaevamised" or "arheoloogia".
#   Logic (extract_archeology_year):
#     - check earliestDate, then latestDate
#     - search the string with regex r"(\d{4})"
#     - return the first 4-digit year found, or "" if none.
#
# Mintimise_kommentaar
#   Comments specifically tied to the minting event.
#   Only read from the chosen minting event:
#     - iterate its direct <lido:eventDescriptionSet> children in document order
#     - collect all <lido:sourceDescriptiveNote> texts (stripped, lower-cased)
#     - if any label is "sündmuse kommentaar" or
#       "sündmuses osalenud objekti kommentaar":
#         collect all <lido:descriptiveNoteValue> in that block,
#         join them with "; " into a single string, and stop
#         (only the first matching block is used).
#   "" if there is no minting event or no matching block.
#
# Originaal_tüüp
#   Whether the object is treated as "originaal" or "koopia".
#   Default: "originaal".
#   Override: if any <lido:descriptiveNoteValue> in the record
#     contains the substring "koopia" (case-insensitive),
#     then Originaal_tüüp = "koopia".
#
# Kommentaar
#   General object-level comments (not minting-specific, not inscriptions).
#   Source: <lido:objectDescriptionSet> where any
#     <lido:sourceDescriptiveNote label="teksti tüüp"> == "kommentaar".
#   All <lido:descriptiveNoteValue> in those blocks are collected
#   and joined with "; ".
#
# Tekst_objektil
#   Inscriptions / text on the object, optionally with location.
#   Source: <lido:objectDescriptionSet> where
#     <lido:sourceDescriptiveNote label="teksti tüüp"> == "tekst objektil".
#   For each descriptiveNoteValue:
#     - if there is <lido:sourceDescriptiveNote label="asukoht objektil">,
#       format "text (location)";
#     - otherwise just "text".
#   Multiple entries joined with "; ".
#
# Seisund
#   Condition of the object (e.g. "rahuldav", "hea").
#   From: <lido:displayStateEditionWrap>/<lido:displayState>
#
# Tehnika
#   Manufacturing / processing techniques, only the specific terms.
#   Source: <lido:termMaterialsTech lido:type="tehnika">/<lido:term>.
#   For each block:
#     - usually term[0], term[1] are broad hierarchies,
#       term[2].. are actual techniques (e.g. "vermimine").
#     - if len(terms) > 2: keep terms[2:]
#       elif len(terms) > 1: keep terms[1:]
#   Across all blocks, deduplicated while preserving order,
#   then joined with "; ".
#
# Materjal
#   Material(s) of the object, using only the most specific term per block.
#   Source: <lido:termMaterialsTech lido:type="materjal">/<lido:term>.
#   For each block, the last term in the chain is used (e.g. "hõbe", "vask").
#   All such terms are deduplicated (order-preserving) and joined with "; ".
#
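# Mõõdud
#   Measurement of the object.
#   From: <lido:measurementType>, <lido:measurementValue>,
#         <lido:measurementUnit>.
#   Only the first occurrence of each of these elements in the record is read,
#   so at most one measurement is reported per object.
#   Formatting:
#     - if all three exist: "Type Value Unit", e.g. "läbimõõt 20.0 mm"
#     - else: just the measurementValue.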
#
# Riik_maksevahendid
#   Country of USE / PAYMENT context.
#   Extracted specifically from the <lido:event> whose eventType includes "maksevahendid".
#   We read the first non-empty <lido:eventPlace>/<lido:place lido:politicalEntity="riik">
#   /<lido:namePlaceSet>/<lido:appellationValue> inside THAT event (ignoring "[]").
#
# Final filtering rules (applied on the DataFrame):
#   1) Exclude rows where Nimetus (normalized with strip().casefold())
#      is one of:
#         "rinnaleht", "mängumärk", "mängu märk", "medal", "ripats",
#         "žetoon", "zetoon", "vallimärk", "spordiklubi zetoon",
#         "hotelli raha"
#   2) Exclude rows where Materjal (normalized) is one of:
#         "paber", "papp"
# ---------------------------------------------------------------------------------

In [83]:
import xml.etree.ElementTree as ET
import pandas as pd
import re
#downloading files from Colab
from google.colab import files

# Path to the LIDO XML file; a bare file name is enough when the file is
# stored in the Colab working directory.
xml_file = "lido2.xml"
# Prefix -> namespace URI map handed to every find/findall/findtext call,
# so that paths can be written with the "lido:" prefix.
ns = {"lido": "http://www.lido-schema.org"}

# Parse the XML document; `root` is its top-level element.
tree = ET.parse(xml_file)
root = tree.getroot()

In [84]:
# One dict per parsed record is appended here; the list is turned into a DataFrame after the loop.
rows = []

# Helper function
def extract_archeology_year(event, ns):
    """
    Pull a single four-digit year (e.g. '1975') out of an archaeological <lido:event>.

    The event's earliestDate text is inspected first and its latestDate text
    second; the first run of four digits found is returned. Returns '' when
    event is None or when neither date text contains four digits in a row.
    """
    if event is None:
        return ""

    year_pattern = re.compile(r"(\d{4})")

    # Order matters: earliestDate wins over latestDate.
    for date_path in (
        ".//lido:eventDate/lido:date/lido:earliestDate",
        ".//lido:eventDate/lido:date/lido:latestDate",
    ):
        raw_text = event.findtext(date_path, namespaces=ns)
        # Missing element (None) or empty text: nothing to search in.
        found = year_pattern.search(raw_text) if raw_text else None
        if found is not None:
            return found.group(1)

    return ""

# Helper function
def find_event_by_terms(record, ns, wanted_terms):
    """
    Locate the first <lido:event> under `record` that is typed with a wanted term.

    An event qualifies when the stripped text of any of its
    <lido:eventType>/<lido:term> children is contained in `wanted_terms`
    (exact, case-sensitive comparison). Events are examined in document
    order; None is returned when no event qualifies.
    """
    matching_events = (
        event
        for event in record.findall(".//lido:event", ns)
        if any(
            # Terms without text are ignored.
            term.text and term.text.strip() in wanted_terms
            for term in event.findall("./lido:eventType/lido:term", ns)
        )
    )
    return next(matching_events, None)

# Helper function
def extract_date_range(event, ns):
    """
    Return a date or date-range string for a <lido:event>.

    Reads the first earliestDate and latestDate found under the event's
    <lido:eventDate>/<lido:date>. Surrounding whitespace is stripped before
    the values are compared, so "1975" and "1975 " count as the same date
    and whitespace-only text counts as missing.

    Returns:
        - "earliest"          if both are present and equal
        - "earliest–latest"   if both are present and differ
        - the one that exists if only one is present
        - ""                  if event is None or neither date is present
    """
    if event is None:
        return ""

    # findtext returns None for a missing element; normalise to "" before stripping.
    start = (event.findtext(".//lido:eventDate/lido:date/lido:earliestDate", namespaces=ns) or "").strip()
    end = (event.findtext(".//lido:eventDate/lido:date/lido:latestDate", namespaces=ns) or "").strip()

    if start and end:
        return start if start == end else f"{start}–{end}"
    return start or end


# ElementTree addresses the namespaced lido:label attribute as "{namespace-uri}label".
# The name is the same for every record, so it is built once, outside the loop.
label_attr = f"{{{ns['lido']}}}label"

# Build one output row per <lido:lido> record found directly under the root element.
for record in root.findall("lido:lido", ns):
    # --- Single-valued fields: text of the first matching element, None if absent ---
    recid = record.findtext(".//lido:lidoRecID[@lido:type='local']", namespaces=ns)
    muuseumikogu = record.findtext(".//lido:classification[@lido:type='muuseumikogu']/lido:term", namespaces=ns)
    number = record.findtext(".//lido:workID[@lido:type='museaali täisnumber']", namespaces=ns)
    nimetus = record.findtext(".//lido:titleSet[@lido:type='museaali nimetus']/lido:appellationValue", namespaces=ns)
    olemus = record.findtext(".//lido:objectWorkType[@lido:type='olemus']/lido:term", namespaces=ns)
    seisund = record.findtext(".//lido:displayStateEditionWrap/lido:displayState", namespaces=ns)

    # --- Minting date (valmistamine/tekkimine) ---
    minting_event = find_event_by_terms(
        record,
        ns,
        wanted_terms={"valmistamine", "<valmistamine/tekkimine>"}
    )
    minting_date = extract_date_range(minting_event, ns)  # "" if missing

    # --- Collection date (kogumine/leidmine / kogumistegevus) ---
    collect_event = find_event_by_terms(
        record,
        ns,
        wanted_terms={"kogumine/leidmine", "<kogumistegevus>"}
    )
    collection_date = extract_date_range(collect_event, ns)  # "" if missing

    # --- Arheoloogia / arheoloogilised kaevamised date (separate field) ---
    arheo_event = find_event_by_terms(
        record,
        ns,
        wanted_terms={"arheoloogilised kaevamised", "arheoloogia"}
    )
    archeology_date = extract_archeology_year(arheo_event, ns)  # "" if missing

    # --- Mintimise sündmuse kommentaar (minting-related comments inside the minting event) ---
    mintimise_kommentaar = ""

    if minting_event is not None:
        for evds in minting_event.findall("lido:eventDescriptionSet", ns):

            # Collect all labels in this eventDescriptionSet (case-insensitive comparison)
            labels = [
                (s.text or "").strip().lower()
                for s in evds.findall("lido:sourceDescriptiveNote", ns)
            ]

            # Accepted minting comment label variants
            if any(lbl in {
                "sündmuse kommentaar",
                "sündmuses osalenud objekti kommentaar"
            } for lbl in labels):

                # Collect all descriptive values in that block
                vals = [
                    (d.text or "").strip()
                    for d in evds.findall("lido:descriptiveNoteValue", ns)
                    if d.text
                ]

                # Join them if multiple; only the first matching block is used
                mintimise_kommentaar = "; ".join(vals)
                break

    # --- Originaal_tüüp ---
    # Evaluated for EVERY record, independently of the minting event. When this
    # block was nested under `if minting_event is not None:`, a record without a
    # minting event either raised NameError (first record) or silently inherited
    # the previous record's value.
    originaal_tüüp = "originaal"
    for note in record.findall(".//lido:descriptiveNoteValue", ns):
        if "koopia" in (note.text or "").lower():
            originaal_tüüp = "koopia"
            break

    # --- Kommentaar (all object-level kommentaar) & Tekst_objektil (inscriptions) ---
    kommentaarid = []
    tekstid_objektil = []

    for ods in record.findall(".//lido:objectDescriptionSet", ns):
        # There can be multiple descriptiveNoteValue children; sets without any text are skipped.
        desc_values = [d.text for d in ods.findall("lido:descriptiveNoteValue", ns) if d.text]
        if not desc_values:
            continue

        teksti_tuup = None
        asukoht = None

        # The lido:label attribute tells what each sourceDescriptiveNote holds.
        for src in ods.findall("lido:sourceDescriptiveNote", ns):
            label = src.get(label_attr)
            text = (src.text or "").strip()
            if label == "teksti tüüp":
                teksti_tuup = text.lower()
            elif label == "asukoht objektil":
                asukoht = text or None  # empty location is treated as "no location"

        # All kommentaar values go into Kommentaar
        if teksti_tuup == "kommentaar":
            kommentaarid.extend(v.strip() for v in desc_values)

        # Inscriptions on the object, with the location appended when known
        if teksti_tuup == "tekst objektil":
            for v in desc_values:
                if asukoht:
                    tekstid_objektil.append(f"{v.strip()} ({asukoht})")
                else:
                    tekstid_objektil.append(v.strip())

    kommentaar = "; ".join(kommentaarid)
    tekst_objektil = "; ".join(tekstid_objektil)

    # --- Tehnika (skip the leading hierarchy terms) ---
    tehnika_terms = []
    for block in record.findall(".//lido:termMaterialsTech[@lido:type='tehnika']", ns):
        terms = [t.text.strip() for t in block.findall("lido:term", ns) if t.text]
        if len(terms) > 2:
            tehnika_terms.extend(terms[2:])
        elif len(terms) > 1:
            tehnika_terms.extend(terms[1:])
    # dict.fromkeys removes duplicates while keeping first-seen order
    tehnika = "; ".join(dict.fromkeys(tehnika_terms)) if tehnika_terms else None

    # --- Materjal (only the last term of each block) ---
    materjal_terms = []
    for block in record.findall(".//lido:termMaterialsTech[@lido:type='materjal']", ns):
        terms = [t.text.strip() for t in block.findall("lido:term", ns) if t.text]
        if terms:
            materjal_terms.append(terms[-1])
    materjal = "; ".join(dict.fromkeys(materjal_terms)) if materjal_terms else None

    # --- Mõõdud (first type / value / unit found in the record) ---
    mõõdud_type = record.findtext(".//lido:measurementType", namespaces=ns)
    mõõdud_val = record.findtext(".//lido:measurementValue", namespaces=ns)
    mõõdud_unit = record.findtext(".//lido:measurementUnit", namespaces=ns)
    if mõõdud_type and mõõdud_val and mõõdud_unit:
        mõõdud = f"{mõõdud_type} {mõõdud_val} {mõõdud_unit}"
    else:
        mõõdud = mõõdud_val

    # --- Riik: country of USE / payment context (from the "maksevahendid" event) ---
    riik = None

    # Deliberately read from the "maksevahendid" event, not from the minting event.
    maksevahendid_event = find_event_by_terms(
        record,
        ns,
        wanted_terms={"maksevahendid"}
    )

    if maksevahendid_event is not None:
        for el in maksevahendid_event.findall(
            ".//lido:eventPlace/lido:place[@lido:politicalEntity='riik']"
            "/lido:namePlaceSet/lido:appellationValue",
            ns,
        ):
            # Take the first non-empty name, ignoring the "[]" placeholder.
            cand = (el.text or "").strip()
            if cand and cand != "[]":
                riik = cand
                break

    # --- Link ---
    link = f"https://www.muis.ee/museaalview/{recid}" if recid else None

    # --- Output row ---
    rows.append({
        "RecID": recid,
        "Link": link,
        "Muuseumikogu": muuseumikogu,
        "Number": number,
        "Nimetus": nimetus,
        "Olemus": olemus,
        "Mintimise_dateering": minting_date,
        "Kogumise_dateering": collection_date,
        "Arheoloogia_dateering": archeology_date,
        "Mintimise_kommentaar": mintimise_kommentaar,
        "Originaal_tüüp": originaal_tüüp,
        "Kommentaar": kommentaar,
        "Tekst_objektil": tekst_objektil,
        "Seisund": seisund,
        "Tehnika": tehnika,
        "Materjal": materjal,
        "Mõõdud": mõõdud,
        "Riik_maksevahendid": riik
    })

# Create DataFrame
df = pd.DataFrame(rows)


In [85]:
# --- Drop rows for tokens, hotel money, medals, etc. and for paper / cardboard items ---
exclude_nimetus = [
    "rinnaleht",
    "mängumärk",
    "mängu märk",
    "medal",
    "ripats",
    "žetoon",
    "zetoon",
    "vallimärk",
    "spordiklubi zetoon",
    "hotelli raha",
]

exclude_materials = ["paber", "papp"]

# Case-fold both sides so the comparison ignores letter case.
nimetus_blocklist = [name.casefold() for name in exclude_nimetus]
material_blocklist = [material.casefold() for material in exclude_materials]

# Normalise the row values the same way (strip + casefold); a missing
# Materjal is treated as an empty string.
nimetus_normalized = df["Nimetus"].str.strip().str.casefold()
materjal_normalized = df["Materjal"].fillna("").str.strip().str.casefold()

has_excluded_nimetus = nimetus_normalized.isin(nimetus_blocklist)
has_excluded_material = materjal_normalized.isin(material_blocklist)

# Keep a row only when neither exclusion rule applies.
df = df[~(has_excluded_nimetus | has_excluded_material)]

In [86]:
# Save to CSV
# The file name is used for writing, reporting and downloading; defining it
# once keeps the three uses in sync.
output_csv = "numismatic_collection_tlm.csv"

# index=False leaves the DataFrame index out of the file; the utf-8-sig codec
# writes a UTF-8 byte-order mark at the start of the file.
df.to_csv(output_csv, index=False, encoding="utf-8-sig")
print(f"✅ Done! Saved {len(df)} records to '{output_csv}'")
# Download the written file from Colab.
files.download(output_csv)

✅ Done! Saved 8291 records to 'numismatic_collection_tlm.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Print the first row of the filtered DataFrame as a quick sanity check of the parsed fields
print(df.iloc[0])
# To inspect the first 40 rows in full instead, uncomment:
# print(df.head(40).to_string())

In [None]:
# Quick data overview: for every column, print its name followed by its distinct non-null values
for column_name, column_values in df.items():
    distinct_values = column_values.dropna().unique()
    print(f"\n--- {column_name} ---")
    print(distinct_values)