In [1]:
from src.WikidataTextifier import WikidataEntity

# Build a WikidataEntity for Q14784328 with external_ids=True and call to_json() on it.
# NOTE(review): the recorded output of this cell is the UnboundLocalError shown
# below ('instanceof' read before assignment) — presumably raised inside
# src.WikidataTextifier; confirm there.
earth = WikidataEntity.from_id("Q14784328", external_ids=True)
earth.to_json()

UnboundLocalError: cannot access local variable 'instanceof' where it is not associated with a value

In [33]:
from __future__ import annotations
from datetime import datetime, timezone, timedelta
from dateutil.relativedelta import relativedelta
from babel import Locale
from babel.dates import format_date, format_time
from babel.units import format_unit
from babel.numbers import format_decimal
import requests
import re
from typing import Optional, Dict, Any, Tuple

# --------------------------- parsing & constants ------------------------------

# Pattern for the "time" string of a time value: a mandatory sign, a year of
# 4 to 16 digits, two-digit month and day (the pattern itself accepts "00"),
# a literal "T", a two-digit H:M:S clock and a trailing literal "Z".
# The named groups are consumed by _parse_iso().
ISO_RX = re.compile(
    r'^(?P<sign>[+-])(?P<year>\d{4,16})-(?P<month>\d{2})-(?P<day>\d{2})T'
    r'(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})Z$'
)

# Size of one step at each precision level (14 = second ... 6 = thousand years).
# wikidata_time_to_text() multiplies the step by the "before"/"after" counts to
# widen a point into a range. In the code shown, precisions 8..6 return before
# this table is consulted, so only entries 14..9 are looked up there.
DELTA_BY_PREC = {
    14: relativedelta(seconds=1),
    13: relativedelta(minutes=1),
    12: relativedelta(hours=1),
    11: relativedelta(days=1),
    10: relativedelta(months=1),
    9:  relativedelta(years=1),
    8:  relativedelta(years=10),
    7:  relativedelta(years=100),
    6:  relativedelta(years=1000),
}

def _parse_iso(iso: str) -> Tuple[int,int,int,int,int,int]:
    """Split a signed timestamp string into its integer components.

    Returns ``(year, month, day, hour, minute, second)``; the year is negated
    when the string starts with ``-``. Raises ``ValueError`` when *iso* does
    not match ``ISO_RX``.
    """
    parsed = ISO_RX.match(iso)
    if parsed is None:
        raise ValueError(f"Bad ISO time: {iso}")

    fields = parsed.groupdict()
    year = int(fields["year"])
    if fields["sign"] == "-":
        year = -year

    # The remaining components are plain two-digit numbers.
    month, day, hour, minute, second = (
        int(fields[part]) for part in ("month", "day", "hour", "minute", "second")
    )
    return (year, month, day, hour, minute, second)

def _locale(lang: str) -> Locale:
    """Resolve a language code to a Babel ``Locale``, never raising.

    ``Locale.parse`` splits on ``_`` by default, so hyphenated codes such as
    ``pt-br`` or ``zh-hans`` would be rejected and silently turned into
    English. The code is therefore normalised to underscores first; if the
    full code is still unknown, the bare language part is tried before
    falling back to ``en``.
    """
    if isinstance(lang, Locale):
        return lang

    normalized = (lang or "").strip().replace("-", "_")
    candidates = [normalized]
    base_language = normalized.split("_", 1)[0]
    if base_language != normalized:
        candidates.append(base_language)

    for candidate in candidates:
        if not candidate:
            continue
        try:
            return Locale.parse(candidate)
        except Exception:
            # Deliberate best-effort: any unparseable or unknown identifier
            # just moves on to the next, less specific candidate.
            continue
    return Locale.parse("en")

def _apply_tz(dt: datetime, minutes: int) -> datetime:
    """Return *dt* tagged with a fixed UTC offset of *minutes*.

    Only ``tzinfo`` is replaced; the date and clock fields are left as they are.
    """
    offset = timedelta(minutes=minutes)
    fixed_zone = timezone(offset)
    return dt.replace(tzinfo=fixed_zone)

def _safe_datetime(year: int, month: int, day: int, H: int, M: int, S: int) -> datetime:
    """Build a naive ``datetime`` from possibly out-of-range components.

    Every field is clamped into the range ``datetime`` accepts instead of
    raising:

    - year: ``abs(year)`` limited to 1..9999 (the sign is dropped, since
      ``datetime`` cannot represent years before 1);
    - month: 1..12 (``00`` placeholders become January);
    - day: 1..last day of the clamped month (``00`` becomes the 1st and
      e.g. February 31 becomes the last day of February);
    - hour/minute/second: 0..23 / 0..59 / 0..59.
    """
    # Local import keeps this helper self-contained.
    from calendar import monthrange

    y = min(max(abs(year), 1), 9999)
    m = min(max(month, 1), 12)
    # monthrange() returns (weekday of the 1st, number of days in the month).
    d = min(max(day, 1), monthrange(y, m)[1])
    hour = min(max(H, 0), 23)
    minute = min(max(M, 0), 59)
    second = min(max(S, 0), 59)
    return datetime(y, m, d, hour, minute, second)

# --------------------------- era conversions ----------------------------------

def _astronomical_to_human_era(year: int, loc: Locale) -> Tuple[int, Optional[str]]:
    """
    Map an astronomical year to ``(displayed year, era label)``.

    Years <= 0 become ``1 - year`` with the era label stored under key 0 of
    the locale's era table (abbreviated names preferred, then wide names,
    then a built-in BC/AD pair). Positive years are returned unchanged with
    no era label:
      0 -> (1, BCE label)
     -1 -> (2, BCE label)
      1 -> (1, None)
    """
    era_tables = getattr(loc, "eras", None) or {}
    era_names = (
        era_tables.get("abbreviated")
        or era_tables.get("wide")
        or {0:"BC",1:"AD"}
    )
    if year > 0:
        return (year, None)
    return (1 - year, era_names.get(0, "BC"))

def _human_from_astro(y: int) -> tuple[int, bool]:
    """Convert an astronomical year to ``(human year, is_bce)``.

    Years <= 0 map to ``1 - y`` flagged as BCE; positive years are returned
    unchanged and flagged as not BCE.
    """
    if y > 0:
        return (y, False)
    return (1 - y, True)

def _astro_from_human(h: int, is_bce: bool) -> int:
    """Inverse of ``_human_from_astro``: BCE year *h* becomes ``1 - h``, others stay as is."""
    if is_bce:
        return 1 - h
    return h

# --------------------------- number formatting --------------------------------

def _format_year_number(n: int, loc: Locale, use_locale_digits_for_year: bool) -> str:
    """Render a year number without thousands grouping.

    With ``use_locale_digits_for_year`` false this is simply ``str(n)``.
    Otherwise the number goes through Babel's ``format_decimal`` with grouping
    switched off.
    NOTE(review): whether Babel emits locale-specific digits here depends on
    its numbering-system default — confirm for the installed version. The
    ``grouping=`` retry targets a different ``format_decimal`` signature;
    verify that keyword exists in the Babel release it is meant for.
    """
    if not use_locale_digits_for_year:
        return str(n)

    try:
        rendered = format_decimal(n, locale=loc, group_separator=False, decimal_quantization=False)
    except TypeError:
        rendered = format_decimal(n, locale=loc, grouping=False, decimal_quantization=False)
    return rendered

# --------------------------- caches & resolvers -------------------------------

# Module-level memo tables for the label resolvers below.
# _CAL_LABEL_CACHE is keyed by (calendarmodel_uri, lang) and _UNIT_LABEL_CACHE by
# (qid, lang). A stored None records a lookup that yielded no label — including
# a failed request — so the same key is not requested again.
_CAL_LABEL_CACHE: Dict[tuple, Optional[str]] = {}
_UNIT_LABEL_CACHE: Dict[tuple, Optional[str]] = {}

# Canonical & composite temporal units (Wikidata QIDs)
# Keys are the internal names accepted by get_temporal_unit_label(); values are
# the entity IDs passed to the label lookup. In the code shown here only "year"
# and the three deep-time composites are actually requested.
UNIT_QIDS = {
    "year": "Q577",
    "decade": "Q39911",
    "century": "Q578",
    "millennium": "Q36507",
    # Deep-time composites:
    "hundred_thousand_years": "Q24004476",
    "million_years": "Q24004475",
    "billion_years": "Q24004466",
}

def wikidata_calendar_label_resolver(calendarmodel_uri: str, lang: str) -> Optional[str]:
    """
    Fetch the label of a calendar model (the QID at the end of the URI) in `lang`.

    - Gregorian (Q1985727) is suppressed: ``None`` is returned without any
      lookup. The QID is compared exactly, so other entities whose ID merely
      ends in the same digits are not suppressed by accident.
    - The request asks for `lang` and, when different, English as well, so
      that the English fallback below can actually find a label.
    - Results (including ``None`` for "no label" or a failed request) are
      memoised in ``_CAL_LABEL_CACHE`` under ``(calendarmodel_uri, lang)``.

    Returns the label text, or ``None`` when no label could be obtained.
    """
    qid = calendarmodel_uri.rsplit("/", 1)[-1]
    if qid == "Q1985727":
        return None

    key = (calendarmodel_uri, lang)
    if key in _CAL_LABEL_CACHE:
        return _CAL_LABEL_CACHE[key]

    # Without "en" in the request the API only returns `lang`, and the English
    # fallback could never trigger.
    requested_languages = lang if lang == "en" else f"{lang}|en"

    label: Optional[str] = None
    try:
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={
                "action": "wbgetentities",
                "format": "json",
                "ids": qid,
                "languages": requested_languages,
                "props": "labels",
                "normalize": 1,
            },
            timeout=5,
        )
        response.raise_for_status()
        labels = response.json().get("entities", {}).get(qid, {}).get("labels", {})
        for code in (lang, "en"):
            if code in labels:
                label = labels[code]["value"]
                break
    except Exception:
        # Deliberate best-effort: a network, HTTP or payload problem must not
        # break date formatting; the caller simply gets no calendar label.
        label = None

    _CAL_LABEL_CACHE[key] = label
    return label

def wikidata_unit_label_resolver(qid: str, lang: str) -> Optional[str]:
    """Look up the label of unit `qid` in `lang`, memoised per ``(qid, lang)``.

    Returns the label in `lang` if the response contains one, otherwise an
    English label if the response contains one, otherwise ``None``. Any error
    during the request or while reading the payload also yields ``None``.
    The outcome, ``None`` included, is stored in ``_UNIT_LABEL_CACHE``.

    NOTE(review): only `lang` is sent in the ``languages`` parameter, so the
    English fallback presumably never finds an "en" entry for other
    languages — confirm against the wbgetentities API. Callers in this file
    treat ``None`` as "use the plain-years wording", so changing this alters
    their output.
    """
    cache_key = (qid, lang)
    if cache_key in _UNIT_LABEL_CACHE:
        return _UNIT_LABEL_CACHE[cache_key]

    found: Optional[str] = None
    try:
        query = {
            "action": "wbgetentities",
            "format": "json",
            "ids": qid,
            "languages": lang,
            "props": "labels",
            "normalize": 1,
        }
        response = requests.get("https://www.wikidata.org/w/api.php", params=query, timeout=5)
        response.raise_for_status()
        payload = response.json()
        available = payload.get("entities", {}).get(qid, {}).get("labels", {})
        # Preferred language first, English second.
        for code in (lang, "en"):
            if code in available:
                found = available[code]["value"]
                break
    except Exception:
        found = None

    _UNIT_LABEL_CACHE[cache_key] = found
    return found

def get_temporal_unit_label(unit_key: str, lang: str) -> Optional[str]:
    """Return the label for one of the ``UNIT_QIDS`` keys in `lang`.

    Unknown keys give ``None``; known keys are delegated to
    ``wikidata_unit_label_resolver``.
    """
    unit_qid = UNIT_QIDS.get(unit_key)
    return wikidata_unit_label_resolver(unit_qid, lang) if unit_qid else None

# --------------------------- range helpers ------------------------------------

def _range_label(loc: Locale, y_start: int, y_end: int, use_locale_digits_for_year: bool) -> str:
    """Format an astronomical year range as ``start–end`` with era labels.

    When both ends share an era the label is written once after the range
    (or not at all if neither end has one); when the eras differ each end
    carries its own label.
    """
    start_year, start_era = _astronomical_to_human_era(y_start, loc)
    end_year, end_era = _astronomical_to_human_era(y_end, loc)
    start_text = _format_year_number(start_year, loc, use_locale_digits_for_year)
    end_text = _format_year_number(end_year, loc, use_locale_digits_for_year)

    if start_era == end_era:
        if start_era:
            return f"{start_text}–{end_text} {start_era}"
        return f"{start_text}–{end_text}"

    # The range crosses the era boundary: label each side that has an era.
    if start_era:
        start_text = f"{start_text} {start_era}"
    if end_era:
        end_text = f"{end_text} {end_era}"
    return f"{start_text}–{end_text}"

def _bucket_bounds_astrological(y: int, precision: int) -> tuple[int, int]:
    """Return the astronomical ``(start, end)`` years of the bucket containing `y`.

    `precision` selects the bucket width: 8 = 10 years, 7 = 100 years, any
    other value = 1000 years. Buckets are aligned on multiples of the width
    in human (era) year numbers, separately for BCE and CE, and the result is
    ordered chronologically.

    Human year numbers start at 1, so the first bucket of either era is
    clamped to begin at year 1. Without the clamp the bucket would include a
    non-existent "year 0" and spill into the neighbouring era (e.g. 5 CE
    would yield a range starting in 1 BCE).
    """
    width = 10 if precision == 8 else 100 if precision == 7 else 1000

    # Astronomical -> human year (astronomical 0 is 1 BCE, -1 is 2 BCE, ...).
    is_bce = y <= 0
    human = 1 - y if is_bce else y

    base = (human // width) * width
    low_h = max(base, 1)
    high_h = base + width - 1

    if is_bce:
        # Larger BCE numbers are earlier, so the high end comes first.
        return 1 - high_h, 1 - low_h
    return low_h, high_h

# --------------------------- deep-time formatter ------------------------------

def _format_geologic(loc: Locale, abs_year: int, lang: str, zh_bce_prefix: bool = False) -> str:
    """
    Format a deep-time year count (used for precision 0..5) with the BCE label.

    1) The first of 1e9 / 1e6 / 1e5 that divides `abs_year` exactly picks a
       composite unit; if a label for it is available the result is
       "<N> <unit> <BCE>". Exactly 100,000 is excluded and always takes the
       plain-years wording.
    2) Otherwise the CLDR "<N years>" phrase for the locale is used, plus BCE.
       If that phrase looks English although the locale is not English, the
       number is formatted separately and combined with the looked-up "year"
       label (or the English word when no label is available).

    With `zh_bce_prefix` set and a Chinese locale the era label is placed
    before the phrase instead of after it.
    """
    era_tables = getattr(loc, "eras", None) or {}
    bce = era_tables.get("abbreviated", {}).get(0, "BC")

    def with_era(phrase: str) -> str:
        # Era goes first only for zh locales when the caller asked for it.
        if zh_bce_prefix and (loc.language or "").startswith("zh"):
            return f"{bce} {phrase}"
        return f"{phrase} {bce}"

    def grouped(number: int) -> str:
        try:
            return format_decimal(number, locale=loc, decimal_quantization=False, group_separator=True)
        except TypeError:
            return format_decimal(number, locale=loc, decimal_quantization=False, grouping=True)

    def plain_years() -> str:
        local_phrase = format_unit(abs_year, "year", locale=str(loc))
        english_phrase = format_unit(abs_year, "year", locale="en")
        non_english = (loc.language or "").lower() != "en"
        looks_english = (local_phrase == english_phrase) or bool(
            re.search(r"\byear(s)?\b", local_phrase, re.I)
        )
        if not (non_english and looks_english):
            return with_era(local_phrase)

        # CLDR had no real translation: assemble number + unit word ourselves.
        wd_year = get_temporal_unit_label("year", lang)
        number_text = grouped(abs_year)
        unit_word = wd_year or ("year" if abs_year == 1 else "years")
        return with_era(f"{number_text} {unit_word}")

    composite_units = (
        (1_000_000_000, "billion_years"),
        (1_000_000,     "million_years"),
        (100_000,       "hundred_thousand_years"),
    )
    for factor, unit_key in composite_units:
        if abs_year % factor != 0:
            continue
        count = abs_year // factor
        if factor == 100_000 and count == 1:
            break  # exactly 100,000 is always written as plain years
        unit_label = get_temporal_unit_label(unit_key, lang)
        if unit_label:
            return with_era(f"{grouped(count)} {unit_label}")
        break  # no label for the matching unit: plain years

    return plain_years()

# --------------------------- point/date formatter -----------------------------

def _format_point(
    loc: Locale,
    dt: datetime,
    precision: int,
    use_locale_digits_for_year: bool
) -> str:
    """Format a single instant at precision 9..14 (and above).

    - 9: the year number only (via ``_format_year_number``)
    - 10: month and year (skeleton "yMMMM", falling back to pattern "MMMM y")
    - 11: the locale's long date
    - 12 and above: the long date followed by a 24-hour time whose pattern
      is "HH:mm:ss" for 14, "HH:mm" for 13 and "HH" otherwise

    Any other precision returns an empty string; ranges (8..6) and deep time
    (5..0) are formatted elsewhere.
    """
    if precision == 9:
        year = max(dt.year, 1)
        return _format_year_number(year, loc, use_locale_digits_for_year)

    if precision == 10:
        try:
            from babel.dates import format_skeleton
            return format_skeleton("yMMMM", dt, locale=loc)
        except Exception:
            return format_date(dt, format="MMMM y", locale=loc)

    if precision == 11:
        return format_date(dt, format="long", locale=loc)

    if precision >= 12:
        time_pattern = {14: "HH:mm:ss", 13: "HH:mm"}.get(precision, "HH")
        date_part = format_date(dt, format="long", locale=loc)
        time_part = format_time(dt, format=time_pattern, locale=loc)
        return f"{date_part} {time_part}"

    # 8..6 handled as ranges; 5..0 handled by _format_geologic
    return ""

# --------------------------- public API ---------------------------------------

def wikidata_time_to_text(
    value: Dict[str, Any],
    lang: str = "en",
    calendar_label_resolver: Optional[callable] = None,
    show_timezone: bool = True,
    use_locale_digits_for_year: bool = True,
    zh_bce_prefix: bool = False,  # set True if you want 公元前 first in zh
) -> str:
    """
    Convert a Wikidata time value into natural language text.

    Args:
        value: dict with a required "time" string and optional "timezone"
            (minutes), "before", "after", "precision" and "calendarmodel".
            A missing or empty precision defaults to 11 (day).
        lang: language code used for the locale and for label lookups.
        calendar_label_resolver: optional ``f(calendarmodel, lang)`` returning
            a calendar label or ``None``; a returned label is appended in
            parentheses.
        show_timezone: append "UTC±HH:MM" when a time is shown and the
            offset is non-zero.
        use_locale_digits_for_year: forwarded to the year formatter.
        zh_bce_prefix: forwarded to the deep-time formatter.

    Behaviour by precision:
    - 0..5: deep time via ``_format_geologic`` on ``abs(year)``; no calendar
      label is added.
    - 6..8: numeric ranges (millennium/century/decade), widened by
      before/after buckets. Calendar label added when the resolver returns one.
    - 9..11: year/month/day; 12..14: date + time, optional UTC offset.
      before/after turn the point into a "start–end" range.

    Raises:
        KeyError: if "time" is missing.
        ValueError: if "time" does not match the expected format.

    NOTE(review): for precision >= 9 the date is built by ``_safe_datetime``,
    which uses the absolute year limited to 1..9999, so BCE years are shown
    without an era label on this path.
    """
    loc = _locale(lang)
    y, m, d, H, M, S = _parse_iso(value["time"])
    tzmin = int(value.get("timezone", 0) or 0)
    before = int(value.get("before", 0) or 0)
    after  = int(value.get("after", 0) or 0)
    # Precision 0 is a legitimate value; a plain `or 11` default would
    # silently turn it into day precision, so only None/"" fall back to 11.
    raw_precision = value.get("precision")
    precision = 11 if raw_precision in (None, "") else int(raw_precision)
    calendarmodel = value.get("calendarmodel", "")

    # Deep time
    if precision <= 5:
        return _format_geologic(loc, abs(y), lang, zh_bce_prefix=zh_bce_prefix)

    # Decade / Century / Millennium
    if precision in (8, 7, 6):
        if before or after:
            # Widen the bucket by whole buckets on either side.
            width = 10 if precision == 8 else 100 if precision == 7 else 1000
            human, is_bce = _human_from_astro(y)
            low_h  = (human // width) * width - after * width
            high_h = (human // width) * width + (width - 1) + before * width
            a = _astro_from_human(high_h if is_bce else low_h, is_bce)
            b = _astro_from_human(low_h if is_bce else high_h, is_bce)
        else:
            a, b = _bucket_bounds_astrological(y, precision)
        label = _range_label(loc, a, b, use_locale_digits_for_year)
        if calendar_label_resolver and calendarmodel:
            cal = calendar_label_resolver(calendarmodel, lang)
            if cal:
                label = f"{label} ({cal})"
        return label

    # Only the point formats need a datetime, so it is built after the
    # branches above have returned; they cannot fail on clamping or offsets.
    base = _apply_tz(_safe_datetime(y, m, d, H, M, S), tzmin)

    # Year / Month / Day / Time
    if before or after:
        delta = DELTA_BY_PREC[precision]
        start = base - (delta * after) if after else base
        end   = base + (delta * before) if before else base
        s = (
            f"{_format_point(loc, start, precision, use_locale_digits_for_year)}–"
            f"{_format_point(loc, end,   precision, use_locale_digits_for_year)}"
        )
    else:
        s = _format_point(loc, base, precision, use_locale_digits_for_year)

    # Timezone if time is shown and offset ≠ 0
    if show_timezone and precision >= 12 and tzmin != 0:
        sign = "+" if tzmin >= 0 else "-"
        hh, mm = divmod(abs(tzmin), 60)
        s = f"{s} UTC{sign}{hh:02d}:{mm:02d}"

    # Calendar label (whatever the resolver returns; None means no label)
    if calendar_label_resolver and calendarmodel:
        cal = calendar_label_resolver(calendarmodel, lang)
        if cal:
            s = f"{s} ({cal})"

    return s


# --------------------------- quick demo ---------------------------------------

if __name__ == "__main__":
    # Flip this to True if you want 公元前 prefixed in Chinese outputs
    ZH_BCE_PREFIX = False

    # The two calendar models used by the samples. Q1985727 is the one the
    # calendar resolver suppresses; Q1985786 is labelled in the output.
    CAL_Q1985727 = "http://www.wikidata.org/entity/Q1985727"
    CAL_Q1985786 = "http://www.wikidata.org/entity/Q1985786"

    def _sample(time_str, precision, calendarmodel, tz_minutes=0):
        """Build one time value dict (key order matters for the printed repr)."""
        return {
            "time": time_str,
            "timezone": tz_minutes,
            "before": 0,
            "after": 0,
            "precision": precision,
            "calendarmodel": calendarmodel,
        }

    samples = [
        # deep time
        _sample("-4540000000-00-00T00:00:00Z", 2, CAL_Q1985786),
        # ordinary cases
        _sample("+2022-11-15T00:00:00Z", 11, CAL_Q1985727),
        _sample("+1000-00-00T00:00:00Z", 7, CAL_Q1985786),
        _sample("+2014-01-01T00:00:00Z", 9, CAL_Q1985727),
        _sample("+2006-07-00T00:00:00Z", 10, CAL_Q1985727),
        _sample("-0500-01-01T00:00:00Z", 8, CAL_Q1985727),
        _sample("+1998-07-21T10:32:20Z", 14, CAL_Q1985727, tz_minutes=120),
        # exact multiples for composites
        _sample("-0100000000-00-00T00:00:00Z", 2, CAL_Q1985786),  # 100 million
        _sample("-0010000000-00-00T00:00:00Z", 2, CAL_Q1985786),  # 10 million
        _sample("-0005000000-00-00T00:00:00Z", 2, CAL_Q1985786),  # 5 million
        _sample("-0000100000-00-00T00:00:00Z", 2, CAL_Q1985786),  # 100k
    ]

    langs = ["en", "de", "fr", "ar", "zh", "ru", "hi", "pt", "ja"]

    # Render every sample in every language.
    for sample in samples:
        print("\n=== SAMPLE ===", sample)
        for code in langs:
            rendered = wikidata_time_to_text(
                sample,
                code,
                calendar_label_resolver=wikidata_calendar_label_resolver,
                use_locale_digits_for_year=True,
                zh_bce_prefix=ZH_BCE_PREFIX,
            )
            print(f"{code} → {rendered}")

    # Show which unit labels were resolved for each language.
    print("\n--- Resolved unit labels snapshot ---")
    snapshot_keys = ("billion_years", "million_years", "hundred_thousand_years", "year")
    for code in langs:
        print(code, {key: get_temporal_unit_label(key, code) for key in snapshot_keys})



=== SAMPLE === {'time': '-4540000000-00-00T00:00:00Z', 'timezone': 0, 'before': 0, 'after': 0, 'precision': 2, 'calendarmodel': 'http://www.wikidata.org/entity/Q1985786'}
en → 4,540 million years BC
de → 4.540.000.000 Jahre v. Chr.
fr → 4 540 million d'années av. J.-C.
ar → 4,540,000,000 سنة ق.م
zh → 4,540 百萬年 公元前
ru → 4 540 миллион лет до н. э.
hi → 4,54,00,00,000 वर्ष ईसा-पूर्व
pt → 4.540.000.000 ano a.C.
ja → 4,540 100万年 紀元前

=== SAMPLE === {'time': '+2022-11-15T00:00:00Z', 'timezone': 0, 'before': 0, 'after': 0, 'precision': 11, 'calendarmodel': 'http://www.wikidata.org/entity/Q1985727'}
en → November 15, 2022
de → 15. November 2022
fr → 15 novembre 2022
ar → 15 نوفمبر 2022
zh → 2022年11月15日
ru → 15 ноября 2022 г.
hi → 15 नवंबर 2022
pt → 15 de novembro de 2022
ja → 2022年11月15日

=== SAMPLE === {'time': '+1000-00-00T00:00:00Z', 'timezone': 0, 'before': 0, 'after': 0, 'precision': 7, 'calendarmodel': 'http://www.wikidata.org/entity/Q1985786'}
en → 1000–1099 (proleptic Julian calendar)