In [1]:
import torch
# Environment check: report whether PyTorch can see a CUDA device.
print("CUDA Available:", torch.cuda.is_available())
# Only query the device name when CUDA is available; otherwise print a placeholder.
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


CUDA Available: True
GPU Name: Tesla T4


In [3]:
import xml.etree.ElementTree as ET
import pandas as pd
import html
import re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

In [5]:
# Load the label table. The cleaning loop later in this cell reads the
# 'profile', 'contact' and 'name' columns of this frame.
df = pd.read_csv("/workspace/discogs_labels_cleaned.csv")

def clean_text(text):
    """
    Clean label-related text while preserving names inside Discogs link formats.

    Processing order: HTML-entity decoding, removal of numeric-ID reference
    tags, unwrapping of named reference tags and [url] tags, removal of
    [b]/[i]/[u] formatting tags, removal of remaining short bracket tags,
    HTML tag stripping, and whitespace normalization.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (e.g. NaN) is
            treated as missing.

    Returns:
        The cleaned string, or '' for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # Decode HTML entities
    text = html.unescape(text)

    # Remove if the value is numeric
    text = re.sub(r'\[a=\d+\]', '', text)
    text = re.sub(r'\[l=\d+\]', '', text)
    text = re.sub(r'\[r=\d+\]', '', text)

    # Keep name if value is non-numeric
    text = re.sub(r'\[a=([^\]]+)\]', r'\1', text)
    text = re.sub(r'\[l=([^\]]+)\]', r'\1', text)
    text = re.sub(r'\[r=([^\]]+)\]', r'\1', text)

    # Handle [url=xxx]TEXT[/url] and the bare [url]TEXT[/url] form.
    # Without the second rule the generic bracket rule below strips "[url]"
    # but leaves a dangling "[/url]" behind.
    text = re.sub(r'\[url=[^\]]+\](.*?)\[/url\]', r'\1', text, flags=re.IGNORECASE)
    text = re.sub(r'\[url\](.*?)\[/url\]', r'\1', text, flags=re.IGNORECASE)

    # Remove formatting tags: [b], [i], [u] and their closing forms.
    # Single-letter tags are too short for the generic rule below ({2,15}),
    # and closing tags contain '/', which that rule does not allow.
    text = re.sub(r'\[/?[biu]\]', '', text, flags=re.IGNORECASE)

    # Remove all other bracket tags
    text = re.sub(r'\[\s*\d[\d\s]*\]', '', text)
    text = re.sub(r'\[[0-9A-Za-z \-]{2,15}\]', '', text)

    # Handle HTML links: only invoke the parser when the text could contain a tag
    if '<' in text and '>' in text:
        soup = BeautifulSoup(text, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply cleaning function to selected columns
# NOTE: each column of `df` is overwritten with its cleaned version, so the
# raw values are no longer available in `df` after this loop.
columns_to_clean = ['profile', 'contact', 'name']
for col in columns_to_clean:
    df[col] = df[col].apply(clean_text)

# Preview cleaned results
print("\n=== Sample cleaned data ===")
print(df[['name', 'profile', 'contact']].head(3))



=== Sample cleaned data ===
                    name                                            profile  \
0               Planet E  Carl Craig's classic techno label founded in 1...   
1  Earthtones Recordings  California deep house label founded by Jamie T...   
2     Seasons Recordings  California deep-house label founded by Jamie T...   

                                             contact  
0  Planet E Communications P.O. Box 27218 Detroit...  
1  Seasons Recordings 2236 Pacific Avenue Suite D...  
2  Seasons Recordings Costa Mesa, CA 92627 Owner ...  


In [3]:
import re
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher

# !python -m spacy download -q en_core_web_sm
# Shared spaCy pipeline; the custom components in later cells are added to this object.
nlp = spacy.load("en_core_web_sm")

# Regex fragments (strings, not compiled) reused by the patterns below and in later cells.
YEAR   = r"(?:19|20)\d{2}"   # four digits starting with 19 or 20
DECADE = r"(?:19|20)\d0s"    # e.g. "1990s": 19/20 + digit + "0s"
YEAR_OR_DECADE = rf"(?:{YEAR}s?|{DECADE})"  # a year with optional trailing "s", or a decade token

# Month mapping
# Lower-case month names and abbreviations -> month number (1-12).
MONTH_MAP = {
    "jan":1,"january":1,"feb":2,"february":2,"mar":3,"march":3,"apr":4,"april":4,
    "may":5,"jun":6,"june":6,"jul":7,"july":7,"aug":8,"august":8,"sep":9,"sept":9,
    "september":9,"oct":10,"october":10,"nov":11,"november":11,"dec":12,"december":12
}

# "<word> <year>": group 1 is any 3-9 letter word (validated against MONTH_MAP by the caller).
_MONTH_YEAR_1 = re.compile(r"\b([A-Za-z]{3,9})\s+((?:19|20)\d{2})\b")
# "<year> <word>": same idea with the order reversed.
_MONTH_YEAR_2 = re.compile(r"\b((?:19|20)\d{2})\s+([A-Za-z]{3,9})\b")
# Numeric "<year>-<mm>", also accepting "/" or "." as the separator; month is 1-12 with optional leading zero.
_MONTH_YEAR_3 = re.compile(r"\b((?:19|20)\d{2})[-/\.](0?[1-9]|1[0-2])\b")

# Standalone decade / year tokens delimited by word boundaries.
_DECADE_ONLY = re.compile(rf"\b{DECADE}\b", flags=re.IGNORECASE)
_YEAR_ONLY   = re.compile(rf"\b{YEAR}\b")

def _try_parse_month_year(text: str):
    """
    Try to parse month-level granularity. Returns (year, month) or (None, None).

    Three layouts are tried in order: "<month name> <year>",
    "<year> <month name>", and numeric "<year>-<mm>" (also with "/" or ".").

    For the two name-based layouts every candidate in the text is inspected,
    not just the first one: the regexes accept any 3-9 letter word next to a
    year, so an unrelated word in front of a year (e.g. "Founded 1999") must
    not hide a later genuine month such as "March 2004".
    """
    # "<word> <year>" — keep the first candidate whose word is a known month.
    for m in _MONTH_YEAR_1.finditer(text):
        mon = m.group(1).lower()
        if mon in MONTH_MAP:
            return int(m.group(2)), MONTH_MAP[mon]

    # "<year> <word>" — same validation with the order reversed.
    for m in _MONTH_YEAR_2.finditer(text):
        mon = m.group(2).lower()
        if mon in MONTH_MAP:
            return int(m.group(1)), MONTH_MAP[mon]

    # Numeric form; the regex itself restricts the month to 1-12.
    m = _MONTH_YEAR_3.search(text)
    if m:
        return int(m.group(1)), int(m.group(2))

    return None, None

def _parse_decade(s: str):
    """
    Locate the first decade token in `s` and return its year bounds.

    Example: '1990s' -> (1990, 1999). Returns (None, None) when `s`
    contains no decade token.
    """
    hit = _DECADE_ONLY.search(s)
    if hit is None:
        return None, None
    # The first four characters of the token are the decade's first year.
    first_year = int(hit.group(0)[:4])
    return first_year, first_year + 9

# Normalize heterogeneous time expressions to a unified dict
def normalize_time(text: str):
    """
    Map a free-text time expression onto a fixed-shape dict.

    Interpretations are tried in this order and the first hit wins:
      1. range, including 'since X' (end_year is None when open-ended)
      2. month-level date (e.g. "March 2004", "2004-03")
      3. decade (e.g. "1990s")
      4. single year
    If nothing is recognized every field is None.

    Returns:
        dict with keys start_year, end_year, start_month, end_month, granularity.
    """
    raw = text or ""

    def _pack(start_year, end_year, start_month, end_month, granularity):
        # Single place that fixes the output schema and key order.
        return {
            "start_year": start_year, "end_year": end_year,
            "start_month": start_month, "end_month": end_month,
            "granularity": granularity
        }

    def _range_bound(token, take_decade_end):
        # A bound may be a decade token ('1990s') or contain a plain year.
        # For decades the start bound uses the first year, the end bound the last.
        if re.fullmatch(DECADE, str(token), flags=re.IGNORECASE):
            first, last = _parse_decade(token)
            return last if take_decade_end else first
        year_hit = re.search(YEAR, token)
        return int(year_hit.group(0)) if year_hit else None

    # 1) Range / since
    range_start, range_end = extract_range_from_text(raw)
    if range_start:
        start_year = _range_bound(range_start, False)
        end_year = _range_bound(range_end, True) if range_end else None
        return _pack(start_year, end_year, None, None, "range")

    # 2) Month-level
    year, month = _try_parse_month_year(raw)
    if year and month:
        return _pack(year, year, month, month, "month")

    # 3) Decade
    decade_start, decade_end = _parse_decade(raw)
    if decade_start:
        return _pack(decade_start, decade_end, None, None, "decade")

    # 4) Single year, taken from the first year/decade token in the text
    found = extract_date_from_text(raw)
    year_hit = re.search(YEAR, found) if found else None
    if year_hit:
        single = int(year_hit.group(0))
        return _pack(single, single, None, None, "year")

    # Nothing found
    return _pack(None, None, None, None, None)

def anchor_strength(sty: int, eny: int, stm: int=None, enm: int=None, granularity: str=None) -> float:
    """
    Heuristic quality score in [0, 1] for a normalized time anchor.

    Scores:
      - no start year            -> 0.0
      - "month" with a month     -> 1.0
      - "year"                   -> 0.9
      - "range" with both ends   -> max(0.75, 0.95 - 0.02 * span), span >= 1
      - "range" without an end   -> 0.8
      - "decade"                 -> 0.7
      - anything else            -> 0.6

    Note that a range spanning one or two years scores above a single year
    (0.93 / 0.91 vs 0.9). `enm` is accepted but not used.
    """
    if sty is None:
        return 0.0

    if granularity == "range":
        if eny is None:
            return 0.8
        # shorter range -> stronger; spans below 1 are treated as 1
        span = max(1, eny - sty)
        return max(0.75, 0.95 - 0.02 * span)

    if granularity == "month" and stm:
        return 1.0
    if granularity == "year":
        return 0.9
    return 0.7 if granularity == "decade" else 0.6


In [4]:
# EntityRuler for label/org terms
# The ruler is inserted before the `ner` component of the shared `nlp` pipeline.
# NOTE(review): this cell adds a pipe by name, so re-running it on a live kernel
# without reloading `nlp` will presumably fail on the duplicate name — confirm.
ruler = nlp.add_pipe("entity_ruler", before="ner")

entity_patterns = []

# Common organization suffixes (Records, Music, Label, Inc., Ltd., GmbH, etc.)
org_suffix_words = [
    "Records","Recordings","Music","Label","Studios","Studio","Company","Co.","Enterprises"
]
legal_suffix_words = ["Inc.","Inc","Ltd.","Ltd","GmbH","S.R.L.","SRL","S.A.","SA","BV","LLC"]

# One pattern per suffix: one or more title-cased tokens followed by a token
# whose text equals the suffix exactly (case-sensitive).
for suf in org_suffix_words + legal_suffix_words:
    entity_patterns.append({
        "label": "ORG",
        "pattern": [{"IS_TITLE": True, "OP": "+"}, {"TEXT": suf}]
    })

# Alias markers (aka / also known as) for later rename extraction
entity_patterns += [
    {"label": "ALIAS_MARK", "pattern": [{"LOWER": {"IN": ["aka","a.k.a.","a.k.a"]}}]},
    {"label": "ALIAS_MARK", "pattern": [{"LOWER": "also"}, {"LOWER": "known"}, {"LOWER": "as"}]},
]

# A few high-precision label keywords (as ORG)
# These tag the keyword tokens themselves ("label", "imprint", "record label", ...) as ORG.
entity_patterns += [
    {"label": "ORG", "pattern": [{"LOWER": {"IN": ["label","imprint"]}}]},
    {"label": "ORG", "pattern": [{"LOWER": {"IN": ["record","records","recordings","music"]}}, {"LOWER": "label"}]}
]

ruler.add_patterns(entity_patterns)


In [5]:
# Event extraction pipeline
# spaCy + rules: Founded / Merged / Discontinued / Rename (+ aka)

# Time extraction
# Ranges: "from YYYY to YYYY" / "YYYY–YYYY" / "YYYY-YYYY"; a literal "xxxx" is
# accepted as an open end. Only the START year is captured (group 1); the end
# is not a capturing group here and is re-extracted by extract_range_from_text.
RANGE_PATTERNS = [
    rf"\bfrom\s+({YEAR})\s*(?:–|-|to)\s*(?:{YEAR}|xxxx)\b",
    rf"\b({YEAR})\s*(?:–|-|to)\s*(?:{YEAR}|xxxx)\b",
]
# "since [the] <year or decade>": group 1 is the year/decade token.
SINCE_PATTERN = rf"\bsince\s+(?:the\s+)?({YEAR_OR_DECADE})\b"

def extract_date_from_text(text: str) -> str:
    """Return the first year or decade token found in `text`, or "" if there is none."""
    first_hit = re.search(rf"\b{YEAR_OR_DECADE}\b", text, flags=re.IGNORECASE)
    if first_hit is None:
        return ""
    return first_hit.group(0)

def extract_range_from_text(text: str):
    """
    Extract a (start, end) pair of time strings from `text`.

    - For the first RANGE_PATTERNS entry that matches, `start` is the captured
      start year and `end` is the year following the separator, or None when
      the end is the literal placeholder "xxxx".
    - Otherwise, for "since X", returns (X, None).
    - Returns (None, None) when neither form is present.
    """
    end_finder = rf"(?:–|-|to)\s*((?:{YEAR})|xxxx)\b"

    for range_pat in RANGE_PATTERNS:
        hit = re.search(range_pat, text, flags=re.IGNORECASE)
        if hit is None:
            continue
        # The range patterns do not capture the end, so look for it in the
        # text that follows the captured start year.
        tail = text[hit.end(1):]
        end_hit = re.search(end_finder, tail, flags=re.IGNORECASE)
        end_value = None
        if end_hit is not None and end_hit.group(1).lower() != "xxxx":
            end_value = end_hit.group(1)
        return hit.group(1), end_value

    # since
    since_hit = re.search(SINCE_PATTERN, text, flags=re.IGNORECASE)
    if since_hit:
        return since_hit.group(1), None
    return None, None

# Rename extraction
# proper nouns/nouns/words with capitalized first letters + several connecting words (of/the/and/records/music/label/recordings)
def extract_proper_name_after(span):
    """
    Collect a name-like run of tokens that follows `span` inside its sentence.

    Args:
        span: A spaCy span (typically a trigger phrase). Only tokens located
            after the span's end, within `span.sent`, are examined. If `span`
            covers the whole sentence there is nothing after it and the
            result is "".

    Returns:
        The collected tokens joined by single spaces, with one leading
        "(" / "[" and one trailing ")" / "]" removed and a trailing
        "[digits]" marker stripped. "" if no name-like token is found.
    """
    sent = span.sent
    tokens = list(sent)
    # Index, relative to the sentence, of the first token after the span.
    start_i = span.end - sent.start

    out = []
    for tok in tokens[start_i:]:
        if tok.is_space:
            continue
        # Punctuation ends the name, except characters that may occur inside names.
        if tok.is_punct and tok.text not in ["&", "-", ".", "'", "/", "’"]:
            break
        # Name-like token: tagged as (proper) noun, or starts with a capital letter.
        if tok.pos_ in ("PROPN", "NOUN") or re.match(r"[A-Z][\w\.\-&'’/]+$", tok.text):
            out.append(tok.text)
            continue
        # Connector words are only accepted once the name has started.
        if out and tok.lower_ in {"of", "the", "and", "records", "music", "label", "recordings"}:
            out.append(tok.text)
            continue
        # Any other token ends a started name; before the name starts it is skipped
        # (this is what lets words like "to" / "as" after the trigger pass).
        if out:
            break

    new_name = " ".join(out).strip()
    new_name = re.sub(r"^[\(\[]|[\)\]]$", "", new_name).strip()
    new_name = re.sub(r"\s*\[\d+\]\s*$", "", new_name).strip()
    return new_name


# Per-document storage for the event dicts produced by the event_extractor component.
Doc.set_extension("events", default=[], force=True)


# Triggers (phrase/lemma)
# Multi-word surface phrases per event type; matched case-insensitively via the
# PhraseMatcher built below (attr="LOWER").
PHRASE_TRIGGERS = {
    "Founded": [
        "set up","came into existence","began operations","first appeared",
        "came about","was set up","got started","founded","established","launched"
    ],
    "Merged": [
        "joined forces with","taken over","bought by","was folded into",
        "consolidated into","combined with","blended into","partnered with",
        "was integrated into","acquired by"
    ],
    "Discontinued": [
        "shut down","ceased operations","no longer active","ceased to exist",
        "ceased label operations","ceased all activity","wrapped up","was made defunct","put to rest"
    ],
    "Rename": [
        "renamed to","name changed to","rebranded as","now known as",
        "changed its name to","adopted the name"
    ],
}

# Single-token triggers per event type; compared against lower-cased token lemmas
# by the event_extractor component.
LEMMA_TRIGGERS = {
    "Founded": ["found","establish","start","create","form","launch","initiate","inaugurate","originate","kickstart","build","born","release"],
    "Merged": ["merge","acquire","absorb","purchase"],
    "Discontinued": ["discontinue","fold","close","retire","disband","sell","defunct"],
    "Rename": ["rename","rebrand","retitle","redesignate"],
}

# Register each phrase list under the upper-cased event type (e.g. "FOUNDED").
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
for evt_type, phrases in PHRASE_TRIGGERS.items():
    patterns = [nlp.make_doc(p) for p in phrases]
    phrase_matcher.add(evt_type.upper(), patterns)

# aka / also known as / alias
# Group 1 is the alias marker, the named group "new" is the alias that follows
# (optionally wrapped in one bracket/parenthesis).
#
# The marker must not be followed directly by a word character unless it ends
# with a dot. A plain trailing \b after an optional dot would reject
# "a.k.a. Name" and "aka. Name", because there is no word boundary between
# "." and the following space.
#
# NOTE: the pattern is compiled with IGNORECASE, so the leading [A-Z] of the
# name also accepts lower-case letters.
AKA_REGEX = re.compile(
    r"\b(a\.?k\.?a(?:\.|\b)|aka\b|also\s+known\s+as\b|alias\b)\s*[:\-]?\s*(?P<new>[\(\[]?[A-Z][\w\.\-&' /]{2,}[\)\]]?)",
    flags=re.IGNORECASE
)

def _dedup(events):
    # Event deduplication
    seen = set()
    out = []
    for e in events:
        key = tuple(sorted(e.items()))
        if key not in seen:
            seen.add(key)
            out.append(e)
    return out

# event_extractor
def _resolve_sentence_date(sent) -> str:
    """
    Pick a time string for a sentence, or "" if none can be found.

    Preference order: first DATE entity in the sentence, then the first
    year/decade token in its text, then the start of a range/since expression.
    """
    # 1) DATE entity
    date = next((ent.text for ent in sent.ents if ent.label_ == "DATE"), "")
    # 2) Text year/decade
    if not date:
        date = extract_date_from_text(sent.text)
    # 3) Range starting point
    if not date:
        range_start, _ = extract_range_from_text(sent.text)
        date = range_start or ""
    return date


@Language.component("event_extractor")
def event_extractor(doc):
    """
    Rule-based event extraction; stores a de-duplicated list on `doc._.events`.

    Passes, in order (event order is preserved in the output):
      1. range/since expressions -> Founded(start) and, if present, Discontinued(end)
      2. PHRASE_TRIGGERS matches
      3. LEMMA_TRIGGERS matches (one event per trigger lemma present in a sentence)
      4. aka / also known as / alias -> Rename with `new_name`
      5. decade fallback -> Founded(decade) if no event already has that decade as time

    Each event is a dict with "event_type", "trigger", "time" and, for renames,
    optionally "new_name". Non-Rename events from passes 2 and 3 are skipped
    when no time can be resolved for their sentence.
    """
    events = []

    # First scan "range/since" → directly output Founded(start) / Discontinued(end)
    for sent in doc.sents:
        start, end = extract_range_from_text(sent.text)
        if start:
            events.append({
                "event_type": "Founded",
                "trigger": f"range:{sent.text}",
                "time": start
            })
            if end:
                events.append({
                    "event_type": "Discontinued",
                    "trigger": f"range:{sent.text}",
                    "time": end
                })

    # phrase matching
    for match_id, start_i, end_i in phrase_matcher(doc):
        span = doc[start_i:end_i]
        # Matcher keys are the upper-cased event types; restore "Founded" etc.
        label = nlp.vocab.strings[match_id].capitalize()

        # Non-Rename events must have time
        date = ""
        if label != "Rename":
            date = _resolve_sentence_date(span.sent)
            if not date:
                continue

        ev = {"event_type": label, "trigger": span.text, "time": date}
        if label == "Rename":
            new_name = extract_proper_name_after(span)
            if new_name:
                ev["new_name"] = new_name
        events.append(ev)

    # lemma matching
    for sent in doc.sents:
        lemmas = [t.lemma_.lower() for t in sent]
        sent_date = None  # resolved lazily, at most once per sentence
        for evt_type, trig_list in LEMMA_TRIGGERS.items():
            for trig in trig_list:
                if trig not in lemmas:
                    continue
                date = ""
                if evt_type != "Rename":
                    if sent_date is None:
                        sent_date = _resolve_sentence_date(sent)
                    date = sent_date
                    if not date:
                        continue
                ev = {"event_type": evt_type, "trigger": trig, "time": date}
                if evt_type == "Rename":
                    # Best effort: a failure to derive the new name must not
                    # drop the Rename event itself.
                    try:
                        idx = lemmas.index(trig)
                        span_like = sent[idx: idx+2]
                        new_name = extract_proper_name_after(span_like)
                        if new_name:
                            ev["new_name"] = new_name
                    except Exception:
                        pass
                events.append(ev)

    # aka / also known as / alias
    for sent in doc.sents:
        for m in AKA_REGEX.finditer(sent.text):
            new_name = m.group("new").strip("()[] ").strip()
            new_name = re.sub(r"\s*\[\d+\]\s*$", "", new_name).strip()
            # Alias events are kept even when no time is found ("" time).
            date = extract_date_from_text(sent.text)
            if not date:
                s, _ = extract_range_from_text(sent.text)
                if s:
                    date = s
            events.append({
                "event_type": "Rename",
                "trigger": m.group(0),
                "time": date,
                "new_name": new_name
            })

    # decade fallback (in/early/late the 1960s etc.)
    for sent in doc.sents:
        m = re.search(rf"\b(in|since|from|around|during|early|late)?\s*(the\s+)?({DECADE})\b", sent.text, flags=re.IGNORECASE)
        if m:
            decade = re.search(DECADE, m.group(0), flags=re.IGNORECASE).group(0)
            # Only add the fallback when no earlier event already carries this decade.
            if not any(decade == (e.get("time") or "") for e in events):
                events.append({
                    "event_type": "Founded",
                    "trigger": m.group(0),
                    "time": decade
                })

    doc._.events = _dedup(events)
    return doc

# Add component to pipeline (after ner)
nlp.add_pipe("event_extractor", after="ner")


<function __main__.event_extractor(doc)>

In [6]:
# Custom Doc attribute that holds the extracted subject/verb/object records
Doc.set_extension("relations", default=[], force=True)

# Pipeline component: one coarse SVO record per sentence
@Language.component("relation_extractor")
def relation_extractor(doc):
    """
    Store a rough subject/verb/object triple for each sentence on `doc._.relations`.

    Within a sentence the LAST token whose dependency label contains "subj",
    the LAST token whose label contains "obj", and the lemma of the LAST
    token tagged VERB are kept. A sentence contributes a record only when
    all three are present.
    """
    found = []
    for sentence in doc.sents:
        subj_text = verb_lemma = obj_text = None
        for tok in sentence:
            dep = tok.dep_
            if "subj" in dep:
                subj_text = tok.text
            if "obj" in dep:
                obj_text = tok.text
            if tok.pos_ == "VERB":
                verb_lemma = tok.lemma_
        if not (subj_text and verb_lemma and obj_text):
            continue
        found.append({"subject": subj_text, "verb": verb_lemma, "object": obj_text, "sentence": sentence.text})
    doc._.relations = found
    return doc

# Place the relation extractor directly after the event extractor
nlp.add_pipe("relation_extractor", after="event_extractor")

<function __main__.relation_extractor(doc)>

In [7]:
# --- SVO cache patched extractor (keeps your original logic untouched) ---
from spacy.language import Language
from spacy.tokens import Doc

Doc.set_extension("_relations_cache", default=[], force=True)

@Language.component("relation_extractor_patched")
def relation_extractor_patched(doc: Doc) -> Doc:
    """
    Cache one subject/verb/object record per sentence on `doc._._relations_cache`.

    Later components (regex matcher / timeline) read this cache to raise the
    confidence of events whose sentence verb agrees with the event type.
    For each sentence the last subject-like token, the last object-like token
    and the lower-cased lemma of the last VERB token are kept; sentences
    missing any of the three are not cached.
    """
    records = []
    for sentence in doc.sents:
        subj_text, verb_lemma, obj_text = None, None, None
        for token in sentence:
            role = token.dep_
            if "subj" in role:
                subj_text = token.text
            if "obj" in role:
                obj_text = token.text
            if token.pos_ == "VERB":
                verb_lemma = token.lemma_.lower()
        if not (subj_text and verb_lemma and obj_text):
            continue
        records.append({
            "sentence": sentence.text,
            "subject": subj_text,
            "verb": verb_lemma,
            "object": obj_text
        })
    doc._._relations_cache = records
    return doc

# Insert into the pipeline directly after event_extractor
nlp.add_pipe("relation_extractor_patched", after="event_extractor")


<function __main__.relation_extractor_patched(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [8]:
# --- Regex relation matcher (high-precision patterns) ---
import re
from spacy.language import Language
from spacy.tokens import Doc

# Per-document storage for the candidates produced by regex_relation_matcher.
Doc.set_extension("regex_relations", default=[], force=True)

# Reuse your YEAR/DECADE helpers and extract_date_from_text/extract_range_from_text if already defined.
# Mini trigger map for event types:
# Event type -> case-insensitive trigger regex. Dict order matters: the matcher
# uses the first event type whose pattern is found in a sentence.
_RE_TRIG = {
    "Founded": re.compile(r"\b(founded|established|set\s+up|formed|created)\b", re.I),
    "Merged": re.compile(r"\b(merged\s+with|acquired|absorbed|took\s+over)\b", re.I),
    "Discontinued": re.compile(r"\b(discontinued|folded|ceased\s+operations|shut\s+down)\b", re.I),
    "Rename": re.compile(r"\b(renamed|rebranded\s+as|changed\s+name\s+to)\b", re.I),
}

@Language.component("regex_relation_matcher")
def regex_relation_matcher(doc: Doc) -> Doc:
    """
    Create extra event candidates from a tiny set of precise regex patterns,
    complementing the rule/phrase triggers.

    For every sentence, the first event type in `_RE_TRIG` whose pattern is
    found yields one candidate dict with "event_type", "trigger", "time"
    (range start if present, else first year/decade, else "") and "sentence".
    Rename candidates additionally carry "new_name": the name-like tokens that
    follow the trigger, "" if none are found, or None if the name helper is
    unavailable. Results are stored on `doc._.regex_relations`.
    """
    out = []
    for sent in doc.sents:
        s = sent.text

        # Keep the match object itself so the trigger text and its offsets
        # come from exactly one search.
        etype, hit = None, None
        for k, pat in _RE_TRIG.items():
            hit = pat.search(s)
            if hit:
                etype = k
                break
        if not etype:
            continue

        # Prefer the start of a range/since expression, else the first year/decade.
        s_start, _ = extract_range_from_text(s)
        t = s_start or extract_date_from_text(s)

        cand = {
            "event_type": etype,
            "trigger": hit.group(0),
            "time": t,
            "sentence": s
        }
        # Try to capture new_name for Rename
        if etype == "Rename":
            new_name = None
            if 'extract_proper_name_after' in globals():
                # The helper collects tokens AFTER the span it is given, so it
                # needs the trigger's token span. Passing the whole sentence
                # leaves nothing after it and always yields "".
                base = sent.start_char
                trig_span = doc.char_span(
                    base + hit.start(), base + hit.end(), alignment_mode="expand"
                )
                if trig_span is not None:
                    new_name = extract_proper_name_after(trig_span)
            cand["new_name"] = new_name

        out.append(cand)

    doc._.regex_relations = out
    return doc

nlp.add_pipe("regex_relation_matcher", after="relation_extractor_patched")


<function __main__.regex_relation_matcher(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [9]:
# Timeline builder with SVO boosting and regex merge
from spacy.language import Language
from spacy.tokens import Doc

# Keep your original normalize_time() and anchor_strength() intact.

# Per-document storage for the merged, sorted timeline built by timeline_builder.
Doc.set_extension("timeline", default=[], force=True)

# Map event types to acceptable verb lemmas for SVO alignment
# (compared against the lower-cased verb lemma cached per sentence).
_EVT_VERBS = {
    "Founded": {"found","establish","set","form","create","launch","start"},
    "Merged": {"merge","acquire","absorb","purchase","take"},
    "Discontinued": {"discontinue","fold","close","shut","cease"},
    "Rename": {"rename","rebrand","retitle"}
}

def _timeline_candidate(ev: dict, bonus: float) -> dict:
    """Normalize one raw event into a timeline candidate; confidence = anchor strength + `bonus`, capped at 1.0."""
    t = ev.get("time", "")
    norm = normalize_time(str(t) if t is not None else "")
    sty, eny = norm["start_year"], norm["end_year"]
    stm, enm = norm["start_month"], norm["end_month"]
    gran = norm["granularity"]
    conf = anchor_strength(sty, eny, stm, enm, gran) + bonus
    return {
        "event_type": ev.get("event_type"),
        "trigger": ev.get("trigger"),
        "new_name": ev.get("new_name"),
        "raw_time": t,
        "start_year": sty, "end_year": eny,
        "start_month": stm, "end_month": enm,
        "granularity": gran,
        "confidence": min(1.0, conf),
        "sentence": ev.get("sentence","")
    }


@Language.component("timeline_builder")
def timeline_builder(doc: Doc) -> Doc:
    """
    Merge events from:
      - doc._.events (the rule/phrase/lemma extractor)
      - doc._.regex_relations (the regex layer)
    Normalize time, compute confidence with anchor_strength, and boost if SVO aligns.

    Confidence: anchor strength, +0.05 for rule events that carry a trigger,
    +0.1 for regex candidates, +0.25 when the cached verb lemma of the event's
    sentence belongs to `_EVT_VERBS[event_type]`; always capped at 1.0.
    For each (event_type, sentence-or-trigger) key only the most confident
    candidate is kept. The result, sorted by start year, start month and
    descending confidence (undated events last), is stored on `doc._.timeline`.
    """
    cands = []

    # From the rule-based events
    if hasattr(doc._, "events") and doc._.events:
        for ev in doc._.events:
            cands.append(_timeline_candidate(ev, 0.05 if ev.get("trigger") else 0.0))

    # From regex layer (slight base bonus for precise regex)
    if hasattr(doc._, "regex_relations") and doc._.regex_relations:
        for ev in doc._.regex_relations:
            cands.append(_timeline_candidate(ev, 0.1))

    # SVO alignment boost (+0.25) if sentence’s verb lemma matches event type verbs
    cache = getattr(doc._, "_relations_cache", [])
    for c in cands:
        sent_text = c.get("sentence") or ""
        if not sent_text and c.get("trigger"):
            # Heuristic: find the sentence containing trigger
            for s in doc.sents:
                if c["trigger"] and c["trigger"] in s.text:
                    sent_text = s.text
                    break
            c["sentence"] = sent_text
        if not sent_text:
            continue
        # find SVO for this sentence (only the first cached record is considered)
        for item in cache:
            if item["sentence"] == sent_text:
                verb = item["verb"]
                if verb in _EVT_VERBS.get(c["event_type"], set()):
                    c["confidence"] = min(1.0, c["confidence"] + 0.25)
                break

    # Conflict resolution: keep best by (event_type, sentence or trigger)
    best = {}
    for c in cands:
        key = (c["event_type"], c.get("sentence") or c.get("trigger"))
        if key not in best or c["confidence"] > best[key]["confidence"]:
            best[key] = c

    tl = list(best.values())

    # Sort by normalized time then confidence
    def _k(x):
        # Undated events sort last; a missing month sorts as December.
        y = x["start_year"] if x["start_year"] is not None else 99999
        m = x["start_month"] if x["start_month"] is not None else 12
        return (y, m, -x["confidence"])
    tl.sort(key=_k)

    doc._.timeline = tl
    return doc

# Re-register after regex matcher
nlp.add_pipe("timeline_builder", after="regex_relation_matcher")


<function __main__.timeline_builder(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [None]:
# -*- coding: utf-8 -*-
# Export a single review file with columns: [name, profile, true_events].
# All comments are in English as requested.

import pandas as pd
import numpy as np
from pathlib import Path

def export_review_minimal(
    input_file: str,
    output_file: str = "review_minimal.xlsx",
    type_col: str = "event_type",
    name_col: str = "name",
    profile_col: str = "profile",
    k_per_class: int = 5,
    seed: int = 42,
    drop_empty_profile: bool = True
) -> pd.DataFrame:
    """
    Load CSV/Excel, sample exactly k_per_class rows per event type (take all if fewer),
    and export a minimal sheet with only [name, profile, true_events].
    The 'true_events' column is initialized as "[]", to be filled manually as a JSON list.

    Args:
        input_file: Source table; read as CSV when the suffix is ".csv", otherwise as Excel.
        output_file: Destination; written as CSV when it ends with ".csv", otherwise as Excel.
        type_col: Column holding the event type used for stratification.
        name_col: Column exported as "name".
        profile_col: Column exported as "profile".
        k_per_class: Maximum number of rows sampled per event type.
        seed: Seed for the generator that derives the per-group sampling seeds.
        drop_empty_profile: Drop rows whose profile is missing (NaN) or blank.

    Returns:
        The exported DataFrame with columns [name, profile, true_events].

    Raises:
        ValueError: If any of type_col / name_col / profile_col is absent.
    """

    # 1) Load source table
    suffix = Path(input_file).suffix.lower()
    if suffix == ".csv":
        df = pd.read_csv(input_file)
    else:
        df = pd.read_excel(input_file)

    # 2) Sanity checks
    for c in [type_col, name_col, profile_col]:
        if c not in df.columns:
            raise ValueError(f"Column '{c}' not found in input data.")

    # 3) Optional: filter out empty profiles
    data = df.copy()
    if drop_empty_profile:
        profiles = data[profile_col]
        # Check NaN explicitly: astype(str) turns missing values into the
        # non-empty string "nan", which would otherwise pass the blank check.
        has_text = profiles.notna() & (profiles.astype(str).str.strip() != "")
        data = data[has_text].copy()

    # 4) Stratified sampling: exactly k per class when possible
    rng = np.random.default_rng(seed)
    parts = []
    for etype, g in data.groupby(type_col, sort=False):
        if len(g) <= k_per_class:
            parts.append(g)
        else:
            parts.append(g.sample(n=k_per_class, random_state=int(rng.integers(0, 1_000_000))))
    if parts:
        sampled = pd.concat(parts, axis=0).reset_index(drop=True)
    else:
        # Nothing left to sample: keep the schema instead of failing in concat.
        sampled = data.iloc[0:0].reset_index(drop=True)

    # 5) Build minimal schema
    out = sampled[[name_col, profile_col]].rename(
        columns={name_col: "name", profile_col: "profile"}
    ).copy()
    out["true_events"] = "[]"  # You will fill JSON array, e.g., [{"time":"1958-01-31"}, {"time":"1974"}]

    # 6) Export a SINGLE file (Excel or CSV based on extension)
    if output_file.lower().endswith(".csv"):
        out.to_csv(output_file, index=False, encoding="utf-8-sig")
    else:
        out.to_excel(output_file, index=False)

    # 7) Brief report
    print(f"Saved → {output_file} | rows={len(out)}")
    return out

# Build the annotation sheet: up to 5 rows per event type from the timeline
# events table, using 'label_name' as the exported name column.
export_review_minimal(
     input_file="/workspace/timeline_events.csv", 
     output_file="/workspace/sample_for_annotation.xlsx",
     type_col="event_type",
     name_col="label_name",
     profile_col="profile",
     k_per_class=5,
     seed=20250818
 )


Saved → /workspace/sample_for_annotation.xlsx | rows=20


Unnamed: 0,name,profile,true_events
0,Prima Immergrün,Series created by German budget label [l=Prima...,[]
1,Archive Of Piano Music (WRC),"24-LP series of historic ""recordings"" from the...",[]
2,Attytude Records,[a776]'s label created in 2019.,[]
3,Fresno Four,USA label created for [a4438775].,[]
4,Ionic Records (2),"1960s rock label from Hollywood, California.",[]
5,Vinyllover Recordings,Label for electronic dance music founded in 20...,[]
6,Ena Music Production,Slovakian Eurodance Label\r\nRenamed to [l=Ena...,[]
7,Huvila Mielenrauha,"A villa in Ivalo, Lapland, Finland where [a=Ve...",[]
8,princefansunited.com,In November 2007 an ephemeral protest campaign...,[]
9,Les Découvertes,"Series of promotional CDs, showcasing newcomer...",[]


In [13]:
import pandas as pd
import json, ast, re, difflib
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

ANNOTATION_FILE = "/workspace/sample_for_annotation_annotated.xlsx"  # Manual annotations
TO_PREDICT_FILE = "/workspace/sample_for_annotation.xlsx"           # Data to predict

# Read manual annotations
gold_df = pd.read_excel(ANNOTATION_FILE)
# Fail early if the annotated sheet lacks a required column.
assert {"name","profile","true_events"}.issubset(gold_df.columns)

# Read data to predict and run pipeline
pred_df = pd.read_excel(TO_PREDICT_FILE)
assert {"name","profile"}.issubset(pred_df.columns)

def run_pipeline(text):
    """Run the shared `nlp` pipeline on `text` and return `doc._.events` as a JSON string.

    Non-string input (e.g. NaN) is processed as the empty string.
    """
    source = text if isinstance(text, str) else ""
    parsed = nlp(source)
    return json.dumps(parsed._.events, ensure_ascii=False)

# Predicted events per profile, stored as JSON strings.
pred_df["predicted_events"] = pred_df["profile"].apply(run_pipeline)

# Merge datasets
# Inner join on 'name': only labels present in both sheets are evaluated.
# NOTE(review): joining on 'name' alone multiplies rows if a name repeats in
# either frame — confirm names are unique.
annotated = gold_df[["name","profile","true_events"]].merge(
    pred_df[["name","predicted_events"]], on="name", how="inner"
)

# Fill NaN values
# Missing cells become the string "[]" so they parse as an empty event list.
annotated["true_events"] = annotated["true_events"].fillna("[]")
annotated["predicted_events"] = annotated["predicted_events"].fillna("[]")

print(f"annotated data prepared, total{len(annotated)} records")
print(annotated.head(2))

SIM_TRIG = 0.6 # Similarity threshold for trigger matching
ALLOW_EMPTY_TIME_MATCH = True # Allow events to match even if one has empty time

# Normalize event data to a consistent format.
def normalize_event_list(event_data):
    """
    Coerce an annotation / prediction cell into a list of event dicts.

    Accepted inputs:
      - a string holding a JSON list (the documented annotation format and
        the output of `json.dumps`) or a Python literal list;
      - a list of event-type strings, expanded to dicts with empty
        "time" and "trigger";
      - a list of dicts, returned as is.
    Anything else (unparsable text, mixed lists, non-list values) yields [].
    """
    if isinstance(event_data, str):
        try:
            # JSON first: literal_eval rejects valid JSON tokens such as
            # null / true / false, which would silently drop the whole cell.
            event_data = json.loads(event_data)
        except ValueError:
            try:
                # Fallback for Python-literal style, e.g. single-quoted strings.
                event_data = ast.literal_eval(event_data)
            except Exception:
                return []
    if isinstance(event_data, list):
        if all(isinstance(e, str) for e in event_data):
            return [{"event_type": e, "time": "", "trigger": ""} for e in event_data]
        elif all(isinstance(e, dict) for e in event_data):
            return event_data
    return []

def fuzzy_match_trigger(t_pred, t_true, thr=SIM_TRIG):
    """Return True when both triggers are non-empty and their case-insensitive
    SequenceMatcher ratio is at least `thr`."""
    if t_pred and t_true:
        left = str(t_pred).lower()
        right = str(t_true).lower()
        similarity = difflib.SequenceMatcher(None, left, right).ratio()
        return similarity >= thr
    return False

def norm_time(t):
    """
    Reduce a time value to a (kind, value) pair for loose comparison.

    Returns:
        ("", "")               for empty input
        ("year", "YYYY")       for the first 19xx/20xx year found
        ("decade", "YYY0s")    for a decade such as "1990s"; a two-digit
                               decade "50s".."90s" is expanded to "19x0s"
        ("raw", stripped text) otherwise
    """
    if not t:
        return ("", "")
    t = str(t)
    m_year = re.search(r'\b(19|20)\d{2}\b', t)
    if m_year:
        return ("year", m_year.group(0))
    m_dec1 = re.search(r'\b(19|20)\d0s\b', t)
    if m_dec1:
        return ("decade", m_dec1.group(0))
    m_dec2 = re.search(r'\b([5-9]0)s\b', t)
    if m_dec2:
        # Expand "90s" to "1990s" (the pattern only admits 50s-90s). The short
        # form cannot be compared with a year by prefix ("90s" never starts
        # with "199") and would never equal the same decade written in full.
        return ("decade", "19" + m_dec2.group(0))
    return ("raw", t.strip())

def loose_time_equal(a, b, allow_empty=ALLOW_EMPTY_TIME_MATCH):
    """
    Tolerant comparison of two time values.

    - both empty -> True; exactly one empty -> `allow_empty`
    - identical normalized (kind, value) -> True
    - a year against a decade -> True when the decade text starts with the
      first three digits of the year
    - two raw strings -> True when their case-insensitive similarity is >= 0.75
    - otherwise False
    """
    a_missing, b_missing = not a, not b
    if a_missing or b_missing:
        return True if (a_missing and b_missing) else allow_empty

    kind_a, val_a = norm_time(a)
    kind_b, val_b = norm_time(b)
    if kind_a == kind_b and val_a == val_b:
        return True

    decade_kinds = ("decade", "decade_short")
    if kind_a == "year" and kind_b in decade_kinds:
        return val_b.startswith(val_a[:3])
    if kind_b == "year" and kind_a in decade_kinds:
        return val_a.startswith(val_b[:3])

    if kind_a == "raw" and kind_b == "raw":
        similarity = difflib.SequenceMatcher(None, val_a.lower(), val_b.lower()).ratio()
        return similarity >= 0.75
    return False


def loose_event_equal(pred_e: dict, true_e: dict):
    """Decide whether a predicted event matches a true event under loose rules.

    The event types must be identical, the times must agree according to
    ``loose_time_equal``, and - only when both events carry a trigger - the
    triggers must pass ``fuzzy_match_trigger``.
    """
    same_type = pred_e.get("event_type") == true_e.get("event_type")
    if not same_type:
        return False

    same_time = loose_time_equal(pred_e.get("time", ""), true_e.get("time", ""))
    if not same_time:
        return False

    pred_trigger = pred_e.get("trigger", "")
    true_trigger = true_e.get("trigger", "")
    # A missing trigger on either side is not held against the pair.
    triggers_conflict = bool(pred_trigger and true_trigger) and not fuzzy_match_trigger(pred_trigger, true_trigger)
    return not triggers_conflict


def collapse_to_types_with_fuzzy_time_trigger(events):
    """Return the ``event_type`` of every dict in ``events``, in order.

    Non-dict entries are skipped; a dict without ``event_type`` contributes
    an empty string.
    """
    event_types = []
    for event in events:
        if isinstance(event, dict):
            event_types.append(event.get("event_type", ""))
    return event_types

def match_row(pred_events, true_events):
    """Greedily pair true events with predicted events for one row.

    Each true event takes the first not-yet-used prediction that satisfies
    ``loose_event_equal``.

    Returns:
        tuple: ``(hits, miss, spurious)`` where ``hits`` is a list of
        ``(pred, true)`` pairs, ``miss`` the unmatched true events and
        ``spurious`` the predictions that were never paired.
    """
    claimed = set()
    hits, miss = [], []

    for truth in true_events:
        first_match = next(
            (
                (idx, pred)
                for idx, pred in enumerate(pred_events)
                if idx not in claimed and loose_event_equal(pred, truth)
            ),
            None,
        )
        if first_match is None:
            miss.append(truth)
            continue
        idx, pred = first_match
        claimed.add(idx)
        hits.append((pred, truth))

    spurious = [pred for idx, pred in enumerate(pred_events) if idx not in claimed]
    return hits, miss, spurious

# Construct sets and calculate metrics
# For every annotated row this builds (a) the set of true event types,
# (b) the set of predicted event types and (c) the event-level match result.
# All names assigned here are notebook globals.
true_sets, pred_sets, row_matches = [], [], []
for i, row in annotated.iterrows():
    # Both columns may hold serialized lists; normalize_event_list parses them.
    true_events = normalize_event_list(row.get("true_events", []))
    pred_events = normalize_event_list(row.get("predicted_events", []))
    # Despite the helper's name, only the event_type values are kept here.
    true_types = collapse_to_types_with_fuzzy_time_trigger(true_events)
    pred_types = collapse_to_types_with_fuzzy_time_trigger(pred_events)
    # Sets: a type counts once per row no matter how often it occurs.
    true_sets.append(set(true_types))
    pred_sets.append(set(pred_types))
    hits, miss, spurious = match_row(pred_events, true_events)
    row_matches.append({
        # Falls back to the row index when there is no "name" column.
        "name": row.get("name", i),
        "profile": row.get("profile", ""),
        "hits": hits, "miss": miss, "spurious": spurious
    })

# Output statistics
# Frequencies are per row (each type counted at most once per row).
print("True Events freq:", Counter([t for s in true_sets for t in s]))
print("Pred Events freq:", Counter([t for s in pred_sets for t in s]))

# Fit on the union of both sides so types that only appear in predictions
# (or only in the truth) still get a column.
# NOTE(review): MultiLabelBinarizer / classification_report / accuracy_score
# are presumably the scikit-learn ones - imports are outside this cell.
mlb = MultiLabelBinarizer()
mlb.fit(true_sets + pred_sets)
y_true_bin = mlb.transform(true_sets)
y_pred_bin = mlb.transform(pred_sets)

print("\nPer-class metrics:")
print(classification_report(y_true_bin, y_pred_bin, target_names=mlb.classes_, zero_division=0))
# NOTE(review): with multilabel indicator input this is presumably row-level
# exact-match (subset) accuracy - confirm against the library docs.
print("Overall accuracy", accuracy_score(y_true_bin, y_pred_bin))

annotated data prepared, total20 records
                           name  \
0               Prima Immergrün   
1  Archive Of Piano Music (WRC)   

                                             profile  \
0  Series created by German budget label [l=Prima...   
1  24-LP series of historic "recordings" from the...   

                                         true_events  \
0  [{"event_type": "Founded", "time": "", "trigge...   
1  [{"event_type": "Founded", "time": "early 20th...   

                                    predicted_events  
0                                                 []  
1  [{"event_type": "Founded", "trigger": "form", ...  
True Events freq: Counter({'Founded': 15, 'Discontinued': 5, 'Merged': 5, 'Rename': 4})
Pred Events freq: Counter({'Founded': 11, 'Rename': 6, 'Discontinued': 6, 'Merged': 5})

Per-class metrics:
              precision    recall  f1-score   support

Discontinued       0.67      0.80      0.73         5
     Founded       1.00      0.73      0.85  

In [16]:
# Enhanced year extraction
def _norm_year(x):
    try:
        if not x:
            return None
        text = str(x).strip().lower()
        if not text:
            return None

        # Extract 4-digit years
        year_match = re.search(r'\b(1[8-9]\d{2}|20[0-5]\d)\b', text)
        if year_match:
            year = int(year_match.group(0))
            if 1800 <= year <= 2050:
                return year

        # Complex time expression handling
        if 'early' in text and ('20th century' in text or 'twentieth century' in text):
            return 1910  # early 20th century -> 1910
        elif 'late' in text and ('20th century' in text or 'twentieth century' in text):
            return 1990  # late 20th century -> 1990
        elif '20th century' in text or 'twentieth century' in text:
            return 1950  # 20th century -> 1950

        # Standard decades (1990s -> 1995)
        decade_match = re.search(r'\b(1[8-9]\d|20[0-4]\d)0s\b', text)
        if decade_match:
            base_year = int(decade_match.group(0)[:4])
            return base_year + 5  # decade midpoint

        # Short decades (80s -> 1985)
        short_decade = re.search(r'\b([5-9]0)s\b', text)
        if short_decade:
            short_num = int(short_decade.group(1))
            if short_num >= 50:
                return 1900 + short_num + 5
            else:
                return 2000 + short_num + 5

        return None
    except Exception:
        return None


# Time accuracy calculation
def time_accuracy_per_row(true_events, pred_events):
    """Score how well the predicted years of one row match the true years.

    Every dict event is reduced to a year with ``_norm_year`` (None when no
    year can be derived). Each true year then greedily claims the unused
    predicted year with the highest closeness score.

    Returns:
        tuple: ``(strict, lenient)`` fractions of true events. ``strict``
        counts only top scores of 1.0 (identical year, or both None);
        ``lenient`` also counts pairs within two years or in the same decade.
        Two empty rows give ``(1.0, 1.0)``; one empty side gives ``(0.0, 0.0)``.
    """
    def _closeness(true_year, pred_year):
        # 1.0 identical / both missing, 0.9 within 1y, 0.8 within 2y,
        # 0.7 same decade, 0.0 otherwise (including one side missing).
        if true_year is None and pred_year is None:
            return 1.0
        if true_year is None or pred_year is None:
            return 0.0
        gap = abs(true_year - pred_year)
        if gap == 0:
            return 1.0
        if gap <= 1:
            return 0.9
        if gap <= 2:
            return 0.8
        if true_year // 10 == pred_year // 10:
            return 0.7
        return 0.0

    true_years = [_norm_year(ev.get("time", "")) for ev in true_events if isinstance(ev, dict)]
    pred_years = [_norm_year(ev.get("time", "")) for ev in pred_events if isinstance(ev, dict)]

    if not true_years:
        return (1.0, 1.0) if not pred_years else (0.0, 0.0)
    if not pred_years:
        return 0.0, 0.0

    taken = set()
    strict_hits = 0
    lenient_hits = 0

    for true_year in true_years:
        scored = [
            (_closeness(true_year, pred_year), idx)
            for idx, pred_year in enumerate(pred_years)
            if idx not in taken
        ]
        if not scored:
            continue
        # max() keeps the earliest candidate among equal scores.
        top_score, top_idx = max(scored, key=lambda pair: pair[0])
        if top_score <= 0:
            continue
        taken.add(top_idx)
        if top_score >= 0.95:
            strict_hits += 1
        if top_score >= 0.7:
            lenient_hits += 1

    total = len(true_years)
    if total > 0:
        return strict_hits / total, lenient_hits / total
    return 0, 0


# Event equality check
def enhanced_event_equal(pred_e: dict, true_e: dict):
    """Year-aware variant of the event match test.

    Requirements, checked in order:
      * identical ``event_type``;
      * if both times resolve to a year via ``_norm_year``, the years are at
        most two apart or share a decade (a missing year on either side is
        accepted);
      * if both events have a trigger, ``fuzzy_match_trigger`` accepts them
        at a threshold of 0.4.
    """
    if pred_e.get("event_type") != true_e.get("event_type"):
        return False

    pred_year = _norm_year(pred_e.get("time", ""))
    true_year = _norm_year(true_e.get("time", ""))

    # Only comparable when both sides produced a year; otherwise be lenient.
    if pred_year is not None and true_year is not None:
        within_two_years = abs(pred_year - true_year) <= 2
        same_decade = pred_year // 10 == true_year // 10
        if not (within_two_years or same_decade):
            return False

    pred_trigger = pred_e.get("trigger", "")
    true_trigger = true_e.get("trigger", "")
    triggers_conflict = bool(pred_trigger and true_trigger) and not fuzzy_match_trigger(
        pred_trigger, true_trigger, thr=0.4
    )
    return not triggers_conflict


# Match row
def enhanced_match_row(pred_events, true_events):
    """Greedy one-to-one pairing of a row's events using ``enhanced_event_equal``.

    Returns:
        tuple: ``(hits, miss, spurious)`` - matched ``(pred, true)`` pairs,
        true events without a partner, and predictions left unpaired.
    """
    consumed = set()
    hits, miss = [], []

    for truth in true_events:
        for idx, candidate in enumerate(pred_events):
            if idx not in consumed and enhanced_event_equal(candidate, truth):
                hits.append((candidate, truth))
                consumed.add(idx)
                break
        else:
            # Loop finished without a break: nothing matched this true event.
            miss.append(truth)

    spurious = [candidate for idx, candidate in enumerate(pred_events) if idx not in consumed]
    return hits, miss, spurious

# Evaluation Execution
# Single pass over the annotated rows: event matching, per-row time accuracy,
# event totals and year-extraction coverage are all collected together so each
# row is parsed once.
print("\n" + "=" * 60)
print("Time Evaluation Results")
print("=" * 60)

row_matches = []
strict_list = []
lenient_list = []
# Not filled in this cell; kept so the names still exist for other cells.
order_true_list = []
order_pred_list = []

total_true_events = 0
total_pred_events = 0
all_true_times = []
all_pred_times = []

for i, row in annotated.iterrows():
    te = normalize_event_list(row.get("true_events", []))
    pe = normalize_event_list(row.get("predicted_events", []))
    total_true_events += len(te)
    total_pred_events += len(pe)

    # Event matching
    hits, miss, spurious = enhanced_match_row(pe, te)
    row_matches.append({
        "name": row.get("name", i),
        "hits": hits,
        "miss": miss,
        "spurious": spurious
    })

    # Time accuracy
    s, l = time_accuracy_per_row(te, pe)
    strict_list.append(s)
    lenient_list.append(l)

    # Coverage analysis inputs: one entry per event, None when no year was found
    all_true_times.extend([_norm_year(e.get("time", "")) for e in te if isinstance(e, dict)])
    all_pred_times.extend([_norm_year(e.get("time", "")) for e in pe if isinstance(e, dict)])

# Calculate overall metrics
total_hits = sum(len(match["hits"]) for match in row_matches)
total_miss = sum(len(match["miss"]) for match in row_matches)
total_spurious = sum(len(match["spurious"]) for match in row_matches)

# Guarded like the metrics below so an empty annotation set reports 0
# instead of raising ZeroDivisionError.
pred_truth_ratio = total_pred_events / total_true_events if total_true_events > 0 else 0

print("Basic Statistics:")
print(f" True events total: {total_true_events}")
print(f" Predicted events total: {total_pred_events}")
print(f" Prediction/truth ratio: {pred_truth_ratio:.2f}")

# Key enhancement: accuracy metrics based on true event count
# (adjusted accuracy is computed with the same formula as recall)
adjusted_accuracy = total_hits / total_true_events if total_true_events > 0 else 0
traditional_precision = total_hits / total_pred_events if total_pred_events > 0 else 0
traditional_recall = total_hits / total_true_events if total_true_events > 0 else 0
traditional_f1 = (2 * traditional_precision * traditional_recall /
                  (traditional_precision + traditional_recall)
                  if (traditional_precision + traditional_recall) > 0 else 0)

print("\nKey Performance Indicators:")
print(f" Traditional precision: {traditional_precision:.3f}")
print(f" Traditional recall: {traditional_recall:.3f}")
print(f" Traditional F1 score: {traditional_f1:.3f}")
print(f" Adjusted accuracy: {adjusted_accuracy:.3f}")

print("\nTime Accuracy:")
# Averages are per row (macro), not per event.
avg_strict_time = sum(strict_list) / len(strict_list) if strict_list else 0
avg_lenient_time = sum(lenient_list) / len(lenient_list) if lenient_list else 0
print(f" Strict time matching (exact year): {avg_strict_time:.3f}")
print(f" Lenient time matching (±2 years/same decade): {avg_lenient_time:.3f}")

# Coverage analysis
true_with_year = sum(1 for t in all_true_times if t is not None)
pred_with_year = sum(1 for t in all_pred_times if t is not None)
true_year_pct = true_with_year / len(all_true_times) * 100 if all_true_times else 0
pred_year_pct = pred_with_year / len(all_pred_times) * 100 if all_pred_times else 0

print("\nTime Extraction Coverage:")
print(f" True events with years: {true_with_year}/{len(all_true_times)} "
      f"({true_year_pct:.1f}%)")
print(f" Predicted events with years: {pred_with_year}/{len(all_pred_times)} "
      f"({pred_year_pct:.1f}%)")


Time Evaluation Results
Basic Statistics:
 True events total: 44
 Predicted events total: 73
 Prediction/truth ratio: 1.66

Key Performance Indicators:
 Traditional precision: 0.425
 Traditional recall: 0.705
 Traditional F1 score: 0.530
 Adjusted accuracy: 0.705

Time Accuracy:
 Strict time matching (exact year): 0.609
 Lenient time matching (±2 years/same decade): 0.622

Time Extraction Coverage:
 True events with years: 34/44 (77.3%)
 Predicted events with years: 59/73 (80.8%)


In [25]:
# Single-output exporter for Discogs label timelines

import os
import json
import pandas as pd

# Config
# Paths and column names used by the export cell below.
# NOTE(review): TEXT_COL is used further down but is not assigned in this
# cell - it presumably comes from an earlier cell; confirm before running on
# a fresh kernel.
# Source table, read with pandas.read_csv below.
INPUT_CSV  = "/workspace/discogs_labels_cleaned.csv"  
# Destination of the single long-format events table.
OUTPUT_CSV  = "/workspace/timeline_events.csv"
# Input column copied to "label_name" in the output.
NAME_COL    = "name"

# Helpers
def _run_pipeline(text: str):
    """Process one text with the notebook's ``nlp`` pipeline and collect its outputs.

    Non-string input is processed as an empty string. The returned dict has
    the keys ``timeline``, ``predicted_events`` and ``regex_relations``, read
    from the ``timeline``, ``events`` and ``regex_relations`` attributes of
    ``doc._``; an attribute that is not available yields an empty list.
    """
    safe_text = text if isinstance(text, str) else ""
    doc = nlp(safe_text)
    extensions = doc._

    # (output key, attribute on doc._)
    wanted = (
        ("timeline", "timeline"),
        ("predicted_events", "events"),
        ("regex_relations", "regex_relations"),
    )
    collected = {}
    for out_key, attr_name in wanted:
        collected[out_key] = getattr(extensions, attr_name, [])
    return collected

def _safe_json_loads(x):
    try:
        return json.loads(x) if isinstance(x, str) else x
    except Exception:
        return []

def _ensure_columns(df: pd.DataFrame, cols):
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}. Current columns: {list(df.columns)}")

# Load data
if os.path.exists(INPUT_CSV):
    df = pd.read_csv(INPUT_CSV)
else:
    raise FileNotFoundError(f"Input file not found: {INPUT_CSV}")

_ensure_columns(df, [NAME_COL, TEXT_COL])

# Run NLP once per row
# Building the frame from the list of result dicts avoids creating one
# pandas Series per row, which .apply(pd.Series) does.
pipe_out = pd.DataFrame(df[TEXT_COL].apply(_run_pipeline).tolist(), index=df.index)
df = pd.concat([df[[NAME_COL, TEXT_COL]], pipe_out], axis=1)

# Column order of the exported table. Passing it to the DataFrame constructor
# keeps the CSV header even when no event was extracted at all.
OUTPUT_COLUMNS = [
    "label_name", "source", "event_type", "trigger", "new_name", "raw_time",
    "start_year", "start_month", "end_year", "end_month",
    "granularity", "confidence", "sentence", "profile",
]


def _event_record(label, profile_text, source, ev, normalized):
    """Build one output row from an event dict.

    ``normalized`` is True for timeline events, which carry their own
    ``raw_time`` / year / month / granularity / confidence fields. Raw
    predicted events only provide ``time``; the normalized fields stay empty.
    """
    return {
        "label_name":  label,
        "source":      source,
        "event_type":  ev.get("event_type", ""),
        "trigger":     ev.get("trigger", ""),
        "new_name":    ev.get("new_name", ""),
        "raw_time":    ev.get("raw_time", "") if normalized else ev.get("time", ""),
        "start_year":  ev.get("start_year", None) if normalized else None,
        "start_month": ev.get("start_month", None) if normalized else None,
        "end_year":    ev.get("end_year", None) if normalized else None,
        "end_month":   ev.get("end_month", None) if normalized else None,
        "granularity": ev.get("granularity", "") if normalized else "",
        "confidence":  ev.get("confidence", None) if normalized else None,
        "sentence":    ev.get("sentence", ""),
        "profile":     profile_text,
    }


# Expand to long table
rows = []
for _, row in df.iterrows():
    label = row[NAME_COL]
    profile_text = row[TEXT_COL]

    timeline_list = row.get("timeline", [])
    if isinstance(timeline_list, str):
        timeline_list = _safe_json_loads(timeline_list)

    if timeline_list and isinstance(timeline_list, list):
        # Preferred: normalized timeline with confidence
        source, events, normalized = "timeline", timeline_list, True
    else:
        # Fallback: raw predicted_events (no normalization), so you still get something
        pred_list = row.get("predicted_events", [])
        if isinstance(pred_list, str):
            pred_list = _safe_json_loads(pred_list)
        if pred_list and isinstance(pred_list, list):
            source, events, normalized = "predicted_events", pred_list, False
        else:
            source, events, normalized = "", [], False

    # Entries that are not dicts are skipped rather than crashing on .get(),
    # matching the isinstance(e, dict) filters used by the evaluation helpers.
    rows.extend(
        _event_record(label, profile_text, source, ev, normalized)
        for ev in events
        if isinstance(ev, dict)
    )

# Save ONE file
out_df = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
out_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")

print(f"Saved: {OUTPUT_CSV}")
print(f"Rows: {len(out_df)} | Columns: {list(out_df.columns)}")


✅ Saved: /workspace/timeline_events.csv
Rows: 210950 | Columns: ['label_name', 'source', 'event_type', 'trigger', 'new_name', 'raw_time', 'start_year', 'start_month', 'end_year', 'end_month', 'granularity', 'confidence', 'sentence', 'profile']
