In [None]:

import pandas as pd
import re
from rdflib import Graph, Namespace, Literal
from rdflib.namespace import RDF, RDFS, XSD
from pathlib import Path
from rapidfuzz import process, fuzz
from tqdm.auto import tqdm
import sys

# Keyword arguments shared by every tqdm progress bar in this script:
# finished bars are cleared and output goes to stdout.
pbar_opts = {'leave': False, 'file': sys.stdout}

# Input and output files, all resolved against the current directory.
data_dir = Path('.')
PHYS_CSV = data_dir / 'resource-neo4j_import-v2.csv'
ADI_CSV = data_dir / 'zipcode-adi.csv'
MED_CSV = data_dir / 'medical_school_dbpedia_properties.csv'
FAC_CSV = data_dir / 'augmented_facilities_dbpedia.csv'  # facility augmentation file
OUTPUT = data_dir / 'healtheqkg_full.ttl'

# Only physicians holding one of these credentials are kept (lighter graph).
ALLOWED_CREDENTIALS = {'MD', 'DO', 'DPM', 'DDS'}

print('Loading CSV files…')
# Every column is read as text so ZIP codes and identifiers keep leading zeros.
phys, adi, med, fac = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (PHYS_CSV, ADI_CSV, MED_CSV, FAC_CSV)
)

# Drop physicians whose credential is not in the allow-list.
phys = phys.loc[phys['credential'].isin(ALLOWED_CREDENTIALS)].copy()

# Five-character ZIP keys: the physician ZIP is cut to its first five
# characters, and both sources are left-padded with zeros.
phys['zip5'] = phys['zip_code'].str.slice(0, 5).str.zfill(5)
adi['zip5'] = adi['5_digit_zip_code'].str.zfill(5)

# -----------------------------------------------------------------------------
# 2 ─ Expand ADI 2015-2020
# -----------------------------------------------------------------------------
print('Expanding ADI values 2015-2020…')


def _adi_score(raw):
    """Return ``raw`` as a float, or None when it is missing or not numeric.

    The ADI columns are read as text, so a cell may hold something ``float()``
    rejects; such cells are treated as missing instead of aborting the build.
    """
    if pd.isna(raw):
        return None
    try:
        return float(raw)
    except (TypeError, ValueError):
        return None


# One row per (ZIP, year): the 2015 value is repeated for 2015-2019 and the
# 2020 value is used for 2020. Only the first CSV row of each ZIP is read.
adi_rows = []
for z, grp in adi.groupby('zip5'):
    first = grp.iloc[0]
    v15 = _adi_score(first['adi_nat_2015'])
    v20 = _adi_score(first['adi_nat_2020'])
    adi_rows.extend({'zip5': z, 'year': yr, 'nat': v15} for yr in range(2015, 2020))
    adi_rows.append({'zip5': z, 'year': 2020, 'nat': v20})

# Explicit columns keep the frame well-formed even when there are no rows.
adi_long = pd.DataFrame(adi_rows, columns=['zip5', 'year', 'nat'])

# -----------------------------------------------------------------------------
# 3a ─ Fuzzy med-school lookup
# -----------------------------------------------------------------------------
# Normalised school name: lower-case, every character outside [a-z0-9 ]
# replaced by a space, runs of whitespace collapsed to a single space.
med['norm'] = (med['original_name']
    .str.lower()
    .str.replace(r'[^a-z0-9 ]', ' ', regex=True)
    .str.split()
    .str.join(' '))

# DataFrame.to_dict('index') raises ValueError on a non-unique index, and a
# row without a name would contribute a NaN key. Build the lookup from the
# first row of each usable normalised name; `med` itself keeps every row.
med_lookup_exact = (med
    .dropna(subset=['norm'])
    .drop_duplicates(subset='norm', keep='first')
    .set_index('norm')
    .to_dict('index'))
all_norm_names = list(med_lookup_exact)

def match_med_school(raw: str, cutoff: int = 90):
    """Return the augmentation record for a medical-school name, or None.

    The name is normalised exactly like the keys of ``med_lookup_exact``
    (lower-case, characters outside ``[a-z0-9 ]`` turned into spaces,
    whitespace collapsed) so that punctuation such as ``.`` or ``'`` cannot
    prevent an exact hit. When there is no exact hit, the best
    ``fuzz.token_sort_ratio`` candidate is accepted if it scores at least
    ``cutoff``.

    Args:
        raw: School name as it appears in the physician data. Missing values
            (None / NaN) yield None.
        cutoff: Minimum fuzzy score for a non-exact match to be accepted.

    Returns:
        The matching row of the lookup table as a dict, or None.
    """
    if pd.isna(raw):
        return None
    key = ' '.join(re.sub(r'[^a-z0-9 ]', ' ', str(raw).lower()).split())
    if key in med_lookup_exact:
        return med_lookup_exact[key]
    res = process.extractOne(key, all_norm_names, scorer=fuzz.token_sort_ratio)
    # extractOne returns None when there are no candidates to compare against.
    if res is None or res[1] < cutoff:
        return None
    return med_lookup_exact[res[0]]

# -----------------------------------------------------------------------------
# 3b ─ Fuzzy facility lookup
# -----------------------------------------------------------------------------
# Normalised facility name: lower-case, every character outside [a-z0-9 ]
# replaced by a space, runs of whitespace collapsed to a single space.
fac['norm'] = (fac['original_facility']
    .str.lower()
    .str.replace(r'[^a-z0-9 ]', ' ', regex=True)
    .str.split()
    .str.join(' '))

# DataFrame.to_dict('index') raises ValueError on a non-unique index, and a
# row without a name would contribute a NaN key. Build the lookup from the
# first row of each usable normalised name; `fac` itself keeps every row.
fac_lookup_exact = (fac
    .dropna(subset=['norm'])
    .drop_duplicates(subset='norm', keep='first')
    .set_index('norm')
    .to_dict('index'))
all_fac_names = list(fac_lookup_exact)

def match_facility(raw: str, cutoff: int = 80):  # Lower cutoff for facilities
    """Return the augmentation record for a facility name, or None.

    The name is normalised exactly like the keys of ``fac_lookup_exact``
    (lower-case, characters outside ``[a-z0-9 ]`` turned into spaces,
    whitespace collapsed) so that punctuation such as ``.``, ``'`` or ``&``
    cannot prevent an exact hit. When there is no exact hit, the best
    ``fuzz.token_sort_ratio`` candidate is accepted if it scores at least
    ``cutoff``.

    Args:
        raw: Facility name as it appears in the physician data. Missing
            values (None / NaN) yield None.
        cutoff: Minimum fuzzy score for a non-exact match to be accepted.

    Returns:
        The matching row of the lookup table as a dict, or None.
    """
    if pd.isna(raw):
        return None
    key = ' '.join(re.sub(r'[^a-z0-9 ]', ' ', str(raw).lower()).split())
    if key in fac_lookup_exact:
        return fac_lookup_exact[key]
    res = process.extractOne(key, all_fac_names, scorer=fuzz.token_sort_ratio)
    # extractOne returns None when there are no candidates to compare against.
    if res is None or res[1] < cutoff:
        return None
    return fac_lookup_exact[res[0]]

# -----------------------------------------------------------------------------
# 4 ─ Namespaces
# -----------------------------------------------------------------------------
# BASE prefixes every resource created by this script; HE holds the project's
# own vocabulary terms; SCHEMA and DBO are the external vocabularies used.
BASE = Namespace('http://healtheqkg.example.org/resource/')
HE = Namespace('http://healtheqkg.example.org/ontology#')
SCHEMA = Namespace('http://schema.org/')
DBO = Namespace('http://dbpedia.org/ontology/')  # DBpedia ontology terms live under /ontology/

# The graph that collects every triple; prefixes are registered so the
# serialised output uses short names for the three vocabularies.
g = Graph()
g.bind('he', HE)
g.bind('schema', SCHEMA)
g.bind('dbo', DBO)

# -----------------------------------------------------------------------------
# 5 ─ ZIP & PostalAddress + ADI nodes
# -----------------------------------------------------------------------------
print('Creating ZIP, Address, and ADI nodes…')
# Each distinct physician ZIP yields two resources: a ZipCodeArea and a
# schema:PostalAddress, both carrying the ZIP value as a literal.
for zip5 in tqdm(phys['zip5'].unique(), **pbar_opts):
    zip_uri = BASE[f'zip/{zip5}']
    addr_uri = BASE[f'postalAddress/{zip5}']
    for triple in (
        (zip_uri, RDF.type, HE.ZipCodeArea),
        (zip_uri, HE.zipCodeValue, Literal(zip5)),
        (addr_uri, RDF.type, SCHEMA.PostalAddress),
        (addr_uri, SCHEMA.postalCode, Literal(zip5)),
    ):
        g.add(triple)

print('Adding ADI yearly scores…')
# One ADIScore resource per (ZIP, year) record, linked from the ZIP resource.
# The national score is written only when a value is present, rounded to the
# nearest integer.
for rec in tqdm(adi_long.itertuples(index=False), total=len(adi_long), **pbar_opts):
    zip_uri = BASE[f'zip/{rec.zip5}']
    adi_uri = BASE[f'adi/{rec.zip5}/{rec.year}']
    g.add((adi_uri, RDF.type, HE.ADIScore))
    g.add((adi_uri, HE.referenceYear, Literal(str(rec.year), datatype=XSD.gYear)))
    if not pd.isna(rec.nat):
        national = int(round(rec.nat))
        g.add((adi_uri, HE.nationalScore, Literal(national, datatype=XSD.integer)))
    g.add((zip_uri, HE.hasADISummary, adi_uri))

# -----------------------------------------------------------------------------
# 6 ─ Create unique facilities first
# -----------------------------------------------------------------------------
print('Creating facility nodes with augmented data…')
# safe facility id -> facility URI; also records which facilities are done
facility_uris = {}


def _parse_count(value):
    """Return ``value`` as an int, or None when it is missing or not numeric."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: NaN or non-numeric text;
        # OverflowError: infinity.
        return None


for _, row in tqdm(phys.iterrows(), total=len(phys), desc="Processing facilities", **pbar_opts):
    if pd.isna(row.facility):
        continue

    # URL-safe id: lower-case, anything outside [a-z0-9_] becomes '_'.
    safe_fac = re.sub(r"[^a-z0-9_]", "_", str(row.facility).lower())
    if safe_fac in facility_uris:
        # Already described from an earlier row; the first row wins.
        continue
    fac_uri = BASE[f'facility/{safe_fac}']
    facility_uris[safe_fac] = fac_uri

    # Basic facility info
    g.add((fac_uri, RDF.type, HE.Facility))
    g.add((fac_uri, RDFS.label, Literal(str(row.facility))))
    g.add((fac_uri, SCHEMA.name, Literal(str(row.facility))))

    # Location info, taken from the first physician row naming this facility
    g.add((fac_uri, SCHEMA.address, BASE[f'postalAddress/{row.zip5}']))
    if pd.notna(row.city):
        g.add((fac_uri, SCHEMA.addressLocality, Literal(row.city)))
    if pd.notna(row.state):
        g.add((fac_uri, SCHEMA.addressRegion, Literal(row.state)))

    # Augmented facility data (exact or fuzzy name match)
    fac_match = match_facility(row.facility)
    if not fac_match:
        continue

    # NOTE(review): the DBpedia URI and the website are stored as plain string
    # literals, not as IRI nodes — confirm that is what consumers expect.
    if pd.notna(fac_match.get('dbpedia_uri')):
        g.add((fac_uri, SCHEMA.sameAs, Literal(fac_match['dbpedia_uri'])))

    if pd.notna(fac_match.get('abstract')):
        g.add((fac_uri, DBO.abstract, Literal(fac_match['abstract'])))
        g.add((fac_uri, SCHEMA.description, Literal(fac_match['abstract'])))

    # Counts are written only when they parse as numbers; unparseable cells
    # are skipped rather than aborting the build.
    beds = _parse_count(fac_match.get('numberOfBeds'))
    if beds is not None:
        g.add((fac_uri, DBO.numberOfBeds, Literal(beds, datatype=XSD.integer)))
        g.add((fac_uri, HE.bedCount, Literal(beds, datatype=XSD.integer)))

    employees = _parse_count(fac_match.get('numberOfEmployees'))
    if employees is not None:
        g.add((fac_uri, DBO.numberOfEmployees, Literal(employees, datatype=XSD.integer)))
        g.add((fac_uri, SCHEMA.numberOfEmployees, Literal(employees, datatype=XSD.integer)))

    if pd.notna(fac_match.get('facility_type')):
        g.add((fac_uri, SCHEMA.additionalType, Literal(fac_match['facility_type'])))

    if pd.notna(fac_match.get('website')):
        g.add((fac_uri, SCHEMA.url, Literal(fac_match['website'])))

# -----------------------------------------------------------------------------
# 7 ─ Physicians & JobPlacements
# -----------------------------------------------------------------------------
print('Building physicians and placements…')


def _as_int_or_none(value):
    """Return ``value`` as an int, or None when it is missing or not numeric."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: NaN or non-numeric text;
        # OverflowError: infinity.
        return None


# Raw med-school names already described in the graph. The school's own
# triples (and the fuzzy lookup behind them) are identical for every physician
# naming the same school, so they are produced once per distinct name.
described_schools = set()

for _, row in tqdm(phys.iterrows(), total=len(phys), **pbar_opts):
    # Year as a plain integer string (e.g. '2017.0' -> '2017'), or None.
    year_num = _as_int_or_none(row.year)
    year_str = str(year_num) if year_num is not None else None

    phys_uri = BASE[f'physician/{row.npi}']
    g.add((phys_uri, RDF.type, SCHEMA.Physician))
    g.add((phys_uri, SCHEMA.usNPI, Literal(row.npi)))

    if pd.notna(row.first_name):
        g.add((phys_uri, SCHEMA.givenName, Literal(str(row.first_name).title())))
    if pd.notna(row.last_name):
        g.add((phys_uri, SCHEMA.familyName, Literal(str(row.last_name).title())))
    if pd.notna(row.gender):
        g.add((phys_uri, SCHEMA.gender, Literal(row.gender)))
    g.add((phys_uri, HE.credential, Literal(row.credential)))
    # A missing specialty would otherwise be written as a `nan` literal.
    if pd.notna(row.specialty):
        g.add((phys_uri, SCHEMA.medicalSpecialty, Literal(row.specialty)))
    if year_str:
        g.add((phys_uri, HE.graduationYear, Literal(year_str, datatype=XSD.gYear)))

    # Resolve the facility this physician is placed at.
    if pd.notna(row.facility):
        safe_fac = re.sub(r"[^a-z0-9_]", "_", str(row.facility).lower())
        fac_uri = facility_uris.get(safe_fac)  # None if the facility was never created
        placement_key = safe_fac
    else:
        # Fallback: a ZIP-based placeholder facility when no name is given.
        fac_uri = BASE[f'facility/zip_{row.zip5}']
        g.add((fac_uri, RDF.type, HE.Facility))
        g.add((fac_uri, SCHEMA.address, BASE[f'postalAddress/{row.zip5}']))
        g.add((fac_uri, RDFS.label, Literal(f"Facility at ZIP {row.zip5}")))
        placement_key = f'zip{row.zip5}'

    # Job placement linking physician, facility and (when known) year.
    if fac_uri:
        jp_suffix = year_str if year_str else ''
        jp_uri = BASE[f'placement/{row.npi}_{placement_key}_{jp_suffix}']
        g.add((jp_uri, RDF.type, HE.JobPlacement))
        g.add((jp_uri, HE.hasParticipant, phys_uri))
        g.add((jp_uri, HE.worksAt, fac_uri))
        if year_str:
            g.add((jp_uri, HE.hasTimeReference, BASE[f'year/{year_str}']))

    # Medical school link and enrichment
    if pd.notna(row.med_school):
        safe = re.sub(r"[^a-z0-9_]", "_", str(row.med_school).lower())
        ms_uri = BASE[f'medSchool/{safe}']
        g.add((phys_uri, HE.attendedMedSchool, ms_uri))

        if row.med_school not in described_schools:
            described_schools.add(row.med_school)
            g.add((ms_uri, RDF.type, HE.MedicalSchool))
            g.add((ms_uri, RDFS.label, Literal(str(row.med_school).title())))

            m = match_med_school(row.med_school)
            if m:
                if pd.notna(m.get('abstract')):
                    g.add((ms_uri, DBO.abstract, Literal(m['abstract'])))
                    g.add((ms_uri, SCHEMA.description, Literal(m['abstract'])))
                # Counts are written only when they parse as numbers.
                faculty = _as_int_or_none(m.get('facultySize'))
                if faculty is not None:
                    g.add((ms_uri, DBO.facultySize, Literal(faculty, datatype=XSD.integer)))
                students = _as_int_or_none(m.get('numberOfStudents'))
                if students is not None:
                    g.add((ms_uri, DBO.numberOfStudents, Literal(students, datatype=XSD.integer)))

# -----------------------------------------------------------------------------
# 8 ─ Year nodes
# -----------------------------------------------------------------------------
# Year resources for 2015-2020, plus every year a JobPlacement already points
# at via he:hasTimeReference, so that no placement references an untyped,
# unlabeled year resource.
year_labels = {str(yr) for yr in range(2015, 2021)}
year_prefix = str(BASE['year/'])
for ref in g.objects(None, HE.hasTimeReference):
    ref_str = str(ref)
    if ref_str.startswith(year_prefix):
        year_labels.add(ref_str[len(year_prefix):])

for yr in sorted(year_labels):
    year_uri = BASE[f'year/{yr}']
    g.add((year_uri, RDF.type, HE.Year))
    g.add((year_uri, RDFS.label, Literal(yr, datatype=XSD.gYear)))

# -----------------------------------------------------------------------------
# 9 ─ Print summary stats
# -----------------------------------------------------------------------------
def _rows_with_value(column):
    """Count rows of ``fac`` holding a non-missing value in ``column``.

    A column absent from the frame counts as zero rows.
    """
    if column not in fac.columns:
        return 0
    return int(fac[column].notna().sum())


# Graph size, followed by coverage counts taken over the rows of the facility
# augmentation table.
print("\nGraph Statistics:")
print(f"  Total triples: {len(g):,}")
print(f"  Facilities with augmented data: {_rows_with_value('dbpedia_uri')}")
print(f"  Facilities with bed count: {_rows_with_value('numberOfBeds')}")
print(f"  Facilities with employee count: {_rows_with_value('numberOfEmployees')}")

# -----------------------------------------------------------------------------
# 10 ─ Serialize
# -----------------------------------------------------------------------------
# Report the final size, then write the whole graph to OUTPUT as Turtle.
triple_total = len(g)
print(f"\nRDF build complete — {triple_total:,} triples")
g.serialize(destination=OUTPUT, format='turtle')
print('Saved', OUTPUT.resolve())

In [None]:
!pip install rapidfuzz

In [None]:
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, OWL, XSD, DCTERMS

# Input instance graph and output ontology file.
TTL_FILE = Path("healtheqkg_full.ttl")
OWL_OUT = Path("healtheqkg_ontology.owl")

HE = Namespace("http://healtheqkg.example.org/ontology#")
SCHEMA = Namespace("http://schema.org/")
# Must be the exact prefix the instance graph uses for its dbo: terms
# (http://dbpedia.org/ontology/). With any other prefix the ontology would
# describe properties that never occur in the data.
DBO = Namespace("http://dbpedia.org/ontology/")
ONT_IRI = URIRef("http://healtheqkg.example.org/ontology")
VER_IRI = URIRef("http://healtheqkg.example.org/ontology/2025-05-15")

print(" Loading TTL graph …")
# The instance graph is parsed into its own Graph object.
g_full = Graph()
g_full.parse(TTL_FILE, format="turtle")

print(" Building OWL ontology …")
# A fresh graph for the ontology, with short prefixes for every vocabulary it
# mentions.
g = Graph()
g.bind("he", HE)
g.bind("schema", SCHEMA)
g.bind("dbo", DBO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("dcterms", DCTERMS)

# Ontology header: type, version IRI, licence, label and description.
ontology_comment = Literal(
    "Core ontology describing physicians, facilities, medical schools, postal addresses, "
    "zip code areas, ADI scores, and job placements.", lang="en")
for header_triple in (
    (ONT_IRI, RDF.type, OWL.Ontology),
    (ONT_IRI, OWL.versionIRI, VER_IRI),
    (ONT_IRI, DCTERMS.license, URIRef("https://creativecommons.org/licenses/by/4.0/")),
    (ONT_IRI, RDFS.label, Literal("HealthEQKG Ontology", lang="en")),
    (ONT_IRI, RDFS.comment, ontology_comment),
):
    g.add(header_triple)

# Classes of the ontology as (IRI, English label, English comment).
classes = [
    (HE.Physician,     "Physician",        "A healthcare professional licensed to practice medicine."),
    (HE.Facility,      "Healthcare Facility","An organization or location providing healthcare services."),
    (HE.MedicalSchool, "Medical School",    "An institution granting medical degrees."),
    (SCHEMA.PostalAddress, "Postal Address","A mailing address for a location."),
    (HE.ZipCodeArea,   "Zip Code Area",     "A geographic area defined by a 5-digit postal code."),
    (HE.ADIScore,      "ADI Score",         "Area Deprivation Index score for a geographic area in a given year."),
    (HE.Year,          "Year",              "A calendar year represented as a node."),
    (HE.JobPlacement,  "Job Placement",     "An event linking a physician to a facility in a particular year.")
]

# Each class is declared as an owl:Class under owl:Thing and annotated with
# its label and comment.
for class_iri, label_text, comment_text in classes:
    class_axioms = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
        (RDFS.label, Literal(label_text, lang="en")),
        (RDFS.comment, Literal(comment_text, lang="en")),
    )
    for predicate, value in class_axioms:
        g.add((class_iri, predicate, value))

# Properties as (IRI, English label, English comment, domain, range).
# A domain of None means "no rdfs:domain is asserted".
properties = [
    (HE.hasParticipant,    "has participant",      "Links a JobPlacement to a Physician.", HE.JobPlacement,   HE.Physician),
    (HE.worksAt,           "works at",             "Links a JobPlacement to a Facility.", HE.JobPlacement, HE.Facility),
    (HE.hasTimeReference,  "has time reference",   "Associates a JobPlacement with the Year of occurrence.", HE.JobPlacement, HE.Year),
    (SCHEMA.address,       "address",              "Postal address of a Facility.", HE.Facility, SCHEMA.PostalAddress),
    (HE.zipCodeValue,      "zip code value",       "The 5-digit ZIP code as a string.", HE.ZipCodeArea, XSD.string),
    (HE.hasADISummary,     "has ADI summary",      "Links a Zip Code Area to an ADI Score.", HE.ZipCodeArea, HE.ADIScore),
    (HE.nationalScore,     "national score",       "National ADI percentile rank.", HE.ADIScore, XSD.integer),
    (HE.referenceYear,     "reference year",       "Year of the ADI score.", HE.ADIScore, XSD.gYear),
    (HE.credential,        "credential",           "Medical credential (e.g., MD, DO).", HE.Physician, XSD.string),
    (SCHEMA.medicalSpecialty, "medical specialty",  "Primary medical specialty of a Physician.", HE.Physician, XSD.string),
    (HE.graduationYear,    "graduation year",      "Year a Physician graduated medical school.", HE.Physician, XSD.gYear),
    (HE.attendedMedSchool, "attended medical school","Links Physician to their Medical School.", HE.Physician, HE.MedicalSchool),
    # DBpedia augmentations.
    # dbo:abstract is attached to facilities as well as medical schools in the
    # instance graph. A MedicalSchool domain would classify those facilities
    # as medical schools, contradicting the Facility/MedicalSchool
    # disjointness axiom, so no domain is asserted for it.
    (DBO.abstract,         "DBpedia abstract",     "Abstract from DBpedia.", None, RDFS.Literal),
    (DBO.facultySize,      "faculty size",         "Number of faculty at the MedicalSchool.", HE.MedicalSchool, XSD.integer),
    (DBO.numberOfStudents, "number of students",   "Number of students at the MedicalSchool.", HE.MedicalSchool, XSD.integer)
]

# Ranges that denote literal values; every other range is a class of resources.
datatype_ranges = {XSD.string, XSD.integer, XSD.gYear, RDFS.Literal}

for prop, lbl, cmt, dom, rng in properties:
    g.add((prop, RDF.type, RDF.Property))
    # OWL tools need to know whether a property links resources or carries
    # literal values, so the OWL-specific type is stated alongside
    # rdf:Property.
    owl_kind = OWL.DatatypeProperty if rng in datatype_ranges else OWL.ObjectProperty
    g.add((prop, RDF.type, owl_kind))
    if dom is not None:
        g.add((prop, RDFS.domain, dom))
    g.add((prop, RDFS.range, rng))
    g.add((prop, RDFS.label, Literal(lbl, lang="en")))
    g.add((prop, RDFS.comment, Literal(cmt, lang="en")))

# Physician, Facility and MedicalSchool are pairwise disjoint: one
# owl:disjointWith statement is added for each unordered pair, from the
# earlier class in the list to the later one.
core_classes = [HE.Physician, HE.Facility, HE.MedicalSchool]
for position, left in enumerate(core_classes):
    for right in core_classes[position + 1:]:
        g.add((left, OWL.disjointWith, right))

# Report the ontology size, then write it to OWL_OUT as RDF/XML.
ontology_size = len(g)
print(f" Ontology triples: {ontology_size:,}")
g.serialize(destination=OWL_OUT, format="xml")
print(" Saved", OWL_OUT.resolve())
