# Experimental notebook for document structured knowledge extraction

This is the fourth iteration of the pipeline and the first one to be pushed to the repo, which makes it the first official version. From now on, version history is handled by git.


In [1]:
# Standard library
import os
import json
import time
import re
import hashlib
from pathlib import Path
import webbrowser
from typing import List
import csv

# Environment/config
from dotenv import load_dotenv

# Validation / data models
from pydantic import BaseModel

# LLM client
from google import genai

# RDF / graph libraries
import rdflib
from rdflib import Graph, Namespace, RDF, RDFS, Literal
from rdflib.compare import to_isomorphic
from rdflib.plugins.sparql import prepareQuery
from rdflib.plugins.parsers.notation3 import BadSyntax
from rdflib.namespace import SH

# Visualization and SHACL
from pyvis.network import Network
from pyshacl import validate

# Parsing helpers
import pyparsing
from pyexpat.errors import messages

import datetime


### Initialization


In [2]:
# Load .env before anything touches the API key: the client is built from the
# key, so the environment has to be populated (and checked) first.
load_dotenv()
GEMINI_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_KEY:
    raise RuntimeError("GEMINI_API_KEY not found. Add it to a .env file in the notebook root.")

# Initialize client with the key that was just validated
client = genai.Client(api_key=GEMINI_KEY)
gemini_model = "gemini-2.5-pro"

# Which document we'll be testing
document_name = "student_housing"

Auxiliary functions.

In [3]:
# Text generation in general
def call_gemini(content):
    """Send ``content`` to the configured Gemini model and return the reply text.

    Relies on the module-level ``client`` and ``gemini_model``.
    """
    reply = client.models.generate_content(model=gemini_model, contents=content)
    return reply.text

# Document understanding 
def call_gemini_pdf(content, file_name):
    """Upload the file at ``file_name`` and query Gemini with it plus ``content``.

    The file is uploaded through the client's File API; the uploaded handle is
    placed before ``content`` in the request. Returns the reply text.
    Relies on the module-level ``client`` and ``gemini_model``.
    """
    uploaded = client.files.upload(file=Path(file_name))
    reply = client.models.generate_content(
        model=gemini_model,
        contents=[uploaded, content],
    )
    return reply.text

# JSON structured output
def call_gemini_json(content, schema):
    """Query Gemini and request a JSON reply shaped by ``schema``.

    The request sets the response MIME type to ``application/json`` and passes
    ``schema`` as the response schema. Returns the reply text (a JSON string).
    Relies on the module-level ``client`` and ``gemini_model``.
    """
    generation_config = {
        "response_mime_type": "application/json",
        "response_schema": schema,
    }
    reply = client.models.generate_content(
        model=gemini_model,
        contents=content,
        config=generation_config,
    )
    return reply.text

# Read txt file into a string (used for prompts)
def read_txt(path):
    """Return the whole content of the UTF-8 text file at ``path`` as a string."""
    return Path(path).read_text(encoding="utf-8")
    
# Read a JSON file and return it as string or raw JSON
def read_json(path, raw=False):
    """Load the JSON file at ``path``.

    The file is decoded as UTF-8, matching ``read_txt``, so non-ASCII content
    does not depend on the platform's default encoding.

    Args:
        path: Location of the JSON file.
        raw: When true, return the parsed Python object; otherwise return it
            re-serialized as a string indented by two spaces.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if raw:
        return data
    # Else, convert JSON to string
    return json.dumps(data, indent=2)

# Retry wrapper to combat model overload errors
def with_retries(func, *args, base_delay=4.0, max_retries=None):
    """Call ``func(*args)``, retrying on Gemini overload / quota errors.

    An exception whose message contains "overloaded" triggers an exponential
    back-off (``base_delay * 2 ** n`` seconds for the n-th consecutive
    overload). A message containing "exhausted" triggers a fixed one-minute
    wait and resets the overload counter. Any other exception is re-raised
    immediately.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        base_delay: Base of the exponential back-off, in seconds.
        max_retries: Optional cap on the total number of retries. ``None``
            (the default) keeps retrying indefinitely, as before; once the cap
            is reached the last exception is re-raised.

    Returns:
        Whatever ``func`` returns on its first successful call.
    """
    overloads = 0
    exhaustions = 0
    retries = 0
    while True:
        try:
            return func(*args)
        except Exception as e:
            msg = str(e).lower()
            overloaded = "overloaded" in msg
            exhausted = "exhausted" in msg
            # Anything else is not a transient capacity problem
            if not (overloaded or exhausted):
                raise
            # Give up instead of looping forever when a cap was requested
            if max_retries is not None and retries >= max_retries:
                raise
            retries += 1
            if overloaded:
                overloads += 1
                wait = base_delay * (2 ** overloads)
                print(f"Gemini overloaded {overloads} times, retrying in {wait:.1f}s...")
                time.sleep(wait)
            else:
                exhaustions += 1
                print(f"Gemini exhausted {exhaustions} times, waiting 1 minute and rerunning the code...")
                time.sleep(60)
                overloads = 0

# Pyvis graph visualization
def visualize_graph(ttl_file):
    """Render a Turtle file as an interactive pyvis network and open it.

    The HTML is written next to ``ttl_file`` (same name, ``.html`` suffix) and
    opened with ``webbrowser``. ``rdf:type`` triples are not drawn; they are
    only used to pick node colours.

    Args:
        ttl_file: Path of the Turtle file to visualize.
    """
    # Load TTL file
    g = Graph()
    g.parse(ttl_file, format="turtle")

    # Namespaces used for colouring and labelling
    CCCEV = Namespace("http://data.europa.eu/m8g/")
    CPSV = Namespace("http://purl.org/vocab/cpsv#")
    SC = Namespace("http://example.org/schema#")

    net = Network(height="1440px", width="100%", notebook=True, directed=True, cdn_resources='remote')
    net.force_atlas_2based()

    # Just visual effects: colour a node by its rdf:type
    def node_color(uri):
        if (uri, RDF.type, CPSV.PublicService) in g:
            return "gold"
        if (uri, RDF.type, CCCEV.Constraint) in g:
            return "maroon"
        if (uri, RDF.type, CCCEV.InformationConcept) in g:
            return "darkturquoise"
        if (uri, RDF.type, SC.Applicant) in g:
            return "yellowgreen"
        return "lightgrey"

    # Just a way to make the graph more readable: prefer rdfs:label, then
    # cccev:name, then the last segment of the URI
    def node_label(uri):
        if isinstance(uri, Literal):
            return str(uri)

        for lbl in g.objects(uri, RDFS.label):
            return str(lbl)
        for lbl in g.objects(uri, CCCEV.name):
            return str(lbl)

        # Drop a trailing slash so the split below never yields an empty label
        uri_str = str(uri).rstrip('/')
        if "#" in uri_str:
            return uri_str.split("#")[-1]
        return uri_str.split("/")[-1]

    # Add nodes and edges, skipping rdf:type for extra readability
    for s, p, o in g:
        if p == RDF.type:
            continue

        # Subject
        net.add_node(str(s), label=node_label(s), color=node_color(s))

        # Object
        if isinstance(o, Literal):
            # if it's a literal put it in a text box instead of a circular node
            net.add_node(str(o), label=node_label(o), color="beige", shape="box")
        else:
            net.add_node(str(o), label=node_label(o), color=node_color(o))

        # Edge
        net.add_edge(str(s), str(o), label=node_label(p), arrows="to")

    # Swap only the file suffix; a plain str.replace("ttl", "html") would also
    # rewrite any "ttl" occurring elsewhere in the path.
    html_file = str(Path(ttl_file).with_suffix(".html"))
    # Render and show
    net.save_graph(html_file)
    # as_uri() percent-encodes spaces and other special characters in the path
    webbrowser.open(Path(html_file).resolve().as_uri())

# Hashing Graphs
def get_semantic_hash(rdf_text):
    """Return an MD5 fingerprint of the logical content of a Turtle document.

    The text is parsed, blank nodes are relabelled canonically, and the sorted
    triples are hashed, so formatting, whitespace, statement order and blank
    node identifiers do not influence the result.

    Args:
        rdf_text: Turtle source to fingerprint.

    Returns:
        The hex digest, or the string "INVALID_RDF" when parsing fails.
    """
    # Local import: the notebook's import cell only brings in to_isomorphic.
    from rdflib.compare import to_canonical_graph

    g = rdflib.Graph()
    try:
        g.parse(data=rdf_text, format="turtle")
    except Exception:
        # Deliberate best effort: any parse failure is reported as a marker value
        return "INVALID_RDF"

    # to_isomorphic() alone keeps the parser's random blank node ids, which
    # made the hash differ between runs for graphs containing blank nodes.
    canonical = to_canonical_graph(g)

    # Generate a deterministic hash based on the sorted triples (the string representation)
    triples = sorted(canonical.triples((None, None, None)))
    triples_string = "".join(str(t) for t in triples)
    return hashlib.md5(triples_string.encode('utf-8')).hexdigest()

# shacl shape syntax deep check
def validate_shacl_syntax(shacl_ttl_string):
    """
    Checks syntactic validity of a SHACL file on two levels:
    1. RDF/Turtle syntax (the document must parse)
    2. Embedded SPARQL syntax (every 'sh:select' string must compile)

    Returns: (is_valid (bool), error_stage (str), error_message (str))
    where error_stage is one of "RDF_SYNTAX", "SPARQL_SYNTAX", "SPARQL_OTHER",
    "QUERY_EXTRACTION" or "VALID".
    """
    def shape_name(uri):
        # Helper to make error messages readable
        return uri.split("#")[-1] if "#" in uri else uri.split("/")[-1]

    g = Graph()

    # --- LEVEL 1: RDF Syntax ---
    try:
        g.parse(data=shacl_ttl_string, format="turtle")
    except Exception as e:
        return False, "RDF_SYNTAX", str(e).replace("\n", " ")

    # Extract namespaces to help the SPARQL parser later
    namespaces = dict(g.namespaces())

    # --- LEVEL 2: Embedded SPARQL Syntax ---
    # We query the graph to find every 'sh:select' string
    query_finder = """
        PREFIX sh: <http://www.w3.org/ns/shacl#>
        SELECT ?shape ?sparql
        WHERE {
            ?shape sh:select ?sparql .
        }
    """

    try:
        for row in g.query(query_finder):
            # The subject of sh:select is the SPARQL constraint node, which is
            # usually a blank node. Report the shape that points to it through
            # sh:sparql when there is one, so the message names a real shape.
            owner = g.value(predicate=SH.sparql, object=row.shape)
            label = shape_name(str(owner if owner is not None else row.shape))

            try:
                # Attempt to compile the SPARQL string
                prepareQuery(str(row.sparql), initNs=namespaces)
            except pyparsing.ParseException as pe:
                # This captures syntax errors like missing brackets } or bad keywords
                return False, "SPARQL_SYNTAX", f"Shape {label}: {pe}"
            except Exception as e:
                return False, "SPARQL_OTHER", f"Shape {label}: {str(e)}"

    except Exception as e:
        return False, "QUERY_EXTRACTION", str(e)

    # If we survived all checks
    return True, "VALID", "OK"




## Preparation: Get everything ready for logging


In [4]:
# This function creates a blank slate for a single run
def initialize_run_context(run_id, doc_name, model_name):
    """Build a fresh logging context (a flat dict) for one pipeline run.

    Metadata comes from the arguments and the current local time; every other
    field starts as a placeholder to be overwritten as the run progresses.
    """
    started_at = datetime.datetime.now().isoformat(sep=" ", timespec="seconds")

    metadata = {
        "Run ID": run_id,
        "Document Name": doc_name,
        "Timestamp": started_at,
        "Model Name": model_name,
    }
    # Pipeline artifacts (placeholders)
    artifacts = {
        "Service Graph Hash": "N/A",
        "SHACL Graph Hash": "N/A",
        "SHACL Valid Syntax": False,
        "SHACL Error Type": "N/A",
        "SHACL Error Message": "N/A",
    }
    # Scenario specifics (overwritten per scenario)
    scenario = {
        "Scenario ID": "N/A",
        "Scenario Description": "N/A",
        "Expected Violation Count": 0,
        "Actual Violation Count": 0,
        "Violated Shapes": [],
        "Violation Messages": [],
        "Raw Validation Report": "N/A",
    }
    # Execution stats
    stats = {
        "Execution Time": 0.0,
        "Successfully Executed": False,
    }
    return {**metadata, **artifacts, **scenario, **stats}
    
# Start the run: build its logging context, then start the wall-clock timer
current_run_id = 1  # this will be part of the framework later
ctx = initialize_run_context(
    run_id=current_run_id,
    doc_name=document_name,
    model_name=gemini_model,
)
execution_start_time = time.time()

## Phase 1: Public service modeling


### 1.1 Document → Preconditions Summary

Use LLM to summarize the document into a list of preconditions.

In [5]:
file_name = f"Precondition documents/{document_name}.pdf"

prompt = read_txt('Prompts/summarization.txt')

# Ask Gemini to summarize the PDF into a list of preconditions
preconditions_summary = with_retries(call_gemini_pdf, prompt, file_name)
print(preconditions_summary)

# Save to a file too. Written as UTF-8 because read_txt reads it back as
# UTF-8 and the summary contains non-ASCII characters such as the euro sign.
with open(f"Results/{document_name} preconditions summary.txt", "w", encoding="utf-8") as f:
    f.write(preconditions_summary)

Title: Housing allowance for students
Conditions:
- The student must be a citizen of Greece or another European Union country.
- The student must be enrolled as an undergraduate at a Higher Education Institution (AEI) in Greece, pursuing their first degree.
- The student's duration of study must not have exceeded the standard number of semesters required for their degree.
- The student must have successfully passed exams in at least half of the courses from the previous academic year.
- The annual family income must not exceed €30,000. This limit increases by €3,000 for each dependent child after the first one.
- The student must be renting accommodation in a city other than that of their main family residence, and the lease must be valid for at least six months.
- Neither the student nor their parents may have full ownership or usufruct of a residence in the city of study.
- The total area of real estate (owner-occupied or rented out) owned by the student or their parents must not exc

### 1.2. Preconditions Summary + Citizen Schema (TTL) → Information Model (JSON)



In [6]:
preconditions_summary = read_txt(f"Results/{document_name} preconditions summary.txt")
citizen_schema = read_txt(f"Citizens/{document_name} schema.ttl")

# Response schema for the structured output.
# NOTE: fields are documented with comments rather than docstrings on purpose,
# so nothing is added to the models themselves.

# One path through the citizen schema plus the datatype found at its end
class Paths(BaseModel):
    path: List[str]
    datatype: str

# A piece of information a constraint needs
class InformationConcept(BaseModel):
    name: str
    related_paths: List[Paths]  # links the concept to citizen data available

# One precondition of the public service
class Constraint(BaseModel):
    name: str
    desc: str
    constrains: List[InformationConcept]

schema = list[Constraint]

# Formulate prompt content and call Gemini
prompt = read_txt('Prompts/preconditions_to_JSON.txt')
content = [prompt, preconditions_summary, citizen_schema]

info_model = with_retries(call_gemini_json, content, schema)

# Save to a file too. JSON is written as UTF-8 explicitly so that non-ASCII
# descriptions do not depend on the platform's default encoding.
with open(f"Results/{document_name} information model.json", "w", encoding="utf-8") as f:
    f.write(info_model)

### 1.3 Information Model (JSON) → Public Service Graph (TTL)

Use deterministic code to turn the JSON into a knowledge graph using TTL syntax.

In [7]:
PREFIXES = """@prefix ex: <http://example.org/> .
@prefix cccev: <http://data.europa.eu/m8g/> .
@prefix cpsv: <http://purl.org/vocab/cpsv#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

"""


def _escape_turtle_string(text):
    """Escape ``text`` for use inside a double-quoted Turtle string literal."""
    # Backslash first, otherwise the escapes added below would be doubled
    return (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )


# Parse JSON string
info_model = read_json(f"Results/{document_name} information model.json", raw=True)

# Get service name. As per the prompt, it's in the first line of the preconditions summary file
with open(f"Results/{document_name} preconditions summary.txt", encoding="utf-8") as f:
    line = f.readline()
title_match = re.search(r'Title: (.+)', line)
if title_match is None:
    # Fail with a readable message instead of a bare IndexError
    raise ValueError(
        f"Expected the first line of the preconditions summary to look like 'Title: ...', got: {line!r}"
    )
service_name = title_match.group(1).strip().replace(" ", "_")

triples = [PREFIXES]
triples.append(f"ex:{service_name} a cpsv:PublicService .\n\n")

# Convert constraints + concepts into triples
# NOTE(review): constraint and concept names are interpolated as Turtle local
# names unchanged, so they are assumed to already be valid local names.
for constraint in info_model:
    constraint_name = constraint["name"]
    constraint_desc = _escape_turtle_string(constraint["desc"])

    # Public service -> holdsRequirement -> constraint
    triples.append(f"ex:{service_name} cpsv:holdsRequirement ex:{constraint_name} .\n")

    # Constraint node
    triples.append(f'ex:{constraint_name} a cccev:Constraint ; dct:description "{constraint_desc}" .\n')

    # InformationConcept nodes
    for concept in constraint.get("constrains", []):
        concept_name = concept["name"]

        # Link constraint to concept
        triples.append(f"ex:{constraint_name} cccev:constrains ex:{concept_name} .\n")

        # Declare information concept
        triples.append(f'ex:{concept_name} a cccev:InformationConcept .\n')

    triples.append("\n")  # spacing for readability

triples_string = "".join(triples)
# Save to a file too (Turtle documents are UTF-8)
with open(f"Results/{document_name} service graph.ttl", "w", encoding="utf-8") as f:
    f.write(triples_string)

# Log
ctx["Service Graph Hash"] = get_semantic_hash(triples_string)

### 1.4. Graph Visualization / Inspection

Visualize part of the knowledge graph to more easily inspect correct structure and logic.

In [8]:
# Render graph of the public service.
# visualize_graph saves an HTML rendering and opens it in the web browser.
visualize_graph(f"Results/{document_name} service graph.ttl")

## Phase 2: SHACL Rule Generation


### 2.1. Information Model (JSON) → SHACL-spec (JSON)

Use deterministic code on the JSON from before to make a new intermediate JSON that contains only the necessary information to construct SHACL shapes, one for each constraint.

In [9]:
# Load the information model JSON
info_model = read_json(f"Results/{document_name} information model.json", raw=True)

# One spec entry per constraint: the shape name (constraint name with
# "_condition" renamed to "_shape" for clarity downstream), its description,
# and every concept with the path AND datatype (URI vs Literal) of each
# related path.
shacl_spec_json = [
    {
        "shape_name": constraint["name"].replace("_condition", "_shape"),
        "desc": constraint["desc"],
        "concepts": [
            {
                "name": concept["name"],
                "related_paths": [
                    {"path": rp["path"], "datatype": rp["datatype"]}
                    for rp in concept.get("related_paths", [])
                ],
            }
            for concept in constraint.get("constrains", [])
        ],
    }
    for constraint in info_model
]

# Save to file
with open(f"Results/{document_name} shacl-spec.json", "w") as f:
    json.dump(shacl_spec_json, f, indent=2)

### 2.2. SHACL-spec (JSON) + Citizen Schema (TTL) → SHACL Shapes (TTL)



In [10]:
def _strip_markdown_fence(text):
    """Return the body of the first Markdown code fence in ``text``.

    Handles an optional language tag after the opening fence and a missing
    closing fence. Without any fence, the text is returned with surrounding
    backticks and whitespace removed.
    """
    fenced = re.search(r"```[\w+-]*[ \t]*\r?\n(.*?)(?:```|\Z)", text, flags=re.DOTALL)
    if fenced:
        return fenced.group(1).strip()
    return text.strip().strip("`").strip()


# Load JSON as string
shacl_spec_json = read_json(f"Results/{document_name} shacl-spec.json")
citizen_schema = read_txt(f"Citizens/{document_name} schema.ttl")

prompt = read_txt('Prompts/shacl_spec_to_shacl_ttl.txt')
content = [prompt, shacl_spec_json, citizen_schema]

shacl_shapes = with_retries(call_gemini, content)

# Cleanup gemini markdown formatting. Only the fence is removed: deleting every
# "turtle"/"ttl" substring would also corrupt names and messages that merely
# contain those letters (e.g. "settlement").
shacl_shapes = _strip_markdown_fence(shacl_shapes)

# Save it to a file too (Turtle documents are UTF-8)
with open(f"Results/{document_name} shacl shapes.ttl", "w", encoding="utf-8") as f:
    f.write(shacl_shapes)

# Log
ctx["SHACL Graph Hash"] = get_semantic_hash(shacl_shapes)

is_valid, error_stage, error_message = validate_shacl_syntax(shacl_shapes)
ctx["SHACL Valid Syntax"] = is_valid
ctx["SHACL Error Type"] = error_stage
ctx["SHACL Error Message"] = error_message


## Phase 3: Citizen - Service Modeling


### 3.1 Public Service Graph (TTL) + Citizen Graph (TTL) + Information Model (JSON) → Citizen-Service Graph (TTL) 

We expand the Public Service Graph to include Citizen data, properly connected with edges.

In [11]:
EX = Namespace("http://example.org/")
SC = Namespace("http://example.org/schema#")

# Load service and citizen ttl's and info model
public_service_ttl = f"Results/{document_name} service graph.ttl"
citizen_ttl = f"Citizens/{document_name} eligible.ttl"
info_model = read_json(f"Results/{document_name} information model.json", raw=True)

# Realize them into graphs
g = Graph()
g.parse(public_service_ttl, format="turtle")
citizen_g = Graph()
citizen_g.parse(citizen_ttl, format="turtle")

# Merge citizen triples into main graph
g += citizen_g

# Automatically determine the root citizen node: the node typed as
# sc:Applicant. Matching on rdf:type (rather than any predicate) avoids
# picking up unrelated triples that merely point at the class; sorting makes
# the choice deterministic when several applicants exist.
root_candidates = sorted(citizen_g.subjects(predicate=RDF.type, object=SC.Applicant))
if not root_candidates:
    raise ValueError(f"No node of type {SC.Applicant} found in {citizen_ttl}")
citizen_root = root_candidates[0]
# Helper: resolve node paths (return nodes, not literals) 
def resolve_node_path(citizen_g, root_uri, path_list, datatype):
    """Follow ``path_list`` from ``root_uri`` and return the set of nodes reached.

    Each path element is looked up as a predicate in the ``SC`` namespace.
    For ``datatype == "URI"`` the whole path is followed, since the value is
    itself a node; for any other datatype the last step is dropped, so the
    result is the node holding the literal. Returns an empty set as soon as a
    step reaches nothing.
    """
    wants_node = datatype == "URI"
    steps = path_list if wants_node else path_list[:-1]

    frontier = {root_uri}
    for step in steps:
        predicate = SC[step]  # Assumes our schema matches the namespace
        # Literals are skipped on URI paths: they cannot be traversed further
        frontier = {
            target
            for source in frontier
            for target in citizen_g.objects(source, predicate)
            if not (wants_node and isinstance(target, Literal))
        }
        # Dead end: nothing left to expand
        if not frontier:
            return set()

    return frontier

# Add mapsTo edges: connect every Information Concept to the citizen data
# nodes that its related paths resolve to
all_concepts = (
    concept
    for constraint in info_model
    for concept in constraint["constrains"]
)
for concept in all_concepts:
    concept_uri = EX[concept["name"]]
    for path_obj in concept["related_paths"]:
        # The datatype tells the resolver whether to stop at the node or go through it
        data_nodes = resolve_node_path(
            citizen_g, citizen_root, path_obj["path"], path_obj["datatype"]
        )
        for subj in data_nodes:
            g.add((concept_uri, EX.mapsTo, subj))

# Serialize unified graph into ttl and save to file
g.serialize(f"Results/{document_name} citizen-service graph.ttl", format="turtle")

<Graph identifier=N1cb68ebf0f6d4d28b11f2ce08cd1f491 (<class 'rdflib.graph.Graph'>)>

### 3.2 Visualize the unified graph

We reuse the same function from before.

In [12]:
# Render the unified citizen-service graph with the same helper as before
visualize_graph(f"Results/{document_name} citizen-service graph.ttl")

## Phase 4: Citizen Validation and Metrics

### SHACL Shape Validation

Use deterministic code to validate a pre-made synthetic citizen graph against the generated SHACL shapes. The scenario below uses a citizen graph built to break one condition, so one violation is expected. Any violations found are collected together with the shapes that failed and their messages.

In [13]:
def parse_validation_report(conforms, results_graph, results_text):
    """
    Parses the SHACL validation graph into a flat dictionary for CSV logging.

    Args:
        conforms: Whether validation passed.
        results_graph: Graph holding the sh:ValidationResult nodes; only read
            when ``conforms`` is false.
        results_text: Raw text report, kept as-is under "full_report" on failure.

    Returns:
        dict with "violation_count" (number of sh:ValidationResult nodes),
        "failed_shapes" ("; "-joined, sorted shape names), "messages"
        (" | "-joined result messages) and "full_report".
    """
    # If it passed, return a clean success record
    if conforms:
        return {
            "violation_count": 0,
            "failed_shapes": "None",
            "messages": "None",
            "full_report": "Conforms: True"
        }

    # If it failed, extract details from the graph
    violation_count = 0
    failed_shapes = set()
    messages = []

    # Find all nodes of type sh:ValidationResult
    for result_node in results_graph.subjects(RDF.type, SH.ValidationResult):
        # Count the result itself: a result without sh:resultMessage is still
        # a violation, so counting messages would under-report.
        violation_count += 1

        # Extract the Shape Name (Source Shape) should return a URI like http://example.org/income_shape
        source_shape = results_graph.value(result_node, SH.sourceShape)
        if source_shape is not None:
            # Split to get just "income_shape"
            failed_shapes.add(str(source_shape).split("/")[-1].split("#")[-1])

        # Extract the Message
        message = results_graph.value(result_node, SH.resultMessage)
        if message:
            messages.append(str(message))

    return {
        "violation_count": violation_count,
        "failed_shapes": "; ".join(sorted(failed_shapes)),  # Stringify for CSV
        "messages": " | ".join(messages),  # Stringify for CSV
        "full_report": results_text  # Keep raw text just in case
    }

In [14]:
# those will be drawn from the scenario framework later
ctx["Scenario ID"] = 1
ctx["Scenario Description"] = "Lease duration too short."
ctx["Expected Violation Count"] = 1

# execute mutation scenario: validate the non-eligible citizen against the generated shapes
conforms, results_graph, results_text = validate(
    data_graph=f"Citizens/{document_name} not eligible.ttl",
    shacl_graph=f"Results/{document_name} shacl shapes.ttl",
    inference='rdfs',
)

parse_result = parse_validation_report(conforms, results_graph, results_text)
ctx["Actual Violation Count"] = parse_result["violation_count"]
ctx["Violated Shapes"] = parse_result["failed_shapes"]
ctx["Violation Messages"] = parse_result["messages"]
# Fill the placeholder declared in the run context; it was left at "N/A" before
ctx["Raw Validation Report"] = parse_result["full_report"]

# Whole-run duration in seconds, measured from execution_start_time
ctx["Execution Time"] = round(time.time() - execution_start_time)
ctx["Successfully Executed"] = True

In [15]:
# Write to csv
CSV_FILE = "Master_Results.csv"

def flush_context_to_csv(context_dict, csv_file=None):
    """Append one run context as a row of the results CSV.

    The row follows the column order of the file's existing header row; keys
    missing from ``context_dict`` are written as "N/A" and keys that are not
    in the header are ignored. When the file does not exist or has no header
    yet, a header row made of the context's keys is written first.

    Args:
        context_dict: Run context to log.
        csv_file: Target file; defaults to the module-level ``CSV_FILE``.
    """
    target = CSV_FILE if csv_file is None else csv_file

    # Read headers from the existing file, if there is one
    try:
        with open(target, 'r', encoding='utf-8', newline='') as f:
            headers = next(csv.reader(f), [])  # first row, or [] for an empty file
    except FileNotFoundError:
        headers = []

    # No header yet: start the file with the context's own keys
    needs_header = not headers
    if needs_header:
        headers = list(context_dict)

    # Align data to those headers
    row_data = [context_dict.get(h, "N/A") for h in headers]

    # Write the row
    with open(target, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(headers)
        writer.writerow(row_data)

    print(f"-> Logged Run {context_dict.get('Run ID', '?')} to CSV.")


# Append this run's context as one row of the master results CSV
flush_context_to_csv(ctx)


-> Logged Run 1 to CSV.
