# Integration of Schema.org Organization Taxonomy into the NCO Ontology

This notebook reads the TTL file `nco2_2_schemaorgv2.ttl` and integrates the complete Schema.org Organization taxonomy into the NCO ontology, creating the file `nco2_2_schemaorgv2_0.ttl` with correct hierarchical relationships.

In [None]:
# Install required packages for RDF/TTL manipulation
#%pip install rdflib requests

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Import necessary libraries
import os
import requests
from rdflib import Graph, Namespace, RDF, RDFS, OWL, URIRef, Literal
import json
from pathlib import Path

In [None]:
# Configure paths and namespaces
# Paths are resolved against the kernel's current working directory, so the
# input TTL file is expected to sit in the directory the notebook runs from.
current_dir = Path.cwd()
ttl_file_path = current_dir / "nco2_2_schemaorgv2.ttl"

# Namespace definitions
# SCHEMA uses the https:// form of the schema.org vocabulary.
SCHEMA = Namespace("https://schema.org/")
NCO = Namespace("http://data.open.ac.uk/ontology/newsclassification/v2#")
# NOTE(review): NCO_SCHEMA is not referenced in the cells visible here - confirm it is still needed.
NCO_SCHEMA = Namespace("http://data.open.ac.uk/ontology/newsclassification/nco_schema.org#")

# Report the resolved input path and whether the file is present
print(f"TTL file path: {ttl_file_path}")
print(f"File exists: {ttl_file_path.exists()}")

Percorso file TTL: c:\Users\massi\Documents\Github\ClaimExtractionAgent\Code\Ontology\nco2_2_schemaorgv2.ttl
File esiste: True


In [None]:
# Load existing TTL file
g = Graph()

try:
    g.parse(ttl_file_path, format='turtle')
except Exception as e:
    # Report and stop: carrying on with an empty graph would let the later
    # cells integrate into, and save, an ontology that lacks the input content.
    print(f"Error loading TTL file: {e}")
    raise

print("TTL file loaded successfully!")
print(f"Number of triples loaded: {len(g)}")
print("\nNamespaces present:")
for prefix, namespace in g.namespaces():
    print(f"  {prefix}: {namespace}")

# Check whether schema:Organization already appears as the object of any triple
org_triples = list(g.triples((None, None, SCHEMA.Organization)))
print(f"\nTriple that involve schema:Organization: {len(org_triples)}")
for triple in org_triples[:5]:  # show only the first 5
    print(f"  {triple}")

File TTL caricato con successo!
Numero di triple caricate: 54

Namespace presenti:
  brick: https://brickschema.org/schema/Brick#
  csvw: http://www.w3.org/ns/csvw#
  dc: http://purl.org/dc/elements/1.1/
  dcat: http://www.w3.org/ns/dcat#
  dcmitype: http://purl.org/dc/dcmitype/
  dcam: http://purl.org/dc/dcam/
  doap: http://usefulinc.com/ns/doap#
  foaf: http://xmlns.com/foaf/0.1/
  geo: http://www.opengis.net/ont/geosparql#
  odrl: http://www.w3.org/ns/odrl/2/
  org: http://www.w3.org/ns/org#
  prof: http://www.w3.org/ns/dx/prof/
  prov: http://www.w3.org/ns/prov#
  qb: http://purl.org/linked-data/cube#
  schema: https://schema.org/
  sh: http://www.w3.org/ns/shacl#
  skos: http://www.w3.org/2004/02/skos/core#
  sosa: http://www.w3.org/ns/sosa/
  ssn: http://www.w3.org/ns/ssn/
  time: http://www.w3.org/2006/time#
  vann: http://purl.org/vocab/vann/
  void: http://rdfs.org/ns/void#
  wgs: https://www.w3.org/2003/01/geo/wgs84_pos#
  owl: http://www.w3.org/2002/07/owl#
  rdf: http://ww

In [None]:
# Functions to download and parse the schema.org ontology
def get_schema_org_organization_taxonomy(
    schema_rdf_url="https://schema.org/version/latest/schemaorg-current-https.rdf",
):
    """Download the schema.org RDF ontology and build the Organization class tree.

    Args:
        schema_rdf_url: Location of the schema.org vocabulary in RDF/XML. The
            default is the URL this notebook has always used.

    Returns:
        dict: Class name -> details dict as produced by
        ``extract_organization_hierarchy``. If the download or parse fails, or
        if the extraction yields no classes, the result of
        ``get_organization_fallback_taxonomy()`` is returned instead.
    """
    try:
        print("Downloading schema.org RDF ontology...")
        schema_graph = Graph()
        schema_graph.parse(schema_rdf_url, format='xml')
        print(f"Ontology loaded with {len(schema_graph)} triples")

        # Collect every rdfs:subClassOf relation for the organization classes
        organization_classes = extract_organization_hierarchy(schema_graph)
    except Exception as e:
        print(f"Error downloading/parsing RDF ontology: {e}")
        return get_organization_fallback_taxonomy()

    # extract_organization_hierarchy reports its own failures by returning an
    # empty dict; treat that like a download failure rather than handing an
    # empty taxonomy to the integration step.
    if not organization_classes:
        print("No organization classes extracted from the RDF ontology; using fallback taxonomy")
        return get_organization_fallback_taxonomy()

    print(f"Extracted {len(organization_classes)} organization classes with hierarchy")
    return organization_classes

def extract_organization_hierarchy(graph):
    """Collect schema.org Organization and its subclasses from ``graph``.

    Args:
        graph: Object exposing ``query(sparql)`` that yields rows with the
            attributes ``cls``, ``label``, ``comment`` and ``parent``.

    Returns:
        dict: Short class name -> ``{'id', 'label', 'comment', 'subClassOf',
        'children'}``. ``subClassOf`` lists the direct parent URIs in the order
        encountered; ``children`` lists the names of classes in the map that
        name this class as a parent. An ``Organization`` entry is always
        present on success. Any exception is printed and ``{}`` is returned.
    """
    # One row per (class, direct parent) pair, for every class that reaches
    # schema:Organization through zero or more rdfs:subClassOf steps.
    sparql = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX schema: <https://schema.org/>
    SELECT DISTINCT ?cls ?label ?comment ?parent
    WHERE {
        ?cls rdfs:subClassOf* schema:Organization .
        ?cls rdfs:subClassOf ?parent .
        OPTIONAL { ?cls rdfs:label ?label }
        OPTIONAL { ?cls rdfs:comment ?comment }
        FILTER(STRSTARTS(STR(?cls), "https://schema.org/"))
    }
    """
    taxonomy = {}

    try:
        for row in graph.query(sparql):
            cls_iri = str(row.cls)
            short_name = cls_iri.rsplit('/', 1)[-1]
            parent_iri = str(row.parent)

            entry = taxonomy.get(short_name)
            if entry is None:
                # First sighting of this class: label/comment come from this row,
                # with the short name / empty string standing in when absent.
                entry = {
                    'id': cls_iri,
                    'label': str(row.label) if row.label else short_name,
                    'comment': str(row.comment) if row.comment else "",
                    'subClassOf': [],
                    'children': []
                }
                taxonomy[short_name] = entry

            parents = entry['subClassOf']
            if parent_iri not in parents:
                parents.append(parent_iri)

        # Guarantee a root entry even if the query did not return one
        if 'Organization' not in taxonomy:
            taxonomy['Organization'] = {
                'id': 'https://schema.org/Organization',
                'label': 'Organization',
                'comment': 'An organization such as a school, NGO, corporation, club, etc.',
                'subClassOf': ['https://schema.org/Thing'],
                'children': []
            }

        # Invert the parent links into per-class child lists
        for child_name, info in taxonomy.items():
            for parent_iri in info['subClassOf']:
                parent_entry = taxonomy.get(parent_iri.split('/')[-1])
                if parent_entry is None:
                    continue
                if child_name not in parent_entry['children']:
                    parent_entry['children'].append(child_name)

        return taxonomy

    except Exception as e:
        print(f"Error extracting hierarchy: {e}")
        return {}

def get_organization_fallback_taxonomy():
    """Return a built-in list of the main schema.org Organization subclasses.

    Returns:
        dict: Class name -> ``{'id', 'label', 'comment', 'subClassOf',
        'children'}``. Contains ``Organization`` (child of ``Thing``) plus six
        direct subclasses, each with an empty ``children`` list. A fresh dict
        is built on every call, and a one-line notice with the class count is
        printed.
    """
    schema_base = 'https://schema.org/'
    organization_iri = schema_base + 'Organization'

    # (class name, label, comment) for every direct subclass of Organization
    direct_subclasses = (
        ('Corporation', 'Corporation', 'Organization: A business corporation.'),
        ('EducationalOrganization', 'Educational Organization', 'An educational organization.'),
        ('GovernmentOrganization', 'Government Organization', 'A governmental organization or agency.'),
        ('LocalBusiness', 'Local Business', 'A particular physical business or branch of an organization.'),
        ('NGO', 'NGO', 'Organization: Non-governmental Organization.'),
        ('NewsMediaOrganization', 'News Media Organization', 'A News/Media organization such as a newspaper or TV station.'),
    )

    # Root entry first, listing the subclasses in table order
    fallback_classes = {
        'Organization': {
            'id': organization_iri,
            'label': 'Organization',
            'comment': 'An organization such as a school, NGO, corporation, club, etc.',
            'subClassOf': [schema_base + 'Thing'],
            'children': [name for name, _label, _comment in direct_subclasses],
        }
    }

    for name, label, comment in direct_subclasses:
        fallback_classes[name] = {
            'id': schema_base + name,
            'label': label,
            'comment': comment,
            'subClassOf': [organization_iri],
            'children': [],
        }

    print(f"Utilizzando tassonomia fallback con {len(fallback_classes)} classi")
    return fallback_classes

def integrate_organization_taxonomy(graph, org_taxonomy):
    """Add the organization classes and their hierarchy to ``graph``.

    For every taxonomy entry not yet typed as ``owl:Class`` in ``graph``, the
    class is declared and its label/comment (language tag ``en``) are added.
    Then every ``subClassOf`` URI listed for each entry is asserted, plus two
    alignment triples: ``Organization owl:equivalentClass nco:Organization``
    and ``NewsMediaOrganization rdfs:subClassOf nco:Publisher``.

    Args:
        graph: rdflib graph to extend in place.
        org_taxonomy: Class name -> details dict with keys ``id`` and,
            optionally, ``label``, ``comment`` and ``subClassOf``.

    Returns:
        tuple[int, int]: ``(classes_added, relations_added)``. Both counters
        reflect only triples that were not already in ``graph``, so running
        the integration a second time reports ``(0, 0)``.
    """
    # Bind the prefixes used by the added triples
    graph.bind("schema", SCHEMA)
    graph.bind("nco", NCO)
    graph.bind("rdf", RDF)
    graph.bind("rdfs", RDFS)
    graph.bind("owl", OWL)

    classes_added = 0
    relations_added = 0

    def _add_if_missing(triple):
        """Add ``triple`` to the graph; return 1 if it was new, otherwise 0."""
        if triple in graph:
            return 0
        graph.add(triple)
        return 1

    print("Integrazione dell'albero delle organizzazioni...")

    # Pass 1: declare the classes
    for details in org_taxonomy.values():
        class_uri = URIRef(details['id'])

        # Skip classes already declared; their existing annotations are left alone
        if (class_uri, RDF.type, OWL.Class) in graph:
            continue

        graph.add((class_uri, RDF.type, OWL.Class))

        if details.get('label'):
            graph.add((class_uri, RDFS.label, Literal(details['label'], lang='en')))

        if details.get('comment'):
            graph.add((class_uri, RDFS.comment, Literal(details['comment'], lang='en')))

        classes_added += 1

    # Pass 2: add every hierarchical relation exactly as listed in the taxonomy
    for class_name, details in org_taxonomy.items():
        class_uri = URIRef(details['id'])

        for parent_uri_str in details.get('subClassOf', []):
            relations_added += _add_if_missing(
                (class_uri, RDFS.subClassOf, URIRef(parent_uri_str))
            )

        # Alignment with NCO; counted only when actually new, so the reported
        # total stays accurate when the cell is re-run on the same graph.
        if class_name == 'Organization':
            relations_added += _add_if_missing(
                (class_uri, OWL.equivalentClass, NCO.Organization)
            )
        elif class_name == 'NewsMediaOrganization':
            relations_added += _add_if_missing(
                (class_uri, RDFS.subClassOf, NCO.Publisher)
            )

    print(f"Integrazione completata: {classes_added} classi, {relations_added} relazioni")
    return classes_added, relations_added

# Fetch the organization taxonomy and keep it in `org_taxonomy` for the cells below
print("=== INIZIO PROCESSO DI INTEGRAZIONE ===")
org_taxonomy = get_schema_org_organization_taxonomy()

=== INIZIO PROCESSO DI INTEGRAZIONE ===
Scaricamento ontologia RDF di schema.org...
Ontologia caricata con 17231 triple
Ontologia caricata con 17231 triple
Estratte 185 classi di organizzazioni con gerarchia
Estratte 185 classi di organizzazioni con gerarchia


In [None]:
# Display obtained taxonomy
print("=== SCHEMA.ORG ORGANIZATION TAXONOMY ===")
print(f"Found {len(org_taxonomy)} organization classes")

# Show a few key classes, skipping any that are absent from the taxonomy
key_classes = ['Organization', 'Corporation', 'GovernmentOrganization', 'NewsMediaOrganization', 'NGO', 'LocalBusiness']
for class_name in key_classes:
    if class_name in org_taxonomy:
        details = org_taxonomy[class_name]
        # Check mark restored from its mis-encoded form
        print(f"✓ {details['label']} ({class_name})")

# Mostra statistiche della gerarchia
def count_hierarchy_levels(org_taxonomy):
    """Count classes by their distance from the ``Organization`` root.

    For each class other than ``Organization``, the ``subClassOf`` links are
    followed upwards one generation at a time until
    ``https://schema.org/Organization`` appears among the ancestors. The
    number of generations walked is the class's level (1 = direct subclass).
    With several parents, the shortest route to the root determines the level.

    Classes whose ancestry never reaches ``Organization`` (parents outside the
    taxonomy, or a cycle) are left out of the counts rather than being
    reported at the depth where the walk happened to stop.

    Args:
        org_taxonomy: Class name -> details dict; only the optional
            ``subClassOf`` list of parent URIs is read.

    Returns:
        dict[int, int]: Level -> number of classes found at that level.
    """
    root_uri = 'https://schema.org/Organization'
    level_counts = {}

    for class_name, details in org_taxonomy.items():
        if class_name == 'Organization':
            continue

        depth = 0
        reached_root = False
        frontier = list(details.get('subClassOf', []))
        visited = set()  # ancestor URIs already expanded; guards against cycles

        while frontier:
            depth += 1
            if root_uri in frontier:
                reached_root = True
                break

            visited.update(frontier)

            # Move one generation up, expanding each ancestor only once
            next_frontier = []
            for parent_uri in frontier:
                parent = org_taxonomy.get(parent_uri.split('/')[-1])
                if parent is None:
                    continue
                for ancestor_uri in parent.get('subClassOf', []):
                    if ancestor_uri not in visited and ancestor_uri not in next_frontier:
                        next_frontier.append(ancestor_uri)
            frontier = next_frontier

        if reached_root:
            level_counts[depth] = level_counts.get(depth, 0) + 1

    return level_counts

# Summarise how many classes sit at each level, as reported by count_hierarchy_levels
level_stats = count_hierarchy_levels(org_taxonomy)
print("\nHierarchy structure:")
for level in sorted(level_stats.keys()):
    print(f"  Level {level}: {level_stats[level]} classes")
# An empty result is reported as depth 0 rather than failing on max() of nothing
print(f"Maximum depth: {max(level_stats.keys()) if level_stats else 0} levels")

# Integrate the taxonomy into the ontology (modifies the graph `g` in place)
print("\n=== INTEGRAZIONE NELL'ONTOLOGIA NCO ===")
added_classes, added_relations = integrate_organization_taxonomy(g, org_taxonomy)

=== TASSONOMIA ORGANIZATION DI SCHEMA.ORG ===
Trovate 185 classi di organizzazioni
✓ Organization (Organization)
✓ Corporation (Corporation)
✓ GovernmentOrganization (GovernmentOrganization)
✓ NewsMediaOrganization (NewsMediaOrganization)
✓ NGO (NGO)
✓ LocalBusiness (LocalBusiness)

Struttura gerarchica:
  Livello 1: 20 classi
  Livello 2: 49 classi
  Livello 3: 115 classi
Profondità massima: 3 livelli

=== INTEGRAZIONE NELL'ONTOLOGIA NCO ===
Integrazione dell'albero delle organizzazioni...
Integrazione completata: 185 classi, 204 relazioni


In [None]:
# Ontology validation and saving
print("=== VALIDATION AND SAVING ===")

# Report how many triples have each key class as their subject
important_classes = ['Organization', 'NewsMediaOrganization', 'GovernmentOrganization', 'Corporation']
for class_name in important_classes:
    if class_name in org_taxonomy:
        class_uri = URIRef(org_taxonomy[class_name]['id'])
        triples_count = len(list(g.triples((class_uri, None, None))))
        print(f"✓ {class_name}: {triples_count} triples")

# Repair any direct subClassOf link to schema:Organization that the taxonomy
# lists but the graph does not contain
organization_uri = SCHEMA.Organization
missing_relations = 0

for class_name, details in org_taxonomy.items():
    if class_name == 'Organization':
        continue

    class_uri = URIRef(details['id'])

    # Only classes that name Organization as a direct parent are considered
    if str(organization_uri) in details.get('subClassOf', []):
        direct_relation = (class_uri, RDFS.subClassOf, organization_uri)
        if direct_relation not in g:
            g.add(direct_relation)
            missing_relations += 1

if missing_relations > 0:
    print(f"✓ Fixed {missing_relations} missing hierarchical relations")

# Remove the duplicate Organization declared under the http:// form of the
# schema.org namespace (the rest of the notebook uses https://)
duplicate_org_uri = URIRef("http://schema.org/Organization")
# NOTE(review): "PersonOrOganization" is spelled without the "r" - presumably
# this matches the identifier used in the input ontology; confirm before changing.
nco_person_org = URIRef("http://data.open.ac.uk/ontology/newsclassification/v2#PersonOrOganization")

g.remove((duplicate_org_uri, RDF.type, OWL.Class))
g.remove((duplicate_org_uri, RDFS.subClassOf, nco_person_org))

print("✓ Rimosse eventuali duplicazioni di Organization")

# Save the ontology next to the input file
output_filename = "nco2_2_schemaorgv2_0.ttl"
output_path = current_dir / output_filename

try:
    # Update the ontology metadata; set() replaces any existing value
    ontology_uri = URIRef("http://data.open.ac.uk/ontology/newsclassification/nco_schema.org")
    version_uri = URIRef("http://github.com/EnricoMotta/ontologies/newsclassification/nco+schema.org/1.1.0")
    g.set((ontology_uri, OWL.versionIRI, version_uri))

    enhanced_comment = Literal(
        "This ontology integrates the News Classification Ontology (NCO) v2.2 with Schema.Org. "
        "The purpose of this integration is to import the complete taxonomy of organizations "
        "available in Schema.Org into NCO, preserving the original hierarchical structure.",
        lang="en"
    )
    g.set((ontology_uri, RDFS.comment, enhanced_comment))

    # Write the Turtle file
    g.serialize(destination=str(output_path), format='turtle')

    # Final statistics
    total_subclasses = len(list(g.triples((None, RDFS.subClassOf, None))))
    direct_org_subclasses = len(list(g.triples((None, RDFS.subClassOf, organization_uri))))

    print(f"\n✅ PROCESSO COMPLETATO!")
    print(f"📄 File salvato: {output_path}")
    print(f"📊 Triple totali: {len(g):,}")
    print(f"📏 Dimensione file: {output_path.stat().st_size / 1024:.1f} KB")
    print(f"🏢 Relazioni gerarchiche: {total_subclasses}")
    print(f"🎯 Sottoclassi dirette di Organization: {direct_org_subclasses}")

except Exception as e:
    # Report and re-raise so a failed save is not mistaken for a completed run
    print(f"❌ Error saving: {e}")
    raise

=== VALIDAZIONE E SALVATAGGIO ===
✓ Organization: 5 triple
✓ NewsMediaOrganization: 5 triple
✓ GovernmentOrganization: 4 triple
✓ Corporation: 4 triple
✓ Rimosse eventuali duplicazioni di Organization

✅ PROCESSO COMPLETATO!
📄 File salvato: c:\Users\massi\Documents\Github\ClaimExtractionAgent\Code\Ontology\nco2_2_schemaorgv2_0.ttl
📊 Triple totali: 811
📏 Dimensione file: 42.4 KB
🏢 Relazioni gerarchiche: 203
🎯 Sottoclassi dirette di Organization: 20
