In [56]:
import numpy as np
import pandas as pd

import os
import networkx as nx
from matplotlib import pyplot as plt
from rdflib import Graph
from rdflib.plugins.sparql import prepareQuery

# Quality Assessment


#### Load Graphs

In [57]:
# Load the instance data and the ontology into ONE rdflib graph, so that the
# checks in later cells can query both files through the same `g`.
# NOTE(review): an earlier comment here referred to oxigraph, but Graph() is
# constructed without a `store=` argument, so no alternative store is selected here.
g = Graph() # no store argument is passed

g.parse('KnowledgeGraphs/knowledge_graph.ttl', format="turtle")
# This parse call is the last expression of the cell, so its return value is what the cell displays.
g.parse('KnowledgeGraphs/ontology_graph.ttl', format="turtle")


<Graph identifier=N424c195da8d04de98ba709f5f8105cb8 (<class 'rdflib.graph.Graph'>)>

#### Correctness - Check Literals



In [None]:
# Adapted from lab 6 (REFERENCE)
# Correctness check for literals: select every literal object whose datatype()
# differs from the rdfs:range declared for its predicate. Triples whose
# predicate is rdfs:label are excluded by the first FILTER.
# Every row printed below is a mismatch; an empty output means none were found.
# NOTE: `queries`, `query` and `result` are notebook-global names that later
# cells reassign and read, so this cell should be run in order.
queries ="""
prefix dbo:    <http://dbpedia.org/ontology/> 
PREFIX rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:   <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?literal ?range ?literalType
WHERE 
{ 
   ?s ?p ?literal . 
   FILTER ( isLiteral(?literal) &&  ?p != rdfs:label )
   ?p rdfs:range ?range .
   BIND (datatype(?literal) AS ?literalType)

   FILTER (?range != ?literalType)
}

"""

# Parse the SPARQL text, then evaluate the prepared query against the graph `g`.
query = prepareQuery(queries)
result = g.query(query)

# In the message, "has type" shows the declared rdfs:range of the predicate and
# "is type" shows the datatype actually carried by the literal.
for row in result:
   print(f"object {row['literal']}, has type: {row['range']}, is type: {row['literalType']} ")
    


In [38]:
# Adapted from lab 6 (REFERENCE)
# Correctness check for typed objects: for every triple whose predicate has an
# rdfs:range, report the object when none of its rdf:type values is the range
# itself or reachable from it via rdfs:subClassOf* (the FILTER NOT EXISTS).
# - Objects with no rdf:type at all are NOT reported, because the pattern
#   `?obj rdf:type ?objType` must match for a row to be produced.
# - Triples using the predicate wdp:P1552 are excluded by the last FILTER.
# NOTE: this reassigns the notebook-global `queries`, `query` and `result`.
queries ="""
prefix dbo:     <http://dbpedia.org/ontology/> 
PREFIX rdf:     <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:    <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdp: <https://www.wikidata.org/wiki/Property:>

SELECT DISTINCT ?obj ?objType ?range ?pred
WHERE {
    ?sub ?pred ?obj .

    ?pred rdfs:range ?range .

    # Check that the type of ?obj is the same as ?range
    ?obj rdf:type ?objType .
 
   FILTER NOT EXISTS { ?obj rdf:type/rdfs:subClassOf* ?range}
   FILTER(?pred != wdp:P1552)
} 
"""

# Parse the SPARQL text, then evaluate the prepared query against the graph `g`.
query = prepareQuery(queries)
result = g.query(query)

# One line per (object, declared type, expected range, predicate) violation.
for row in result:
   print(f"Object: {row['obj']}, Object Type: {row['objType']}, Range: {row['range']}, Pred: {row['pred']}")



## Conciseness  


#### Each artist and each song has exactly one label


In [39]:
# Conciseness check: every artist (instances of wd:Q639669) should carry exactly
# one rdfs:label. The query groups labels per artist and keeps only the artists
# with more than one; an empty output means the check passed.
#
# - `rdfs:` is declared explicitly so the query does not depend on whatever
#   default prefix bindings the installed rdflib version provides.
# - ?label is aggregated (COUNT / GROUP_CONCAT) because SPARQL 1.1 only allows
#   grouped or aggregated variables to be projected from a GROUP BY query.
artist1_label_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <https://www.wikidata.org/wiki/>
PREFIX wdp: <https://www.wikidata.org/wiki/Property:>

SELECT ?artist (COUNT(?label) AS ?labelCount) (GROUP_CONCAT(?label; SEPARATOR=" | ") AS ?labels)
WHERE {
  ?artist rdf:type wd:Q639669 .
  ?artist rdfs:label ?label .
}
GROUP BY ?artist
HAVING (COUNT(?label) > 1)
"""

# Prepare THIS cell's query string (not the notebook-global `queries` left over
# from an earlier cell), then evaluate it against the graph `g`.
query = prepareQuery(artist1_label_query)
result = g.query(query)

# One line per offending artist, listing all of its labels.
for row in result:
    print(f"Artist: {row['artist']} has more than one label: {row['labels']}")

In [40]:
# Conciseness check: every song (instances of wd:Q7366) should carry exactly one
# rdfs:label. The query groups labels per song and keeps only the songs with
# more than one; an empty output means the check passed.
#
# - `rdfs:` is declared explicitly so the query does not depend on whatever
#   default prefix bindings the installed rdflib version provides.
# - ?label is aggregated (COUNT / GROUP_CONCAT) because SPARQL 1.1 only allows
#   grouped or aggregated variables to be projected from a GROUP BY query.
song1_label_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <https://www.wikidata.org/wiki/>
PREFIX wdp: <https://www.wikidata.org/wiki/Property:>

SELECT ?song (COUNT(?label) AS ?labelCount) (GROUP_CONCAT(?label; SEPARATOR=" | ") AS ?labels)
WHERE {
  ?song rdf:type wd:Q7366 .
  ?song rdfs:label ?label .
}
GROUP BY ?song
HAVING (COUNT(?label) > 1)
"""

# Prepare THIS cell's query string (not the notebook-global `queries` left over
# from an earlier cell), then evaluate it against the graph `g`.
query = prepareQuery(song1_label_query)
result = g.query(query)

# One line per offending song, listing all of its labels.
for row in result:
    print(f"Song: {row['song']} has more than one label: {row['labels']}")
    

#### Typing check - All subjects should have a type declared


In [53]:
# Typing check: find subjects that have NO rdf:type triple at all
# (FILTER NOT EXISTS { ?sub a ?st }) although they are used with a predicate
# that declares an rdfs:domain. The domain is reported as the type the subject
# would be expected to have. An empty output means no such subject was found.
# NOTE: this reassigns the notebook-global `queries`, `query` and `result`.
queries = """
prefix dbo:     <http://dbpedia.org/ontology/> 
PREFIX rdf:     <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:    <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?sub ?domain ?pred 
WHERE {
    ?sub ?pred ?o .
    FILTER NOT EXISTS{ ?sub a ?st}
    ?pred rdfs:domain ?domain .
    
    
 
    
} 
"""
   # Disabled SPARQL patterns kept for reference; they sit outside the query
   # string, so they are plain Python comments and have no effect:
   # ?domain rdfs:label ?domainLabel.
   #  ?sub rdfs:label ?subLabel .
   #  ?pred rdfs:label ?predLabel .
   #  ?o rdfs:label ?oLabel .
 
# Parse the SPARQL text, then evaluate the prepared query against the graph `g`.
query = prepareQuery(queries)
result = g.query(query)

# One line per (untyped subject, expected domain, predicate) combination.
for row in result:
   print(f"Subject: {row['sub']} has no type. And it should be of type {row['domain']}, according to the predicate {row['pred']}")

#### Check for missing values

In [58]:
# Adapted from lab 6 (REFERENCE)
# Missing-value check: find literals that are really placeholders for missing
# data. Each literal is converted to a lower-cased string and reported when it
# equals "nan", "" (empty string) or "none".
# An empty output means no placeholder literals were found.
# NOTE: this reassigns the notebook-global `queries`, `query` and `result`.
queries = """
prefix dbo:    <http://dbpedia.org/ontology/> 
PREFIX rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:   <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?s ?pred ?literalVal
WHERE 
{ 
   ?s ?pred ?literal . 
   FILTER ( isLiteral(?literal))
   BIND (lcase(str(?literal)) AS ?literalVal)
   FILTER (?literalVal IN ("nan", "", "none" ))
}

"""

# Parse the SPARQL text, then evaluate the prepared query against the graph `g`.
query = prepareQuery(queries)
result = g.query(query)

# The printed value is the lower-cased string form, not the original literal.
for row in result:
   print(f"Predicate {row['pred']}, has a placeholder value {row['literalVal']} for subject {row['s']}")
   

## Semantic Accuracy - Check for inconsistencies

#### Make sure that numeric values are not negative (like Sales, numberOfChildren etc.)

In [59]:
# Semantic-accuracy check: report every xsd:double value that is below zero,
# together with the labels of the resource and of the property it belongs to.
# Only resources and properties that have an rdfs:label can appear in the
# output, because both label patterns are required by the query.
# An empty output means no negative xsd:double values were found.
#
# `rdfs:` is declared explicitly so the query does not depend on whatever
# default prefix bindings the installed rdflib version provides.
check_negative_values = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?resource ?resLabel ?property ?propLabel ?value
WHERE {
    ?resource ?property ?value .
    
    ?resource rdfs:label ?resLabel .
    ?property rdfs:label ?propLabel .
    FILTER(datatype(?value) = xsd:double)
    FILTER(?value < 0)
}
"""

# Prepare THIS cell's query string (not the notebook-global `queries` left over
# from an earlier cell), then evaluate it against the graph `g`.
query = prepareQuery(check_negative_values)
result = g.query(query)

for row in result:
    print(f"Resource: {row['resLabel']} has a negative value {row['value']} for property {row['propLabel']}")