# Padova Grand Tour - Sparql Ingester

This notebook will generate a `sparql.ttl` turtle file with the data obtained by the `sparql/SparqlRetriever.ipynb` notebook.

Setup graph:

In [91]:
import os
from pathlib import Path

from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF, XSD, schema.org
from rdflib.namespace import FOAF, XSD, SDO
from rdflib.collection import Collection

# Main namespace
PGT = Namespace("https://padovagrandtour.github.io/entitites#")
# Be careful! the "simple GEO" namespace is not the same as the "advanced GEO" namespace exported by rdflib
GEO = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")

# Saving folder
savePath =  path + "/data/ttlData/"
os.makedirs(savePath, exist_ok=True)


# Bind namespaces
g = Graph()

g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("pgt", PGT)
g.bind("sdo", SDO)
g.bind("geo", GEO)


In [92]:
import json
# Read Json files
with open(path + '/sparql/categories.json') as f:
    categoriesJSON = json.load(f)

with open(path + '/sparql/museumProperties.json') as f:
    museumPropertiesJSON = json.load(f)

with open(path + '/sparql/museumThings.json') as f:
    museumThingsJSON = json.load(f)

with open(path + '/sparql/properties.json') as f:
    propertiesJSON = json.load(f)

with open(path + '/sparql/things.json') as f:
    thingsJSON = json.load(f)


In [93]:
# Add to graph while doing standard normalizations
def addToG(subject, predicate, obj, key, datatype):
    try: # Catch keyerrors
        if(obj[key] == obj[key]):   # check for NaN values
            if(datatype == XSD.string):
                g.add((subject, predicate, Literal(obj[key].strip(), datatype=datatype)))   
            elif(datatype == XSD.float):
                # change italian commas , to international point . for number
                g.add((subject, predicate, Literal(obj[key].strip().replace(",", "."), datatype=datatype)))   
            else:
                g.add((subject, predicate, Literal(obj[key], datatype=datatype)))    
    except Exception as e:
        pass


In [94]:
museumURLDict = {}
museumNameDict = {}


SITEindex = 0




for museumURL, siteData in museumPropertiesJSON.items():
    museumData = {}
    for [akey, aval] in siteData[0]:
        museumData[akey] = aval
    
    if(museumData['name'] in museumNameDict.keys()):
        print("skipping (already inserted)", museumData['name'])
        museumURLDict[museumURL] = museumNameDict[museumData['name']]
    else:

        # Setup museum node
        museumURLDict[museumURL] = "SITEsq" +  str(SITEindex)
        museumNameDict[museumData['name']] = "SITEsq" +  str(SITEindex)
        print("inserting", museumURL, museumData['name'])
        Museum = URIRef(PGT["SITEsq" +  str(SITEindex)])
        SITEindex = SITEindex + 1
        g.add((Museum, RDF.type, PGT.Museum))

        # Add museum properties
        g.add((Museum, SDO.url, Literal(museumURL, datatype=SDO.URL)))  
        addToG(Museum, SDO.name, museumData, 'name', datatype=XSD.string)
        addToG(Museum, SDO.description, museumData, 'desc', datatype=RDF.HTML)
        addToG(Museum, SDO.image, museumData, 'img', datatype=SDO.URL)
        addToG(Museum, GEO['lat'], museumData, 'lat', datatype=XSD.float)
        addToG(Museum, GEO['long'], museumData, 'long', datatype=XSD.float)



inserting http://dati.beniculturali.it/iccd/cf/resource/CulturalInstituteOrSite/1469799782221 Palazzo Folco
inserting https://w3id.org/arco/resource/Site/7cd721378d4eed24c5285df08594b4fc Palazzo Dolco
inserting http://dati.beniculturali.it/iccd/cf/resource/CulturalInstituteOrSite/1469744914369 Convento degli Eremitani
inserting https://w3id.org/arco/resource/Site/62a37ce270999c1ee4b1ba1cb472939b Università di Padova
inserting http://dati.beniculturali.it/iccd/cf/resource/CulturalInstituteOrSite/ICCD_CF_8076015143451 Palazzina del prefetto (ex)
inserting http://dati.beniculturali.it/iccd/cf/resource/CulturalInstituteOrSite/ICCD_CF_6156344978451 Complesso Ingegneria
inserting http://dati.beniculturali.it/iccd/cf/resource/CulturalInstituteOrSite/ICCD_CF_4223302714451 Palazzo ECA (ex)
inserting http://dati.beniculturali.it/iccd/cf/resource/CulturalInstituteOrSite/ICCD_CF_0160885714451 Complesso Cavalli
skipping (already inserted) Complesso Cavalli
inserting http://dati.beniculturali.it/icc

In [95]:

ARTWORKindex = 0


for artworkURL, artworkRawData in propertiesJSON.items():
    artworkData = {}
    for [akey, aval] in artworkRawData[0]:
        artworkData[akey] = aval
   

    # Setup museum node
    Artwork = URIRef(PGT["ARTWORKsq" +  str(ARTWORKindex)])
    ARTWORKindex = ARTWORKindex + 1
    g.add((Artwork, RDF.type, PGT.Artwork))

    # Add museum properties
    g.add((Artwork, SDO.url, Literal(artworkURL, datatype=SDO.URL)))  
    addToG(Artwork, SDO.name, artworkData, 'name', datatype=XSD.string)

    # Description (just pick the longest one)
    bestDesc = ""
    if(("desc" in artworkData) and (artworkData['desc'] == artworkData['desc']) and len(artworkData['desc']) > len(bestDesc)):
        bestDesc = artworkData['desc']
    
    if(("desc2" in artworkData) and (artworkData['desc2'] == artworkData['desc2']) and len(artworkData['desc2']) > len(bestDesc)):
        bestDesc = artworkData['desc2']

    if(("desc3" in artworkData) and (artworkData['desc3'] == artworkData['desc3']) and len(artworkData['desc3']) > len(bestDesc)):
        bestDesc = artworkData['desc3']

    if(len(bestDesc) > 0): g.add((Artwork, SDO.description, Literal(bestDesc, datatype=RDF.HTML)))  


    # Img (just pick the longest one)
    bestImg = ""
    if(("img" in artworkData) and (artworkData['img'] == artworkData['img']) and len(artworkData['img']) > len(bestImg)):
        bestImg = artworkData['img']
    
    if(("img2" in artworkData) and (artworkData['img2'] == artworkData['img2']) and len(artworkData['img2']) > len(bestImg)):
        bestImg = artworkData['img2']

    if(("img3" in artworkData) and (artworkData['img3'] == artworkData['img3']) and len(artworkData['img3']) > len(bestImg)):
        bestImg = artworkData['img3']

    if(len(bestImg) > 0): g.add((Artwork, SDO.image, Literal(bestImg, datatype=SDO.URL)))  

    # This will never fail, due to the way we selected the data in the first place
    # i.e., we are just considering artwork with an existing site in Padova 
    g.add((Artwork, PGT['hasSite'], URIRef(PGT[museumURLDict[artworkData['site']]])))

    addToG(Artwork, PGT['conservationState'], artworkData, 'conservation', datatype=XSD.string)
    addToG(Artwork, SDO.material, artworkData, 'material', datatype=XSD.string)

    if(('tag' in artworkData) and (artworkData['tag'] == artworkData['tag']) and (len(artworkData['tag']) > 0)):
        g.add((Artwork, PGT['tag'], Literal(artworkData['tag'].strip().lower().replace("/ ", "/"), datatype=XSD.string)))  

    addToG(Artwork, PGT['width'], artworkData, 'width', datatype=XSD.float)
    addToG(Artwork, PGT['length'], artworkData, 'length', datatype=XSD.float)
    addToG(Artwork, PGT['height'], artworkData, 'height', datatype=XSD.float)
    addToG(Artwork, PGT['diameter'], artworkData, 'diameter', datatype=XSD.float)
    addToG(Artwork, PGT['thickness'], artworkData, 'thickness', datatype=XSD.float)
    addToG(Artwork, PGT['depth'], artworkData, 'depth', datatype=XSD.float)





Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#float, Converter=<class 'float'>
Traceback (most recent call last):
  File "/home/luca-fabbian/.local/lib/python3.10/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
ValueError: could not convert string to float: '16-23'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#float, Converter=<class 'float'>
Traceback (most recent call last):
  File "/home/luca-fabbian/.local/lib/python3.10/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
ValueError: could not convert string to float: '8-11'


In [96]:


g.serialize(destination=savePath+"sparql.ttl", format='turtle')




<Graph identifier=N28b9996c3fb64a869fa8e4d9904caa28 (<class 'rdflib.graph.Graph'>)>