# Padova Grand Tour - Scraped Data Ingester

This notebook will read the scraped data stored inside `scraper/results` and generate a `scraped.ttl` turtle file formatted according to our ontology.

Load the scraped JSON files:

In [1]:
import os
from pathlib import Path
import re
import json

# Parent of the current working directory; every input and output location
# below is built by appending to this string.
path = str(Path(os.path.abspath(os.getcwd())).parent)

In [2]:
# Read the scraper output.
# The encoding is passed explicitly: without it `open` decodes with the
# platform's locale encoding, which is not UTF-8 everywhere, while JSON
# files are UTF-8 by specification.
with open(path + '/scraper/results/padovamuseicivici.json', encoding='utf-8') as f:
    museicivici = json.load(f)

with open(path + '/scraper/results/wikipedia-categories.json', encoding='utf-8') as f:
    wikicategories = json.load(f)

Fix missing/broken data:

In [3]:
# Hand-entered coordinates in degrees/minutes/seconds notation.
# The two lists are parallel: entry i of 'latitude' pairs with entry i of
# 'longitude'. The populate loop reads them by site position.
manualLocations = {
    'latitude': [
        '45°24′42.54″N',
        '45°24′39.46″N',
        '45°24′39.46″N',
        '45°24′41.65″N',
        '45°24′28.25″N',
    ],
    'longitude': [
        '11°52′46.33″E',
        '11°52′48.04″E',
        '11°52′48.04″E',
        '11°52′41.23″E',
        '11°52′36.93″E',
    ],
}

# Museum page URL -> Wikipedia page URL for a site present in both sources
siteOverlaps = dict([
    ('https://padovamusei.it/it/musei/museo-bottacin', 'https://it.wikipedia.org/wiki/Museo_Bottacin'),
])
# Wikipedia page URL -> graph node already created for the matching museum
# (starts empty, filled while the museums are added to the graph)
siteNodes = dict()


Setup graph:

In [4]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF, XSD, schema.org
from rdflib.namespace import FOAF, XSD, SDO
from rdflib.collection import Collection

# Namespace under which this notebook mints its own terms and nodes
PGT = Namespace("https://padovagrandtour.github.io/entitites#")
# WGS84 "basic geo" vocabulary, declared by hand on purpose: it is a different
# namespace from the GEO one that rdflib itself exports
GEO = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")

# Output folder for the generated turtle file (created if it does not exist)
savePath =  path + "/data/ttlData/"
os.makedirs(savePath, exist_ok=True)

# Create the graph and register the prefixes used when serializing it
g = Graph()
for _prefix, _namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("pgt", PGT),
    ("sdo", SDO),
    ("geo", GEO),
):
    g.bind(_prefix, _namespace)


Helpers:

In [5]:
# Extract the approximate year that a free-text date roughly refers to
import roman

def _romanValue(match, group=1):
    """Return the integer value of the Roman numeral captured in `group`."""
    return roman.fromRoman(match.group(group))


def _romanPairSum(match):
    """Return the sum of the Roman numerals captured in groups 1 and 2."""
    return _romanValue(match, 1) + _romanValue(match, 2)


def _normalizeDate(date):
    """Apply the textual clean-up steps, in their original order, to a raw date string."""
    # Drop every spelling of "secolo". Removing the bare 'sec' also truncates
    # words such as "seconda" -> "onda" / "secondo" -> "ondo", which is why the
    # rules below look for 'onda metà', 'ondo quarto' and 'Primo-ondo decennio'.
    for old, new in (
        ('Il secolo', 'II secolo'),
        ('sec.', 'sec'),
        ('secolo', 'sec'),
        ('Sec.', 'sec'),
        ('Secolo', 'sec'),
        ('sec', ''),
    ):
        date = date.replace(old, new)
    date = date.strip()

    # Unify the BC marker, drop the AD marker and fix known typos
    for old, new in (
        ('A.C.', 'a.C.'),
        ('a. C.', 'a.C.'),
        ('d.C.', ''),
        ('d.C', ''),
        (',', ''),
        ('Ll-lll', 'LI-III'),
        ('decenio', 'decennio'),
    ):
        date = date.replace(old, new)

    # Remove parenthesised remarks
    date = re.sub(r"\([^)]+\)", "", date)

    # Rewrite 'L' / 'l' look-alikes as the Roman digit 'I'. This also turns
    # "millennio" into "miIIennio", which the last rule relies on.
    for old, new in (
        ('L', 'I'),
        ('Il', 'II'),
        ('lll', 'III'),
        ('ll', 'II'),
    ):
        date = date.replace(old, new)

    # Collapse runs of whitespace
    return ' '.join(date.split())


# Normalized strings that are mapped to a hand-picked year.
_SPECIAL_DATES = {
    'Epoca romana': 100,
    'Epoca imperiale': 100,
    'Prima età imperiale': 40,
    '1720-1730 montatura; 1820-1830 pagina': 1770,
    'Età tiberio-claudia - II': 150,
    'Epoca imperiale o tardo-antica': 40,
    'Età romana inizio': 40,
    'Tra il 27 a.C. e il 68': 40,
    'Tra il II a.C. e il 165 a.C.': -190,
    'Metà I - II': 180,
    'Ultimo quarto I a.C. - metà I': 10,
    'Tra il fine I a.C. e il metà I': 40,
    'Fine I a.C. - prima metà I': 40,
    'Seconda metà I - II': 180,
}

# Ordered (pattern, formula) rules: the first pattern found in the normalized
# string decides the result, so the order is significant. Patterns are raw
# strings so that '\s' reaches the regex engine instead of being treated as an
# (invalid) Python string escape.
_DATE_RULES = tuple((re.compile(pattern), compute) for pattern, compute in (
    (r'^([XIV]+?) a.C. primo quarto$', lambda m: _romanValue(m) * -100 + 10),
    # NOTE(review): a dynasty number is turned into a positive (AD) year here;
    # confirm this is the intended encoding.
    (r'Nuovo Regno - ([XIV]+?) dinastia', lambda m: _romanValue(m) * 100 + 10),
    (r'Fine del I a.C. - inizio del I', lambda m: 0),

    # Ranges of two centuries
    (r'^([XIV]+?)\s?[/-]\s?([XIV]+?) fine/inizio$', lambda m: _romanPairSum(m) * 100 / 2),
    (r'^Prima metà ([XIV]+?) -prima metà ([XIV]+?)$', lambda m: _romanPairSum(m) * 100 / 2),
    (r'^([XIV]+?)\s?[/-]\s?([XIV]+?)$', lambda m: _romanPairSum(m) * 100 / 2),
    # Both centuries are BC, so the result is negative, like every other
    # 'a.C.' rule (this pair used to return a positive year).
    (r'^([XIV]+?)\s?[/-]\s?([XIV]+?) a.C.$', lambda m: _romanPairSum(m) * -100 / 2),
    (r'^([XIV]+?)\s?[/-]\s?([XIV]+?) a.C. fine/inizio$', lambda m: _romanPairSum(m) * -100 / 2),

    (r'Metà I a.C. - metà I', lambda m: 0),

    (r'^([XIV]+?) a.C. inizio$', lambda m: _romanValue(m) * -100 + 10),
    (r'^([XIV]+?) inizio$', lambda m: _romanValue(m) * 100 - 90),
    (r'^Metà ([XIV]+?)$', lambda m: _romanValue(m) * 100 - 50),
    (r'^Fine ([XIV]+?) fine$', lambda m: _romanValue(m) * 100 - 25),
    (r'^([XIV]+?) fine$', lambda m: _romanValue(m) * 100 - 25),
    (r'Primo-ondo decennio ([XIV]+?)$', lambda m: _romanValue(m) * 100 - 85),

    (r'^([XIV]+?) a.C. prima metà$', lambda m: _romanValue(m) * -100 + 25),
    (r'^([XIV]+?) a.C. metà$', lambda m: _romanValue(m) * -100 + 50),
    (r'^([XIV]+?) a.C. onda metà$', lambda m: _romanValue(m) * -100 + 75),
    (r'^([XIV]+?) a.C.\s?[/-]\s?([XIV]+?) fine/inizio$', lambda m: _romanPairSum(m) * -100 / 2),
    (r'^([XIV]+?) a.C.\s?[/-]\s?([XIV]+?)$', lambda m: _romanPairSum(m) * -100 / 2),

    (r'^([XIV]+?) a.C.$', lambda m: _romanValue(m) * -100 + 50),
    (r'^([XIV]+?)$', lambda m: _romanValue(m) * 100 - 50),

    # Greedy groups: with lazy ones and no end anchor the second number was
    # cut down to its first digit ("... e il 1600" was read as 1).
    (r'Tra il ([0-9]+) e il ([0-9]+)', lambda m: (int(m.group(1)) + int(m.group(2))) / 2),

    (r'^([XIV]+?) a.C. fine$', lambda m: _romanValue(m) * -100 + 90),
    (r'^([XIV]+?) prima metà$', lambda m: _romanValue(m) * 100 - 75),

    # NOTE(review): the rules below treat an Arabic number as a century index
    # (n -> (n - 1) * 100), unlike the 'Tra il N e il M' rule above which
    # averages the numbers as years. Confirm against the data which is meant.
    (r'^([0-9]+?) a.C.$', lambda m: (int(m.group(1)) - 1) * -100),
    (r'^([0-9]+?)/([0-9]+?)/([0-9]+?)$', lambda m: (int(m.group(1)) - 1) * 100),
    # Parenthesis fixed: was (a + b * 100) / 2; now mirrors the Roman range formula.
    (r'^([0-9]+?)-([0-9]+?)$', lambda m: (int(m.group(1)) + int(m.group(2))) * 100 / 2),
    (r'^Post ([0-9]+?)$', lambda m: int(m.group(1)) * 100),
    (r'^([0-9]+?)$', lambda m: (int(m.group(1)) - 1) * 100),
    (r'^([0-9]+?) circa$', lambda m: (int(m.group(1)) - 1) * 100),
    (r'^Tra il ([0-9]+?) e il$', lambda m: int(m.group(1)) * 100),

    (r'^([XIV]+?) primo quarto$', lambda m: _romanValue(m) * 100 - 90),
    (r'^([XIV]+?) ondo quarto$', lambda m: _romanValue(m) * 100 - 65),
    (r'^([XIV]+?) terzo quarto$', lambda m: _romanValue(m) * 100 - 40),
    (r'^([XIV]+?) ultimo quarto$', lambda m: _romanValue(m) * 100 - 15),

    (r'^Fine ([XIV]+?)$', lambda m: _romanValue(m) * 100 - 10),

    (r'^([XIV]+?) onda metà$', lambda m: _romanValue(m) * 100),
    (r'^([XIV]+?) metà$', lambda m: _romanValue(m) * 100),
    (r'^Metà ([XIV]+?) metà$', lambda m: _romanValue(m) * 100),
    (r'^Seconda metà ([XIV]+?)$', lambda m: _romanValue(m) * 100),
    (r'^Dopo la metà del ([XIV]+?)$', lambda m: _romanValue(m) * 100 - 25),

    (r'^([XIV]+?) miIIennio a.C. onda metà$', lambda m: _romanValue(m) * -1000 + 750),
))


def parseDate(date):
    """Turn a free-text date into an approximate year.

    The text is normalized, compared against a table of hand-mapped
    expressions, and then tried against an ordered list of regular
    expressions; the first one found supplies the formula. Rules containing
    'a.C.' produce negative years.

    Parameters:
        date: the raw date text. Anything that is not a string is treated as
            missing.

    Returns:
        An int or float year, or NaN when the value is missing, marked as
        undetermined (contains 'on determi'), or not recognised. Unrecognised
        text is also reported with a "[ERROR]" line on stdout. Returning NaN
        instead of falling through with None lets callers rely on a single
        `x == x` check.

    Raises:
        Whatever `roman.fromRoman` raises when a captured numeral is not a
        valid Roman number.
    """
    if not isinstance(date, str):
        return float('nan')

    date = _normalizeDate(date)

    # undetermined or missing
    if date == '' or 'on determi' in date:
        return float('nan')

    # special encoded values
    if date in _SPECIAL_DATES:
        return _SPECIAL_DATES[date]

    for pattern, compute in _DATE_RULES:
        match = pattern.search(date)
        if match:
            return compute(match)

    # if no matches, print an error
    print("[ERROR] " + date)
    return float('nan')





In [6]:
# Add to graph while doing standard normalizations
def addToG(subject, predicate, obj, key, datatype):
    """Add `obj[key]` to the graph `g` as a typed literal, when it is usable.

    Nothing is added (and nothing is raised) when the key is absent, `obj`
    cannot be indexed, or the value is None or NaN. For `XSD.string` the value
    must be a string and is stripped of surrounding whitespace; non-string
    values are skipped, as they were before.

    Parameters:
        subject: node the triple is about.
        predicate: property of the triple.
        obj: mapping holding the scraped values.
        key: key to read from `obj`.
        datatype: datatype given to the literal.

    Returns:
        None.
    """
    # Only the lookup is allowed to fail silently; errors raised while
    # building the triple are no longer swallowed.
    try:
        value = obj[key]
    except (KeyError, IndexError, TypeError):
        return

    # None used to slip through the NaN check and become a literal
    if value is None or value != value:
        return

    if datatype == XSD.string:
        if not isinstance(value, str):
            return
        value = value.strip()

    g.add((subject, predicate, Literal(value, datatype=datatype)))

In [7]:
#Convert latitude/longitude from degrees to float
def deg2float(lat):
    """Convert a degrees/minutes/seconds coordinate into signed decimal degrees.

    Parameters:
        lat: text such as '45°24′42.54″N'. Degrees, minutes and seconds are
            separated by '°', '′' and '″' (ASCII ' and " are accepted too) and
            followed by a hemisphere letter. Despite the name, longitudes are
            handled the same way.

    Returns:
        The coordinate as a float; negative for hemisphere 'S', 'O' or 'W'
        ('O' is presumably the Italian "Ovest"), positive otherwise.

    Raises:
        ValueError: when the text does not split into exactly four parts or a
            part is not a number.
    """
    deg, minutes, seconds, direction = re.split(r'[°′″\'"]', lat.strip())

    def toNumber(text):
        # Accept a decimal comma as well as a decimal point
        return float(text.strip().replace(',', '.'))

    magnitude = toNumber(deg) + toNumber(minutes)/60 + toNumber(seconds)/(60*60)
    # Whitespace and letter case around the hemisphere must not flip the sign
    sign = -1 if direction.strip().upper() in ('O', 'S', 'W') else 1
    return magnitude * sign


Main "populate graph" loop:

In [8]:

SITEindex = 0
COLLECTIONindex = 0
ARTWORKindex = 0

# Keyword looked for inside a "Dimensioni" label -> name of the PGT property
# that receives the number following that label
dimensionProperties = {
    'larghezza': 'width',
    'lunghezza': 'length',
    'altezza': 'height',
    'diametro': 'diameter',
    'spessore': 'thickness',
    'profondità': 'depth',
}

for museumURL, museumData in museicivici.items():
    # Setup museum node
    Museum = URIRef(PGT["SITE" +  str(SITEindex)])
    SITEindex = SITEindex + 1
    if(museumURL == 'https://cappellascrovegni.padovamusei.it'):
        g.add((Museum, PGT['hasCategory'], PGT["CSCategoryChurch"]))
    else:
        g.add((Museum, PGT['hasCategory'], PGT["CSCategoryMuseum"]))

    # Remember the node of museums that also appear among the wikipedia sites,
    # keyed by their wikipedia URL
    if(museumURL in siteOverlaps):
        siteNodes[siteOverlaps[museumURL]] = Museum
    # Add museum properties
    g.add((Museum, SDO.url, Literal(museumURL, datatype=SDO.URL)))
    g.add((Museum, RDF.type, PGT["CulturalSite"]))

    addToG(Museum, SDO.name, museumData, 'name', datatype=XSD.string)
    addToG(Museum, SDO.description, museumData, 'description', datatype=RDF.HTML)
    addToG(Museum, SDO.image, museumData, 'img', datatype=SDO.URL)
    # Location.
    # NOTE(review): the manual coordinates are matched to museums purely by
    # position, so they depend on the order of the entries in the JSON file
    # and on there being at least one coordinate pair per museum.
    g.add((Museum, GEO['lat'], Literal(deg2float(manualLocations['latitude'][SITEindex - 1]), datatype=XSD.float)))
    g.add((Museum, GEO['long'], Literal(deg2float(manualLocations['longitude'][SITEindex - 1]), datatype=XSD.float)))

    for collectionURL, collectionData in museumData['collections'].items():
        # Setup collection node
        MCollection = URIRef(PGT["COLLECTION" +  str(COLLECTIONindex)])
        COLLECTIONindex = COLLECTIONindex + 1
        g.add((MCollection, RDF.type, PGT.Collection))
        # Add Collection properties
        g.add((MCollection, SDO.url, Literal(collectionURL, datatype=SDO.URL)))
        addToG(MCollection, SDO.name, collectionData, 'name', datatype=XSD.string)
        addToG(MCollection, SDO.description, collectionData, 'description', datatype=RDF.HTML)
        addToG(MCollection, SDO.image, collectionData, 'img', datatype=SDO.URL)

        for artworkURL, artworkData in collectionData['artworks'].items():
            # Setup artwork node
            Artwork = URIRef(PGT["ARTWORK" +  str(ARTWORKindex)])
            ARTWORKindex = ARTWORKindex + 1
            g.add((Artwork, RDF.type, PGT.Artwork))

            fields = artworkData['fields']

            # Simple fields
            g.add((Artwork, SDO.url, Literal(artworkURL, datatype=SDO.URL)))
            g.add((Artwork, PGT['hasCollection'], MCollection))
            g.add((Artwork, PGT['hasSite'], Museum))
            addToG(Artwork, SDO.name, artworkData, 'name', datatype=XSD.string)
            addToG(Artwork, SDO.description, artworkData, 'description', datatype=RDF.HTML)
            addToG(Artwork, SDO.image, artworkData, 'img', datatype=SDO.URL)
            addToG(Artwork, PGT['placing'], fields, 'Collocazione', datatype=XSD.string)
            addToG(Artwork, PGT['conservationState'], fields, 'Stato di conservazione', datatype=XSD.string)
            addToG(Artwork, SDO.author, fields, 'Autore', datatype=XSD.string)
            addToG(Artwork, SDO.material, fields, 'Materiale e Tecnica', datatype=XSD.string)

            # Parse date. A missing or non-text "Cronologia" is skipped
            # instead of aborting the whole run.
            cronologia = fields.get('Cronologia')
            if isinstance(cronologia, str):
                parsedDate = parseDate(cronologia)
                # Skip NaN ("unknown") and None ("no rule matched") results
                if(parsedDate is not None and parsedDate == parsedDate):
                    g.add((Artwork, PGT['yearCreatedText'], Literal(cronologia, datatype=XSD.string)))
                    # NOTE(review): XSD.year is not one of the XML Schema
                    # built-in datatypes (the year type is gYear) - confirm
                    # against the ontology before changing it.
                    g.add((Artwork, PGT['yearCreated'], Literal(parsedDate, datatype=XSD.year)))

            # Tags (absent, NaN or non-list values are ignored)
            tags = artworkData.get('tags')
            if isinstance(tags, list):
                for tag in tags:
                    if isinstance(tag, str):
                        g.add((Artwork, PGT['tag'], Literal(tag.strip().lower().replace("/ ", "/"), datatype=XSD.string)))

            # Dimensions: keep the raw text and extract every "label: number"
            # pair whose label contains a known keyword. The explicit type
            # check replaces a bare `except: pass`.
            dimensioni = fields.get('Dimensioni')
            if isinstance(dimensioni, str):
                g.add((Artwork, PGT['dimensionsText'], Literal(dimensioni, datatype=XSD.string)))
                for (dimName, dimValue) in re.findall(r'([^\s]+)\s*:\s*(\d+\.?\d*)', dimensioni):
                    dimName = dimName.strip().lower()
                    for keyword, propertyName in dimensionProperties.items():
                        if(keyword in dimName):
                            g.add((Artwork, PGT[propertyName], Literal(float(dimValue), datatype=XSD.float)))




Add locations and other sites:

In [9]:
# Wikipedia category page -> suffix of the PGT "CSCategory..." term used for
# the sites listed in that category
categoryNames = {
    'https://it.wikipedia.org/wiki/Categoria:Musei_di_Padova':'Museum',
    'https://it.wikipedia.org/wiki/Categoria:Castelli_di_Padova':'Castle',
    'https://it.wikipedia.org/wiki/Categoria:Chiese_di_Padova':'Church',
    'https://it.wikipedia.org/wiki/Categoria:Basiliche_di_Padova':'Basilica',
    'https://it.wikipedia.org/wiki/Categoria:Mura_e_porte_di_Padova':'WallOrGate',
    'https://it.wikipedia.org/wiki/Categoria:Palazzi_di_Padova':'Palace',
    'https://it.wikipedia.org/wiki/Categoria:Logge_di_Padova':'Loggia',
    'https://it.wikipedia.org/wiki/Categoria:Ponti_di_Padova':'Bridge'
}


for categoryURL, categoryData in wikicategories.items():
    # None when the category has no mapping (replaces a bare try/except)
    categoryName = categoryNames.get(categoryURL)

    for siteURL, siteData in categoryData.items():
        # Skip URLs containing 'andbox' (presumably wiki sandbox pages)
        if('andbox' in siteURL): continue

        if(siteURL in siteNodes):
            # Same place as a museum that is already in the graph: reuse its
            # node. Checking siteNodes rather than siteOverlaps avoids a
            # KeyError when an overlap is declared but the museum was never
            # loaded; such a site simply gets a node of its own.
            Site = siteNodes[siteURL]
        else:
            Site = URIRef(PGT["SITE" +  str(SITEindex)])
            SITEindex = SITEindex + 1
            g.add((Site, RDF.type, PGT["CulturalSite"]))
            # Add a category if a mapping exists
            if(categoryName is not None):
                g.add((Site, PGT['hasCategory'], PGT["CSCategory" + categoryName]))

        g.add((Site, SDO.url, Literal(siteURL, datatype=SDO.URL)))
        addToG(Site, SDO.name, siteData, 'name', datatype=XSD.string)
        addToG(Site, SDO.description, siteData, 'description', datatype=RDF.HTML)
        # Coordinates are optional; only textual values can be converted
        if(isinstance(siteData.get('latitude'), str)):
            g.add((Site, GEO['lat'], Literal(float(deg2float(siteData['latitude'])), datatype=XSD.float)))
        if(isinstance(siteData.get('longitude'), str)):
            g.add((Site, GEO['long'], Literal(float(deg2float(siteData['longitude'])), datatype=XSD.float)))

        addToG(Site, SDO.image, siteData, 'img', datatype=SDO.URL)


        


SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' (4050894707.py, line 23)

Save data:

In [None]:
# Write the whole graph as Turtle into the output folder
g.serialize(destination=f"{savePath}scraped.ttl", format="turtle")

<Graph identifier=N5dbf2dacfc2d4bd191adcaae3659c3fd (<class 'rdflib.graph.Graph'>)>