# Padova Grand Tour - Scraped Data Ingester

This notebook reads the scraped data and writes a `scraped.ttl` file formatted according to our ontology.

Import JSON files:

In [35]:
import os
from pathlib import Path
import re
import json

# Absolute, normalised working directory of the kernel, kept as a plain string
# because the cells below build file locations by string concatenation.
path = str(Path(os.path.abspath(os.curdir)))

In [36]:
# Read JSON files
# The files are decoded explicitly as UTF-8 (the JSON interchange encoding) so
# that accented Italian text is read the same way on every platform instead of
# depending on the locale's default encoding.
with open(path + '/scraper/results/padovamuseicivici.json', encoding='utf-8') as f:
    museicivici = json.load(f)

with open(path + '/scraper/results/wikipedia-categories.json', encoding='utf-8') as f:
    locations = json.load(f)

Fix missing/broken data:

Setup graph:

In [37]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF, XSD, schema.org
from rdflib.namespace import FOAF, XSD, SDO
from rdflib.collection import Collection

# Project namespace used for every node and custom predicate minted below
PGT = Namespace("https://padovagrandtour.github.io/entitites#")

# Output folder for the generated Turtle file (created on first run)
savePath = path + "/data/ttlData/"
os.makedirs(savePath, exist_ok=True)

# Graph that the rest of the notebook populates, with readable prefixes bound
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("pgt", PGT),
    ("sdo", SDO),
):
    g.bind(prefix, namespace)


Helpers:

In [38]:
# Extract an approximate year from the free-text dating of an artwork
import roman

# Substring clean-ups applied, in this exact order, before any rule is tried.
# Removing every 'sec' also turns 'seconda'/'secondo' into 'onda'/'ondo',
# which is why several rules below match those truncated words.
_CENTURY_WORD_FIXES = (
    ('Il secolo', 'II secolo'),
    ('sec.', 'sec'),
    ('secolo', 'sec'),
    ('Sec.', 'sec'),
    ('Secolo', 'sec'),
    ('sec', ''),
)

_ERA_AND_TYPO_FIXES = (
    ('A.C.', 'a.C.'),
    ('a. C.', 'a.C.'),
    ('d.C.', ''),
    ('d.C', ''),
    (',', ''),
    ('Ll-lll', 'LI-III'),
    ('decenio', 'decennio'),
)

# Letters 'L'/'l' are rewritten to 'I' (presumably mistyped roman numerals --
# TODO confirm). As a side effect 'millennio' becomes 'miIIennio'.
_ROMAN_LOOKALIKE_FIXES = (
    ('L', 'I'),
    ('Il', 'II'),
    ('lll', 'III'),
    ('ll', 'II'),
)

# Normalised texts that are mapped to a hand-picked year.
# NOTE(review): several values look copy-pasted (e.g. 'Ultimo quarto I a.C. -
# metà I' -> 150, 'Tra il II a.C. e il 165 a.C.' -> 40); confirm with the data owner.
_SPECIAL_DATES = {
    'Epoca romana': 100,
    'Epoca imperiale': 100,
    'Prima età imperiale': 40,
    '1720-1730 montatura; 1820-1830 pagina': 1770,
    'Età tiberio-claudia - II': 150,
    'Epoca imperiale o tardo-antica': 40,
    'Età romana inizio': 40,
    'Tra il 27 a.C. e il 68': 40,
    'Tra il II a.C. e il 165 a.C.': 40,
    # Was 1770 (copied from the 'montatura' entry above); 150 is what the
    # plain range 'I-II' yields through the generic rules.
    'Metà I - II': 150,
    'Ultimo quarto I a.C. - metà I': 150,
    'Tra il fine I a.C. e il metà I': 40,
    'Fine I a.C. - prima metà I': 40,
    'Seconda metà I - II': 40,
}


def _applyReplacements(text, replacements):
    """Apply ``(old, new)`` substring replacements to ``text`` in the given order."""
    for old, new in replacements:
        text = text.replace(old, new)
    return text


def _normalizeDateText(date):
    """Return ``date`` cleaned up so that the rule table can match it."""
    date = _applyReplacements(date, _CENTURY_WORD_FIXES)
    date = date.strip()
    date = _applyReplacements(date, _ERA_AND_TYPO_FIXES)
    # Drop parenthesised remarks
    date = re.sub(r"\([^)]+\)", "", date)
    date = _applyReplacements(date, _ROMAN_LOOKALIKE_FIXES)
    # Collapse runs of whitespace and trim both ends
    return ' '.join(date.split())


def _centuryYear(scale, offset):
    """Build a rule result: roman numeral in group 1, times ``scale``, plus ``offset``."""
    return lambda match: roman.fromRoman(match.group(1)) * scale + offset


def _centuryMidpoint(match):
    """Midpoint of the two roman-numeral centuries captured in groups 1 and 2."""
    return (roman.fromRoman(match.group(1)) + roman.fromRoman(match.group(2))) * 100 / 2


def _yearMidpoint(match):
    """Midpoint of the two arabic years captured in groups 1 and 2."""
    return (int(match.group(1)) + int(match.group(2))) / 2


def _firstNumber(match):
    """The arabic number captured in group 1."""
    return int(match.group(1))


def _constant(value):
    """Build a rule result that ignores the match and returns ``value``."""
    return lambda match: value


# Ordered rule table: the first pattern found in the normalised text wins, so
# the order is significant and mirrors the original sequence of checks.
# Rules that could never fire because an earlier identical or broader pattern
# always matched first have been dropped.
# NOTE(review): some 'a.C.' rules yield positive years while others yield
# negative ones; values are kept as they were -- confirm the intended sign.
_DATE_RULES = tuple((re.compile(pattern), compute) for pattern, compute in (
    (r'^([XIV]+?) a.C. primo quarto$', _centuryYear(100, 10)),
    (r'Nuovo Regno - ([XIV]+?) dinastia', _centuryYear(100, 10)),
    (r'Fine del I a.C. - inizio del I', _constant(0)),
    (r'^([XIV]+?)\s?[/-]\s?([XIV]+?) fine/inizio$', _centuryMidpoint),
    (r'^Prima metà ([XIV]+?) -prima metà ([XIV]+?)$', _centuryMidpoint),
    (r'^([XIV]+?)\s?[/-]\s?([XIV]+?)$', _centuryMidpoint),
    (r'^([XIV]+?)\s?[/-]\s?([XIV]+?) a.C.$', _centuryMidpoint),
    (r'^([XIV]+?)\s?[/-]\s?([XIV]+?) a.C. fine/inizio$', _centuryMidpoint),
    (r'Metà I a.C. - metà I', _constant(0)),
    (r'^([XIV]+?) a.C. inizio$', _centuryYear(-100, -10)),
    (r'^([XIV]+?) inizio$', _centuryYear(100, 10)),
    (r'^Metà ([XIV]+?)$', _centuryYear(100, -25)),
    (r'^Fine ([XIV]+?) fine$', _centuryYear(100, -25)),
    (r'^([XIV]+?) fine$', _centuryYear(100, -25)),
    (r'Primo-ondo decennio ([XIV]+?)$', _centuryYear(100, -25)),
    (r'^([XIV]+?) a.C. prima metà$', _centuryYear(100, -25)),
    (r'^([XIV]+?) a.C. metà$', _centuryYear(100, -25)),
    (r'^([XIV]+?) a.C. onda metà$', _centuryYear(100, -25)),
    # Only the first century of these two ranges is used
    (r'^([XIV]+?) a.C.\s?[/-]\s?([XIV]+?) fine/inizio$', _centuryYear(100, -25)),
    (r'^([XIV]+?) a.C.\s?[/-]\s?([XIV]+?)$', _centuryYear(100, -25)),
    (r'^([XIV]+?) a.C.$', _centuryYear(-100, -10)),
    (r'^([XIV]+?)$', _centuryYear(100, 50)),
    # Greedy groups: the previous lazy trailing group captured a single digit,
    # so 'Tra il 1500 e il 1600' was averaged as (1500 + 1) / 2.
    (r'Tra il ([0-9]+) e il ([0-9]+)', _yearMidpoint),
    (r'^([XIV]+?) a.C. fine$', _centuryYear(-100, -10)),
    (r'^([XIV]+?) prima metà$', _centuryYear(100, 10)),
    # An arabic number is a year, not a century: it used to be multiplied by -100
    (r'^([0-9]+?) a.C.$', lambda match: -int(match.group(1))),
    # NOTE(review): returns the first of the three numbers -- confirm that the
    # source format puts the year first.
    (r'^([0-9]+?)/([0-9]+?)/([0-9]+?)$', _firstNumber),
    (r'^([0-9]+?)\-([0-9]+?)$', _yearMidpoint),
    (r'^Post ([0-9]+?)$', _firstNumber),
    (r'^([0-9]+?)$', _firstNumber),
    (r'^([0-9]+?) circa$', _firstNumber),
    (r'^Tra il ([0-9]+?) e il$', _firstNumber),
    (r'^([XIV]+?) primo quarto$', _centuryYear(100, 0)),
    (r'^([XIV]+?) ondo quarto$', _centuryYear(100, 0)),
    (r'^([XIV]+?) terzo quarto$', _centuryYear(100, 0)),
    (r'^([XIV]+?) ultimo quarto$', _centuryYear(100, 0)),
    (r'^Fine ([XIV]+?)$', _centuryYear(100, 0)),
    (r'^([XIV]+?) onda metà$', _centuryYear(100, 0)),
    (r'^([XIV]+?) metà$', _centuryYear(100, 0)),
    (r'^Metà ([XIV]+?) metà$', _centuryYear(100, 0)),
    (r'^Seconda metà ([XIV]+?)$', _centuryYear(100, 0)),
    (r'^Dopo la metà del ([XIV]+?)$', _centuryYear(100, 0)),
    (r'^([XIV]+?) miIIennio a.C. onda metà$', _centuryYear(-1000, 25)),
))


def parseDate(date):
    """Turn a free-text dating into a rough numeric year.

    The text is normalised, looked up in a table of special cases and then
    matched against an ordered list of patterns. Roman numerals are read as
    centuries and scaled by 100 (by 1000 for 'millennio'), with a small offset
    depending on the wording; arabic numbers are read as years.

    Parameters
    ----------
    date : str
        Dating text as found in the scraped data.

    Returns
    -------
    int or float
        The approximate year. ``float('nan')`` is returned when the text is
        empty, marked as undetermined, not a string, or not recognised by any
        rule (in the last case an ``[ERROR]`` line is also printed).

    Raises
    ------
    Whatever ``roman.fromRoman`` raises when a captured numeral is not a valid
    roman number.
    """
    # Non-text input (e.g. a null/NaN placeholder) cannot be dated
    if not isinstance(date, str):
        return float('nan')

    date = _normalizeDateText(date)

    # Undetermined or missing
    if date == '':
        return float('nan')
    if re.search('on determi', date):
        return float('nan')

    # Special encoded values
    if date in _SPECIAL_DATES:
        return _SPECIAL_DATES[date]

    for pattern, compute in _DATE_RULES:
        match = pattern.search(date)
        if match:
            return compute(match)

    # No rule matched: report it and return NaN (not None), so that the
    # caller's `value == value` check filters the entry out.
    print("[ERROR] " + date)
    return float('nan')





Main "populate graph" loop:

In [40]:
def addToG(subject, predicate, obj, key, datatype):
    """Add ``obj[key]`` to the graph ``g`` as a typed literal, if it is usable.

    Nothing is added (and nothing is raised) when the key is missing, when
    ``obj`` cannot be indexed, when the value is ``None`` or NaN, or when a
    non-string value is given for ``XSD.string``. Any other error is allowed
    to propagate instead of being silently swallowed.

    Parameters
    ----------
    subject : node the triple is about
    predicate : predicate of the triple
    obj : mapping holding the scraped values
    key : key to read from ``obj``
    datatype : datatype IRI for the literal; ``XSD.string`` values are stripped
    """
    try:
        value = obj[key]
    except (KeyError, IndexError, TypeError):
        # Field not scraped for this item, or `obj` is not a container
        return

    # NaN is the only value that differs from itself; None means "no data"
    if value is None or value != value:
        return

    if datatype == XSD.string:
        if not isinstance(value, str):
            # Previously dropped implicitly by the failing .strip() call
            return
        value = value.strip()

    g.add((subject, predicate, Literal(value, datatype=datatype)))



# Running counters used to mint unique node names (MUSEUM0, COLLECTION0, ...).
# Collections and artworks are numbered across the whole dataset, not per parent.
MUSEUMindex = 0
COLLECTIONindex = 0
ARTWORKindex = 0

for museumURL, museumData in museicivici.items():
    # Setup museum node
    Museum = URIRef(PGT["MUSEUM" + str(MUSEUMindex)])
    MUSEUMindex = MUSEUMindex + 1
    g.add((Museum, RDF.type, PGT.Museum))
    # Add museum properties
    g.add((Museum, SDO.url, Literal(museumURL, datatype=SDO.URL)))
    addToG(Museum, SDO.name, museumData, 'name', datatype=XSD.string)
    addToG(Museum, SDO.description, museumData, 'description', datatype=RDF.HTML)
    addToG(Museum, SDO.image, museumData, 'img', datatype=SDO.URL)

    for collectionURL, collectionData in museumData['collections'].items():
        # Setup collection node
        MCollection = URIRef(PGT["COLLECTION" + str(COLLECTIONindex)])
        COLLECTIONindex = COLLECTIONindex + 1
        g.add((MCollection, RDF.type, PGT.Collection))
        # Add collection properties
        g.add((MCollection, SDO.url, Literal(collectionURL, datatype=SDO.URL)))
        g.add((MCollection, PGT['museum'], Museum))
        addToG(MCollection, SDO.name, collectionData, 'name', datatype=XSD.string)
        addToG(MCollection, SDO.description, collectionData, 'description', datatype=RDF.HTML)
        addToG(MCollection, SDO.image, collectionData, 'img', datatype=SDO.URL)

        for artworkURL, artworkData in collectionData['artworks'].items():
            # Setup artwork node
            Artwork = URIRef(PGT["ARTWORK" + str(ARTWORKindex)])
            ARTWORKindex = ARTWORKindex + 1
            g.add((Artwork, RDF.type, PGT.Artwork))

            fields = artworkData['fields']

            # Simple fields
            g.add((Artwork, SDO.url, Literal(artworkURL, datatype=SDO.URL)))
            g.add((Artwork, PGT['collection'], MCollection))
            addToG(Artwork, SDO.name, artworkData, 'name', datatype=XSD.string)
            addToG(Artwork, SDO.description, artworkData, 'description', datatype=RDF.HTML)
            addToG(Artwork, SDO.image, artworkData, 'img', datatype=SDO.URL)

            addToG(Artwork, PGT['width'], fields, 'Larghezza', datatype=XSD.float)
            addToG(Artwork, PGT['height'], fields, 'Altezza', datatype=XSD.float)
            addToG(Artwork, PGT['placing'], fields, 'Collocazione', datatype=XSD.string)
            addToG(Artwork, PGT['conservationState'], fields, 'Stato di conservazione', datatype=XSD.string)
            addToG(Artwork, SDO.author, fields, 'Autore', datatype=XSD.string)
            addToG(Artwork, SDO.material, fields, 'Materiale e Tecnica', datatype=XSD.string)

            # Parse date: an artwork without a textual 'Cronologia' is simply
            # left undated instead of aborting the whole export.
            chronology = fields.get('Cronologia')
            if isinstance(chronology, str):
                # Parse once; a second call would also print any error twice
                parsedDate = parseDate(chronology)
                # Keep only real numbers: None means "no rule matched",
                # NaN (never equal to itself) means "undetermined".
                if parsedDate is not None and parsedDate == parsedDate:
                    g.add((Artwork, PGT['yearCreatedText'], Literal(chronology, datatype=XSD.string)))
                    # NOTE(review): the XML Schema year datatype is xsd:gYear;
                    # XSD.year is kept because the ontology may rely on it -- confirm.
                    g.add((Artwork, PGT['yearCreated'], Literal(parsedDate, datatype=XSD.year)))

            # Tags: one lower-cased literal per tag; a missing or NaN entry is skipped
            tags = artworkData.get('tags')
            if isinstance(tags, list):
                for tag in tags:
                    g.add((Artwork, PGT['tag'], Literal(tag.strip().lower(), datatype=XSD.string)))

g.serialize(destination=savePath+"scraped.ttl", format='turtle')


<Graph identifier=N1a3b115a8ae3458caf6a55f594ecb22e (<class 'rdflib.graph.Graph'>)>