# Padova Grand Tour - Scraped Data Ingester

This notebook would read the data scraped and return a `scraped.ttl` file formatted according to our ontology.

Import Json files:

In [5]:
import os
from pathlib import Path
import re
import json

path = str(Path(os.path.abspath(os.getcwd())))

In [6]:
# Read Json files
with open(path + '/scraper/results/padovamuseicivici.json') as f:
    museicivici = json.load(f)

with open(path + '/scraper/results/wikipedia-categories.json') as f:
    locations = json.load(f)

Fix missing/broken data:

Setup graph:

In [7]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF, XSD, schema.org
from rdflib.namespace import FOAF, XSD, SDO
from rdflib.collection import Collection

# Main namespace
PGT = Namespace("https://padovagrandtour.github.io/entitites#")


# Saving folder
savePath =  path + "/data/ttlData/"
os.makedirs(savePath, exist_ok=True)


# Bind namespaces
g = Graph()

g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("pgt", PGT)


Helpers:

In [8]:
# Extract data as as the year is roughly referring
import roman

def parseDate(date):
    date = date.replace('Il secolo', 'II secolo')

    date = date.replace('sec.', 'sec')
    date = date.replace('secolo', 'sec')
    date = date.replace('Sec.', 'sec')
    date = date.replace('Secolo', 'sec')
    date = date.replace('sec', '')
    date = date.strip()

    date = date.replace('A.C.', 'a.C.')
    date = date.replace('a. C.', 'a.C.')
    date = date.replace('d.C.', '')
    date = date.replace('d.C', '')
    date = date.replace(',', '')
    date = date.replace('Ll-lll', 'LI-III')
    date = date.replace('decenio', 'decennio')

    date = re.sub(r"\([^)]+\)", "", date)


    date = date.replace('L', 'I')
    date = date.replace('Il', 'II')
    date = date.replace('lll', 'III')
    date = date.replace('ll', 'II')

    date = ' '.join(date.split())
   

    #undetermined or missing
    if (date == ''): return float('nan')
    
    m = re.search('on determi', date)
    if m: return float('nan')

    # special encoded values
    if (date == 'Epoca romana'): return 100
    if (date == 'Epoca imperiale'): return 100
    if (date == 'Prima età imperiale'): return 40
    if (date == '1720-1730 montatura; 1820-1830 pagina'): return 1770
    if (date == 'Età tiberio-claudia - II'): return 150
    if (date == 'Epoca imperiale o tardo-antica'): return 40
    if (date == 'Età romana inizio'): return 40
    if (date == 'Tra il 27 a.C. e il 68'): return 40
    if (date == 'Tra il II a.C. e il 165 a.C.'): return 40

    if (date == 'Metà I - II'): return 1770
    if (date == 'Ultimo quarto I a.C. - metà I'): return 150
    if (date == 'Tra il fine I a.C. e il metà I'): return 40
    if (date == 'Fine I a.C. - prima metà I'): return 40
    if (date == 'Seconda metà I - II'): return 40

    



    m = re.search('^([XIV]+?) a.C. primo quarto$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 + 10)


    m = re.search('Nuovo Regno - ([XIV]+?) dinastia', date)
    if m: return (roman.fromRoman(m.group(1))* 100 + 10)

    m = re.search('Fine del I a.C. - inizio del I', date)
    if m: return 0

    m = re.search('^([XIV]+?)\s?[/-]\s?([XIV]+?) fine/inizio$', date)
    if m: return ((roman.fromRoman(m.group(1)) + roman.fromRoman(m.group(2)))* 100 / 2)    
    m = re.search('^Prima metà ([XIV]+?) -prima metà ([XIV]+?)$', date)
    if m: return ((roman.fromRoman(m.group(1)) + roman.fromRoman(m.group(2)))* 100 / 2)
    m = re.search('^([XIV]+?)\s?[/-]\s?([XIV]+?)$', date)
    if m: return ((roman.fromRoman(m.group(1)) + roman.fromRoman(m.group(2)))* 100 / 2)    
    m = re.search('^([XIV]+?)\s?[/-]\s?([XIV]+?) a.C.$', date)
    if m: return ((roman.fromRoman(m.group(1)) + roman.fromRoman(m.group(2)))* 100 / 2)    
    m = re.search('^([XIV]+?)\s?[/-]\s?([XIV]+?) a.C. fine/inizio$', date)
    if m: return ((roman.fromRoman(m.group(1)) + roman.fromRoman(m.group(2)))* 100 / 2)    


    m = re.search('Metà I a.C. - metà I', date)
    if m: return 0

    
    m = re.search('^([XIV]+?) a.C. inizio$', date)
    if m: return (roman.fromRoman(m.group(1))* -100 - 10)

    m = re.search('^([XIV]+?) inizio$', date)
    if m: return (roman.fromRoman(m.group(1))* +100 + 10)

    m = re.search('^Metà ([XIV]+?)$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 - 25)

    m = re.search('^Fine ([XIV]+?) fine$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 - 25)
    m = re.search('^([XIV]+?) fine$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 - 25)


    m = re.search('Primo-ondo decennio ([XIV]+?)$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 - 25)



    m = re.search('^([XIV]+?)\-([XIV]+?)$', date)
    if m: return ((roman.fromRoman(m.group(1)) + roman.fromRoman(m.group(2)) )* 100 /2)


    m = re.search('^([XIV]+?) a.C. prima metà$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 - 25)
    m = re.search('^([XIV]+?) a.C. metà$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 - 25)

    m = re.search('^([XIV]+?) a.C. onda metà$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 - 25)

    m = re.search('^([XIV]+?) a.C.\s?[/-]\s?([XIV]+?) fine/inizio$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 - 25)
    m = re.search('^([XIV]+?) a.C.\s?[/-]\s?([XIV]+?)$', date)
    if m: return (roman.fromRoman(m.group(1))* 100 - 25)



    m = re.search('^([XIV]+?) a.C.$', date)
    if m: return ((roman.fromRoman(m.group(1)) )* -100 - 10)

    m = re.search('^([XIV]+?)$', date)
    if m: return ((roman.fromRoman(m.group(1)) )* 100 + 50)

    m = re.search('Tra il ([0-9]+?) e il ([0-9]+?)', date)
    if m: return ((int(m.group(1))  + int(m.group(2)))/2)


    m = re.search('^([XIV]+?) a.C. fine$', date)
    if m: return ((roman.fromRoman(m.group(1)) )* -100 -10)



    m = re.search('^([XIV]+?) prima metà$', date)
    if m: return ((roman.fromRoman(m.group(1)) )* 100 + 10)


    m = re.search('^([0-9]+?) a.C.$', date)
    if m: return ((int(m.group(1)) )*-100)

    m = re.search('^([0-9]+?)/([0-9]+?)/([0-9]+?)$', date)
    if m: return ((int(m.group(1)) ))

    m = re.search('^([0-9]+?)\-([0-9]+?)$', date)
    if m: return ((int(m.group(1)) + int(m.group(2)) )/2)

    m = re.search('^Post ([0-9]+?)$', date)
    if m: return ((int(m.group(1)) ))

    m = re.search('^([0-9]+?)$', date)
    if m: return ((int(m.group(1)) ))

    m = re.search('^([0-9]+?) circa$', date)
    if m: return ((int(m.group(1)) ))

    m = re.search('^Tra il ([0-9]+?) e il$', date)
    if m: return ((int(m.group(1)) ))



    m = re.search('^([XIV]+?)$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))


    m = re.search('^([XIV]+?) primo quarto$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))

    m = re.search('^([XIV]+?) ondo quarto$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))
    
    m = re.search('^([XIV]+?) terzo quarto$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))

    m = re.search('^([XIV]+?) ultimo quarto$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))


    m = re.search('^([XIV]+?) fine$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))

    m = re.search('^Fine ([XIV]+?)$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))


    m = re.search('^([XIV]+?) onda metà$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))

    m = re.search('^([XIV]+?) metà$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))


    m = re.search('^Metà ([XIV]+?) metà$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))

    m = re.search('^Seconda metà ([XIV]+?)$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))

    m = re.search('^Dopo la metà del ([XIV]+?)$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))

    m = re.search('^Fine ([XIV]+?)$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100))




    m = re.search('^([XIV]+?) prima metà$', date)
    if m: return ((roman.fromRoman(m.group(1)) * 100 + 25))

    m = re.search('^([XIV]+?) miIIennio a.C. onda metà$', date)
    if m: return ((roman.fromRoman(m.group(1)) * -1000 + 25))

    # if no matches, print an error
    print("[ERROR] " + date)





Main "populate graph" loop:

In [11]:
MUSEUMindex = 0
COLLECTIONindex = 0
ARTWORKindex = 0

for museumURL, museumData in museicivici.items():
    # Setup museum node
    Museum = URIRef(PGT["MUSEUM" +  str(MUSEUMindex)])
    MUSEUMindex = MUSEUMindex + 1
    g.add((Museum, RDF.type, PGT.Museum))


    # Add museum properties
    g.add((Museum, PGT['url'], Literal(museumURL, datatype=XSD.string)))   
    g.add((Museum, PGT['name'], Literal(museumData['name'], datatype=XSD.string)))   
    g.add((Museum, PGT['description'], Literal(museumData['description'], datatype=XSD.string)))    
    g.add((Museum, PGT['webImageAdress'], Literal(museumData['img'], datatype=XSD.string)))   

    for collectionURL, collectionData in museumData['collections'].items():

        for artworkURL, row in collectionData['artworks'].items():
            # Setup museum node
            Artwork = URIRef(PGT["ARTWORK" +  str(ARTWORKindex)])
            ARTWORKindex = ARTWORKindex + 1
            g.add((Artwork, RDF.type, PGT.Artwork))

            # Date
            parsedDate = parseDate(row['Cronologia'])

            #Check if date is valid
            if( parsedDate == parsedDate):
                # create node and add date properties
                g.add((Artwork, PGT['yearCreatedText'], Literal(row['Cronologia'], datatype=XSD.string)))
                g.add((Artwork, PGT['yearCreated'], Literal(parseDate(row['Cronologia']), datatype=XSD.year)))


            if(row['name'] == row['name']):   # check for NaN values
                g.add((Artwork, SDO['name'], Literal(row['name'], datatype=XSD.string)))    

            if(row['tags'] == row['tags']):   # check for NaN values
                for tag in row['tags']:
                    g.add((Artwork, PGT['tag'], Literal(tag.strip(), datatype=XSD.string)))  

                #collectionNode = URIRef(PGT["ARTWORK" +  str(ARTWORKindex) + "TAGS"])
                #collection = g.collection(collectionNode)
                #print("ARTWORK" +  str(ARTWORKindex) + "TAGS", [x for x in row['tags']] )
                #collection += [Literal(x) for x in row['tags']]
                #g.add((Artwork, PGT['tags'],  collectionNode))

                # g.add((Artwork, PGT['tags'], Literal(row['tags'], datatype=XSD.string))) 

            if(row['Altezza'] == row['Altezza']):   # check for NaN values
                g.add((Artwork, PGT['height'], Literal(row['Altezza'], datatype=XSD.float)))  

            if(row['Larghezza'] == row['Larghezza']):   # check for NaN values
                g.add((Artwork, PGT['width'], Literal(row['Larghezza'], datatype=XSD.float)))  

            if(row['Collocazione'] == row['Collocazione']):   # check for NaN values
                g.add((Artwork, PGT['placing'], Literal(row['Collocazione'], datatype=XSD.string))) 

            if(row['Autore'] == row['Autore']):   # check for NaN values
                g.add((Artwork, SDO['author'], Literal(row['Autore'], datatype=XSD.string)))    

            if(row['Stato di conservazione'] == row['Stato di conservazione']):   # check for NaN values
                g.add((Artwork, PGT['conservationState'], Literal(row['Stato di conservazione'], datatype=XSD.string)))  
                
            if(row['Materiale e Tecnica'] == row['Materiale e Tecnica']):   # check for NaN values
                g.add((Artwork, PGT['technique'], Literal(row['Materiale e Tecnica'], datatype=XSD.string)))

            if(row['img'] == row['img']):   # check for NaN values
                g.add((Artwork, SDO['image'], Literal(row['img'], datatype=XSD.string)))   

            if(row['description'] == row['description']):   # check for NaN values
                g.add((Artwork, SDO['description'], Literal(row['description'], datatype=XSD.string)))    


g.serialize(destination=savePath+"scraped.ttl", format='turtle')


KeyError: 'Cronologia'