# Padova Grand Tour Ontology

Luca FABBIAN, Loic DUPUY-KERANGUEVEN, Jean LE CHEVALIER

As a group composed of two Erasmus students and one Italian student, our idea was to create an ontology related to tourism, specifically in Padova. The main goal was to create and query "Tours" based on the artworks and cultural sites they contain. To do so, we scraped data from https://padovamusei.it/ and Wikipedia. The scraper returned a large JSON file that required some data processing in order to produce CSV files, which are easier to use for creating the RDF database.

## Data processing
### Scraped JSON to CSV files

In [88]:
# required libraries
import pandas as pd
import os
from pathlib import Path
import re

# Absolute working directory of the notebook; every data path is built from it
workingDir = os.path.abspath(os.getcwd())
path = str(Path(workingDir))

# Scraped Json inputs
museumsUrl = f'{path}/data/jsonData/padovamuseicivici.json'
locationsUrl = f'{path}/data/jsonData/wikipedia-categories.json'

In [89]:
#Load the whole scraped Json (a dictionary of dictionaries) into a dataframe
museums = pd.read_json(museumsUrl)

#The "collections" row holds the nested dicts with the artworks info,
#which is flattened in the next step
collections = museums.loc["collections"]

In [90]:
#Merge the "artworks" dictionary of every collection of every museum
#into a single dictionary
artworks = {}
for collec in collections:
    for value in collec.values():
        #a collection without an "artworks" entry contributes nothing
        artworks.update(value.get('artworks') or {})

#Each row is an artwork, each column is a data property;
#the nested "fields" dict is expanded into one column per field
df_artworks = pd.DataFrame(artworks).T
df_artworks = pd.concat(
    [df_artworks.drop(['fields'], axis=1), df_artworks['fields'].apply(pd.Series)],
    axis=1,
)

#The original index (kept as the "Url" column) is replaced by a plain counter
df_artworks['Url'] = df_artworks.index
df_artworks.reset_index(drop=True, inplace=True)

#museumName is the part of the url between "musei/" and "/collezioni";
#urls without that pattern get the default "capellascrovegni".
#This is done column-wise: writing into the rows yielded by iterrows()
#is not guaranteed to modify the dataframe.
df_artworks['museumName'] = (
    df_artworks['Url']
    .str.extract(r'musei/(.*)/collezioni', expand=False)
    .fillna("capellascrovegni")
)

df_artworks.head(1)

Unnamed: 0,name,img,description,tags,Cronologia,Materiale e Tecnica,Dimensioni,Collocazione,Inventario,Stato di conservazione,...,Specifiche di reperimento,Osservazioni,Dati di scavo,Autore,Dritto,Rovescio,Notizie storico-critiche,Zecca,Url,museumName
0,Gioacchino cacciato dal Tempio,https://cappellascrovegni.padovamusei.it/sites...,"Si narra nei Vangeli apocrifi che Gioacchino, ...",[Dipinto],"XIV, inizio sec.",Affresco,Altezza: 208 cm ca.Larghezza: 220 cm ca.,"Navata, parete destra, registro superiore, I r...",S.n.,Buono,...,,,,,,,,,https://cappellascrovegni.padovamusei.it/it/co...,capellascrovegni


In [91]:
#Extract every "<label> : <number>" pair found in the Dimensioni strings
dimensionPattern = r'([^\s"]+)\s*:\s*(\d+\.?\d*)'
dimensionMatches = df_artworks["Dimensioni"].str.extractall(dimensionPattern)
dimensionMatches = dimensionMatches.droplevel(1)

#One column per extracted label, one row per artwork; missing measures become 0
df_dimensions = dimensionMatches.pivot(columns=0, values=1)
df_dimensions.columns.name = None
df_dimensions = df_dimensions.fillna(0)

#The width can show up under the label 'ca.Larghezza' when the "ca." of the
#previous measure is glued to it (e.g. "... 208 cm ca.Larghezza: 220 cm ca."),
#so both columns are converted to numbers and summed into 'Larghezza'.
#NOTE(review): only these two columns are converted with to_numeric here.
widthColumns = ['Larghezza', 'ca.Larghezza']
df_dimensions[widthColumns] = df_dimensions[widthColumns].apply(pd.to_numeric, errors='coerce', axis=1)
df_dimensions['Larghezza'] = df_dimensions['Larghezza'] + df_dimensions['ca.Larghezza']

#Drop the helper column and the labels that are not kept
df_dimensions.drop(columns=['(gr.)', 'ca.Larghezza', 'cm'], inplace=True)

#Append the dimension columns to the artworks
df_artworks = pd.concat([df_artworks, df_dimensions], axis=1)

In [92]:
#Museums Dataframe: one row per museum, keeping only the columns we need.
#.copy() makes df_museums an independent dataframe, so the column assignments
#below do not raise pandas' SettingWithCopyWarning.
df_museums = museums.T[['name','img','description']].copy()

#Coordinates are hard-coded, one entry per museum row; each list must have
#exactly as many entries as df_museums has rows.
df_museums['latitude'] = ['45°24′42.54″N','45°24′39.46″N','45°24′39.46″N','45°24′41.65″N','45°24′28.25″N']
df_museums['longitude'] = ['11°52′46.33″E','11°52′48.04″E','11°52′48.04″E','11°52′41.23″E','11°52′36.93″E']

In [93]:
#Dataframes for the other classes, read from the wikipedia categories Json
locations = pd.read_json(locationsUrl)
locations.columns = ['Museums', 'Archi', 'Castles', 'Churches', 'Basilicas','otherMonuments','Walls','Palaces','Logge','Bridges']

#Only the columns matching a class of our ontology are kept;
#each one is flattened into its own dataframe
museumsLoc, churchesLoc, basilicasLoc, palacesLoc = (
    pd.json_normalize(locations[category])
    for category in ('Museums', 'Churches', 'Basilicas', 'Palaces')
)

dfs = [museumsLoc, churchesLoc, basilicasLoc, palacesLoc]

#Remove rows with missing values and renumber the remaining rows from 0
for df in dfs:
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)

### Saving dataframes to csv

In [94]:
#Export every dataframe as a UTF-8 csv file in data/csvData
cvsPath = path + "/data/csvData/"
os.makedirs(cvsPath, exist_ok=True)

csvExports = (
    ("museumsLoc.csv", museumsLoc),
    ("churchesLoc.csv", churchesLoc),
    ("basilicasLoc.csv", basilicasLoc),
    ("palacesLoc.csv", palacesLoc),
    ("artworks.csv", df_artworks),
    ("museums.csv", df_museums),
)
for csvName, frame in csvExports:
    frame.to_csv(cvsPath + csvName, encoding='utf-8')

# Creating database

In [95]:
import pandas as pd
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
from pathlib import Path
import os

# Absolute working directory of the notebook
path = str(Path(os.path.abspath(os.getcwd())))

# Namespace under which the entities and properties of the graph are created
PGT = Namespace("https://padovagrandtour.github.io/entitites#")

# Folder receiving the serialized Turtle files (created if missing)
savePath = f"{path}/data/ttlData/"
os.makedirs(savePath, exist_ok=True)

# Empty graph with the prefixes used in the serialized output
g = Graph()
for prefix, namespace in (("foaf", FOAF), ("xsd", XSD), ("pgt", PGT)):
    g.bind(prefix, namespace)

In [96]:
"""
Read all csv files (not doing it here, using dataframes of data processing)

artworks=pd.read_csv("data\\csvData\\artworks.csv",encoding="utf-8", index_col=0)
...

"""

'\nRead all csv files (not doing it here, using dataframes of data processing)\n\nartworks=pd.read_csv("data\\csvData\\artworks.csv",encoding="utf-8", index_col=0)\n...\n\n'

In [173]:
#ARTWORKS
import roman

def _romanCentury(match, group=1):
    """Return the Roman numeral captured in *group* of *match* as an int."""
    return roman.fromRoman(match.group(group))


def _centuryRule(scale, offset=0):
    """Build a converter returning ``numeral * scale + offset`` for group 1."""
    return lambda match: _romanCentury(match) * scale + offset


def _centurySpan(match):
    """Return the mean of the two captured Roman numerals, times 100."""
    return (_romanCentury(match, 1) + _romanCentury(match, 2)) * 100 / 2


def _centurySpanBC(match):
    """Return ``_centurySpan`` negated, for ranges marked "a.C."."""
    return -_centurySpan(match)


def _yearSpan(match):
    """Return the mean of the two captured decimal numbers."""
    return (int(match.group(1)) + int(match.group(2))) / 2


def _firstNumber(match):
    """Return the first captured decimal number as an int."""
    return int(match.group(1))


# Textual clean-up applied, in this order, before the rules are tried.
# Removing 'sec' also truncates words such as "secondo"/"seconda", which is
# why some rules below look for "ondo"/"onda".
_DATE_CLEANUP_BEFORE_STRIP = (
    ('Il secolo', 'II secolo'),
    ('sec.', 'sec'),
    ('secolo', 'sec'),
    ('Sec.', 'sec'),
    ('Secolo', 'sec'),
    ('sec', ''),
)
_DATE_CLEANUP_AFTER_STRIP = (
    ('A.C.', 'a.C.'),
    ('a. C.', 'a.C.'),
    ('d.C.', ''),
    ('d.C', ''),
    (',', ''),
    ('Ll-lll', 'LI-III'),
    ('decenio', 'decennio'),
)
# Presumably repairs "l"/"L" typed in place of the Roman numeral "I"; it also
# turns "millennio" into "miIIennio", which the last rule relies on.
_DATE_NUMERAL_FIXES = (
    ('L', 'I'),
    ('Il', 'II'),
    ('lll', 'III'),
    ('ll', 'II'),
)

# Cleaned texts that are mapped to a hand-picked value.
_SPECIAL_DATES = {
    'Epoca romana': 100,
    'Epoca imperiale': 100,
    'Prima età imperiale': 40,
    '1720-1730 montatura; 1820-1830 pagina': 1770,
    'Età tiberio-claudia - II': 150,
    'Epoca imperiale o tardo-antica': 40,
    'Età romana inizio': 40,
}

# Ordered (pattern, converter) rules: the first pattern found in the cleaned
# text decides the result, so the order matters.
_DATE_RULES = tuple(
    (re.compile(pattern), convert)
    for pattern, convert in (
        # "a.C." dates are negative, like in every other "a.C." rule
        (r'^([XIV]+?) a.C. primo quarto$', _centuryRule(-100, -10)),
        (r'Nuovo Regno - ([XIV]+?) dinastia', _centuryRule(100, 10)),
        (r'Fine del I a.C. - inizio del I', lambda match: 0),
        (r'^([XIV]+?)\s?[/-]\s?([XIV]+?) fine/inizio$', _centurySpan),
        (r'^Prima metà ([XIV]+?) -prima metà ([XIV]+?)$', _centurySpan),
        (r'^([XIV]+?)\s?[/-]\s?([XIV]+?)$', _centurySpan),
        (r'^([XIV]+?)\s?[/-]\s?([XIV]+?) a.C.$', _centurySpanBC),
        (r'^([XIV]+?)\s?[/-]\s?([XIV]+?) a.C. fine/inizio$', _centurySpanBC),
        (r'Metà I a.C. - metà I', lambda match: 0),
        (r'^([XIV]+?) a.C. inizio$', _centuryRule(-100, -10)),
        (r'^([XIV]+?) inizio$', _centuryRule(100, 10)),
        (r'^Metà ([XIV]+?)$', _centuryRule(100, -25)),
        (r'^Fine ([XIV]+?) fine$', _centuryRule(100, -25)),
        (r'^([XIV]+?) fine$', _centuryRule(100, -25)),
        (r'Primo-ondo decennio ([XIV]+?)$', _centuryRule(100, -25)),
        (r'^([XIV]+?) a.C.$', _centuryRule(-100, -10)),
        (r'^([XIV]+?)$', _centuryRule(100, 50)),
        # greedy groups: a lazy, unanchored last group would capture only
        # the first digit of the second year
        (r'Tra il ([0-9]+) e il ([0-9]+)', _yearSpan),
        (r'^([XIV]+?) a.C. fine$', _centuryRule(-100, -10)),
        (r'^([XIV]+?) prima metà$', _centuryRule(100, 10)),
        # a decimal number is already a year: only the sign changes
        (r'^([0-9]+?) a.C.$', lambda match: -int(match.group(1))),
        (r'^([0-9]+?)/([0-9]+?)/([0-9]+?)$', _firstNumber),
        (r'^([0-9]+?)\-([0-9]+?)$', _yearSpan),
        (r'^Post ([0-9]+?)$', _firstNumber),
        (r'^([0-9]+?)$', _firstNumber),
        (r'^([0-9]+?) circa$', _firstNumber),
        (r'^Tra il ([0-9]+?) e il$', _firstNumber),
        (r'^([XIV]+?) primo quarto$', _centuryRule(100)),
        (r'^([XIV]+?) ondo quarto$', _centuryRule(100)),
        (r'^([XIV]+?) terzo quarto$', _centuryRule(100)),
        (r'^([XIV]+?) ultimo quarto$', _centuryRule(100)),
        (r'^Fine ([XIV]+?)$', _centuryRule(100)),
        (r'^([XIV]+?) onda metà$', _centuryRule(100)),
        (r'^([XIV]+?) metà$', _centuryRule(100)),
        (r'^([XIV]+?) miIIennio a.C. onda metà$', _centuryRule(-1000, 25)),
    )
)


def parseDate(date):
    """Convert a free-text "Cronologia" value into a single number.

    The text is first cleaned (century words and "d.C." removed, "a.C."
    spelling unified, commas and parenthesised parts dropped, l/L repaired
    into Roman "I", whitespace collapsed). The cleaned text is then looked up
    in a table of special values and finally tried against an ordered list of
    patterns; the first pattern found gives the result.

    Args:
        date: the raw chronology text. Any non-string value (for instance the
            NaN of an empty dataframe cell) is treated as missing.

    Returns:
        An int or float computed by the first applicable rule (negative for
        the rules dealing with "a.C." dates), ``float('nan')`` when the value
        is missing, empty or contains "on determi", and ``None`` when no rule
        applies. In the last case the cleaned text is printed so that the
        unresolved values can be reviewed.
    """
    # missing cells are not strings and cannot be cleaned
    if not isinstance(date, str):
        return float('nan')

    for old, new in _DATE_CLEANUP_BEFORE_STRIP:
        date = date.replace(old, new)
    date = date.strip()
    for old, new in _DATE_CLEANUP_AFTER_STRIP:
        date = date.replace(old, new)

    # drop parenthesised remarks
    date = re.sub(r"\([^)]+\)", "", date)

    for old, new in _DATE_NUMERAL_FIXES:
        date = date.replace(old, new)

    # collapse every run of whitespace into a single space
    date = ' '.join(date.split())

    # undetermined or missing
    if date == '' or re.search('on determi', date):
        return float('nan')

    # special encoded values
    if date in _SPECIAL_DATES:
        return _SPECIAL_DATES[date]

    for pattern, convert in _DATE_RULES:
        match = pattern.search(date)
        if match:
            return convert(match)

    # unresolved: show the cleaned text for manual review
    print(date)
    return None


#(predicate name, dataframe column, literal datatype) for every data property
#attached to an artwork node
artworkProperties = (
    ('name', 'name', XSD.string),
    ('type', 'tags', XSD.string),
    ('author', 'Autore', XSD.string),
    ('height', 'Altezza', XSD.float),
    ('width', 'Larghezza', XSD.float),
    ('period', 'Cronologia', XSD.string),
    ('placing', 'Collocazione', XSD.string),
    ('conservationState', 'Stato di conservazione', XSD.string),
    ('technique', 'Materiale e Tecnica', XSD.string),
    ('webImageAdress', 'img', XSD.string),
    ('description', 'description', XSD.string),
)

for index, row in df_artworks.iterrows():
    #The parsed date is not stored: the call only prints the
    #chronology values that parseDate cannot resolve
    parseDate(row['Cronologia'])

    #Artwork URI, built from the image address
    idU = row['img']
    #node
    Artwork = URIRef(PGT[idU])

    #triples
    g.add((Artwork, RDF.type, PGT.Artwork))
    for predicate, column, datatype in artworkProperties:
        g.add((Artwork, PGT[predicate], Literal(row[column], datatype=datatype)))
    #g.add((Artwork, PGT['museumName'], Literal(row['museumName'], datatype=XSD.string)))

I a.C. prima metà
I a.C. onda metà
Tra il 27 a.C. e il 68
Metà I - II
Metà I - II
Ultimo quarto I a.C. - metà I
Tra il fine I a.C. e il metà I
Fine I a.C. - prima metà I
Seconda metà I - II
Tra il II a.C. e il 165 a.C.
II a.C. prima metà
I a.C. - I fine/inizio
IV a.C. metà
I a.C. - I
I a.C. - I fine/inizio
I a.C. - I
I a.C. - I
I a.C. - I fine/inizio
I a.C.-I fine/inizio
I a.C.-I fine/inizio
I a.C. - I
Metà XV metà
Seconda metà XVII
Dopo la metà del XVII
Fine XVII
Fine XVII
Seconda metà XVIII


In [98]:
#Write the graph, in Turtle syntax, to artworks.ttl inside the saving folder
g.serialize(destination=f"{savePath}artworks.ttl", format='turtle')

<Graph identifier=N0a77cdd20690454a84c6babd1cb4c98e (<class 'rdflib.graph.Graph'>)>

In [99]:
#MUSEUMS

#(predicate name, dataframe column) for every string property of a museum node
museumProperties = (
    ('description', 'description'),
    ('webImageAdress', 'img'),
)

for index, row in df_museums.iterrows():
    #Museum URI, built from the image address
    idU = row['img']
    Museum = URIRef(PGT[idU])

    g.add((Museum, RDF.type, PGT.Museum))
    for predicate, column in museumProperties:
        g.add((Museum, PGT[predicate], Literal(row[column], datatype=XSD.string)))

In [100]:
#Write the graph, in Turtle syntax, to museums.ttl inside the saving folder.
#NOTE(review): g is not re-created between the cells shown here, so it
#presumably still holds the artwork triples added earlier - confirm that
#museums.ttl is meant to contain them as well.
g.serialize(destination=f"{savePath}museums.ttl", format='turtle')

<Graph identifier=N0a77cdd20690454a84c6babd1cb4c98e (<class 'rdflib.graph.Graph'>)>