# Data preprocessing

In [29]:
# required libraries
import pandas as pd
import json
import os
from pathlib import Path
import re



# Absolute path of the directory the notebook is executed from.
_cwd = Path(os.path.abspath(os.getcwd()))
path = str(_cwd)

# Locations of the two JSON files, built by appending their names to `path`.
museumsUrl = f"{path}/padovamuseicivici.json"
locationsUrl = f"{path}/wikipedia-categories.json"

In [30]:
# Load the whole JSON file (a dictionary of several nested dictionaries)
# into a DataFrame.
museums = pd.read_json(museumsUrl)

# Pick the "collections" row: its nested dicts carry the artworks info that
# is flattened in the following cell.
collections = museums.loc["collections"]

In [31]:
# Merge the "artworks" dictionary of every collection of every museum into
# one mapping; its keys become the 'Url' column further down.
artworks = {}
for collec in collections:
    for value in collec.values():
        # A collection without an "artworks" entry contributes nothing
        # (dict.update(None) would raise a TypeError).
        artworks.update(value.get('artworks') or {})

# One row per artwork; the nested 'fields' dict is expanded into columns.
df_artworks = pd.DataFrame(artworks).T
df_artworks = pd.concat(
    [df_artworks.drop(['fields'], axis=1), df_artworks['fields'].apply(pd.Series)],
    axis=1,
)
df_artworks['Url'] = df_artworks.index

# Each row is an artwork, each column is a data property
df_artworks.reset_index(drop=True, inplace=True)

# museumName is the path segment between "musei/" and "/collezioni" in the
# artwork URL. URLs that do not match the pattern get the fixed fallback name.
# This is done on the column itself rather than by assigning to the rows
# yielded by iterrows(): those rows may be copies, in which case the
# assignment is silently lost and the column keeps re.Match objects / None.
df_artworks['museumName'] = (
    df_artworks['Url']
    .str.extract(r'musei/(.*)/collezioni', expand=False)
    .fillna("capellascrovegni")
)

df_artworks.head(1)

Unnamed: 0,name,img,description,tags,Cronologia,Materiale e Tecnica,Dimensioni,Collocazione,Inventario,Stato di conservazione,...,Specifiche di reperimento,Osservazioni,Dati di scavo,Autore,Dritto,Rovescio,Notizie storico-critiche,Zecca,Url,museumName
0,Gioacchino cacciato dal Tempio,https://cappellascrovegni.padovamusei.it/sites...,"Si narra nei Vangeli apocrifi che Gioacchino, ...",[Dipinto],"XIV, inizio sec.",Affresco,Altezza: 208 cm ca.Larghezza: 220 cm ca.,"Navata, parete destra, registro superiore, I r...",S.n.,Buono,...,,,,,,,,,https://cappellascrovegni.padovamusei.it/it/co...,capellascrovegni


In [32]:
# Parse the free-text "Dimensioni" column into one column per measure.
# The regex captures every "<label>: <number>" pair (label = run of characters
# without whitespace or double quotes, number = digits with optional decimals).
# extractall returns one row per match; droplevel(1) removes the match-number
# index level so rows are keyed by the artwork index only.
df_dimensions = df_artworks["Dimensioni"].str.extractall(r'([^\s"]+)\s*:\s*(\d+\.?\d*)').droplevel(1)
# Wide format: one column per captured label (capture group 0), filled with
# the captured number (capture group 1).
df_dimensions = df_dimensions.pivot(columns=0, values=1)
df_dimensions.columns.name = None
# Measures that are absent for an artwork become 0.
df_dimensions=df_dimensions.fillna(0)
# In strings such as "Altezza: 208 cm ca.Larghezza: 220 cm ca." the "ca." of the
# previous measure is glued to the next label, so some widths are captured under
# 'ca.Larghezza'. Both columns are converted to numbers (unparsable -> NaN) and
# summed, which works because the missing one of the two was filled with 0 above.
df_dimensions[['Larghezza','ca.Larghezza']]=df_dimensions[['Larghezza','ca.Larghezza']].apply(pd.to_numeric, errors='coerce', axis=1)
df_dimensions['Larghezza']=df_dimensions['Larghezza']+df_dimensions['ca.Larghezza']
# Drop the helper column and the '(gr.)' / 'cm' columns.
# NOTE(review): '(gr.)' and 'cm' are presumably labels matched by accident in
# this particular dataset - confirm; the drop raises KeyError if any is absent.
df_dimensions.drop(columns=['(gr.)', 'ca.Larghezza','cm'], inplace=True)

In [33]:
# Append the parsed dimension columns to the artworks table, aligned on the row index.
# Note: re-running this cell appends the dimension columns a second time.
df_artworks = pd.concat(objs=[df_artworks, df_dimensions], axis="columns")

In [34]:
# Transpose so that each museum is a row, then keep only the three
# descriptive columns.
df_museums = museums.T.loc[:, ['name', 'img', 'description']]

# Save both tables as UTF-8 CSV files in the current working directory.
for frame, target in ((df_artworks, "artworks.csv"), (df_museums, "museums.csv")):
    frame.to_csv(target, encoding='utf-8')

# Creating database

In [35]:
import pandas as pd
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
from pathlib import Path
import os

# Absolute path of the directory the notebook is executed from.
path = str(Path(os.path.abspath(os.getcwd())))

# Namespace used for every entity and property created in this notebook.
PGT = Namespace("https://padovagrandtour.github.io/entitites#")


# saving folder
# Built with os.path.join and terminated with the platform separator, so that
# the later `savePath + "<file name>"` concatenations place the files INSIDE
# the folder on every OS. The previous hard-coded '\\' without a trailing
# separator produced names such as "...PgtTTLOutputartworks.txt".
savePath = os.path.join(path, 'PgtTTLOutput') + os.sep
# Serializing into a folder that does not exist fails, so create it up front.
os.makedirs(savePath, exist_ok=True)

# Graph that collects all the triples; prefixes are bound for readable Turtle output.
g = Graph()

g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("pgt", PGT)

In [36]:
# Reload the artworks table written by the preprocessing section; the first
# CSV column holds the saved row index.
artworks = pd.read_csv(
    "artworks.csv",
    index_col=0,
    encoding="utf-8",
)
artworks.head(1)

Unnamed: 0,name,img,description,tags,Cronologia,Materiale e Tecnica,Dimensioni,Collocazione,Inventario,Stato di conservazione,...,Notizie storico-critiche,Zecca,Url,museumName,Altezza,Diametro,Larghezza,Lunghezza,Profondità,Spessore
0,Gioacchino cacciato dal Tempio,https://cappellascrovegni.padovamusei.it/sites...,"Si narra nei Vangeli apocrifi che Gioacchino, ...",['Dipinto'],"XIV, inizio sec.",Affresco,Altezza: 208 cm ca.Larghezza: 220 cm ca.,"Navata, parete destra, registro superiore, I r...",S.n.,Buono,...,,,https://cappellascrovegni.padovamusei.it/it/co...,capellascrovegni,208.0,0.0,220.0,0.0,0.0,0.0


In [37]:
#ARTWORKS

# (predicate local name in the PGT namespace, artworks column, XSD datatype)
# for every literal property attached to an artwork node.
ARTWORK_PROPERTIES = [
    ('name', 'name', XSD.string),
    ('type', 'tags', XSD.string),
    ('author', 'Autore', XSD.string),
    ('height', 'Altezza', XSD.float),
    ('width', 'Larghezza', XSD.float),
    ('period', 'Cronologia', XSD.string),
    ('placing', 'Collocazione', XSD.string),
    ('conservationState', 'Stato di conservazione', XSD.string),
    ('technique', 'Materiale e Tecnica', XSD.string),
    ('webImageAdress', 'img', XSD.string),
    ('description', 'description', XSD.string),
    # ('museumName', 'museumName', XSD.string),  # disabled in the original notebook
]

for index, row in artworks.iterrows():
    #Artwork URI: the image address is used as the local identifier
    idU = row['img']
    #node
    Artwork = URIRef(PGT[idU])
    #triples
    g.add((Artwork, RDF.type, PGT.Artwork))
    for predicate, column, datatype in ARTWORK_PROPERTIES:
        cell = row[column]
        # Empty CSV cells are read back as NaN; skip them rather than
        # storing the text "nan" (or a NaN float) as if it were real data.
        if pd.isna(cell):
            continue
        g.add((Artwork, PGT[predicate], Literal(cell, datatype=datatype)))

In [38]:
# Write the current content of the graph in Turtle syntax.
artworks_target = savePath+"artworks.txt"
g.serialize(destination=artworks_target, format='turtle')

<Graph identifier=Nec10ccad36ff41f1835621115f9cf094 (<class 'rdflib.graph.Graph'>)>

In [39]:
# Show which columns the museums table offers before building its triples.
# Note: df_museums is created in the preprocessing section, so that section
# must have been run in the same kernel session.
df_museums.columns

Index(['name', 'img', 'description'], dtype='object')

In [None]:
#MUSEUMS

# Iterate df_museums (one row per museum, columns 'name', 'img',
# 'description'). The raw `museums` frame is oriented the other way round:
# 'name' / 'img' / 'description' / 'collections' are its ROW labels, so
# row['img'] on its rows raises a KeyError.
for index, row in df_museums.iterrows():
    # Museum URI: the image address is used as the local identifier,
    # mirroring what is done for the artworks.
    idU = row['img']
    Museum = URIRef(PGT[idU])
    g.add((Museum, RDF.type, PGT.Museum))
    # 'name' is emitted with the same predicate the artworks use.
    g.add((Museum, PGT['name'], Literal(row['name'], datatype=XSD.string)))
    g.add((Museum, PGT['description'], Literal(row['description'], datatype=XSD.string)))
    g.add((Museum, PGT['webImageAdress'], Literal(row['img'], datatype=XSD.string)))


In [None]:
# Write the current content of the graph in Turtle syntax.
# NOTE(review): `g` is the graph the artwork triples were added to as well, so
# unless it is re-created in between, this file contains those triples too.
museums_target = savePath+"museums.txt"
g.serialize(destination=museums_target, format='turtle')

<Graph identifier=Ne953619ce09c4a5aa293e949601d2f96 (<class 'rdflib.graph.Graph'>)>