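# NOTE: the following lines are residue from the GitHub web view this file was
# copied from; they are kept as comments so they are not parsed as Python.
# Permalink
# Switch branches/tags
# Nothing to show
# Find file
# Fetching contributors…
# Cannot retrieve contributors at this time
# 274 lines (238 sloc) 11.2 KB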
"""
Takes CSV as input and generates an HTML document with the data items marked up with Schema.org terms.
@author: Michael Hausenblas, http://sw-app.org/mic.xhtml#i
@since: 2011-07-30
@status: init
"""
import os
import sys
sys.path.insert(0, os.getcwd() + '/lib/rdflib')
sys.path.insert(0, os.getcwd() + '/lib/rdfextras')
sys.path.insert(0, os.getcwd() + '/lib')
try:
import json
except ImportError:
import simplejson as json
import getopt
import StringIO
import shutil
import datetime
import rdflib
import rdfextras
from rdflib import *
from rdflib.plugin import PluginException
import rdflib_schemaorg_csv
from bottle import SimpleTemplate
# Register a query processor and a query result class with rdflib's plugin
# registry, both under the plugin name 'sparql'. The last two arguments name
# the module and the class that rdflib imports when the plugin is requested.
# NOTE(review): 'sparql.processor' / 'sparql.query' are presumably the modules
# shipped with the rdfextras copy put on sys.path above, and presumably what
# the .query() calls in InstantWebDataPublisher rely on -- confirm.
rdflib.plugin.register('sparql', rdflib.query.Processor, 'sparql.processor', 'Processor')
rdflib.plugin.register('sparql', rdflib.query.Result, 'sparql.query', 'SPARQLQueryResult')
class InstantWebDataPublisher(object):
    """Turn a CSV file into an HTML page marked up with Schema.org terms.

    Typical use is ``instata(doc_url, base_uri)``, which parses the CSV into
    an RDF graph, optionally looks up matching terms in the configured
    mapping files, renders the base template and writes the result (plus the
    CSS/JS assets) into ``OUTPUT_DIR``.

    The directory and file name settings below are *class* attributes;
    ``load_config`` overrides them on the class itself, so a loaded
    configuration affects every instance in the process.
    """

    DEBUG = False

    # some config stuff we gonna need:
    TEMPLATES_DIR = 'templates/'
    MAPPINGS_DIR = 'mappings/'
    OUTPUT_DIR = 'output/'
    DBPEDIA2SCHEMA = 'dbpedia-2011-07-31.rdf'
    BASE_TEMPLATE = 'base.tpl'
    BASE_STYLE_FILE = 'web.instata-style.css'
    JQUERY_UI_CSS = 'jquery-ui-1.8.4.custom.css'
    DTABLE_CSS = 'd_table.css'
    DTABLE_JUI_CSS = 'd_table_jui.css'
    JQUERY = 'jquery.js'
    JQUERY_DTABLE = 'jquery.dataTables.min.js'
    # JSON file with the Schema.org terms that validate() checks headings against
    SCHEMA_ORG_TERMS_FILE = 'mappings/schema-org-all.json'

    # internal stuff
    NAMESPACES = {
        'c': Namespace('#'),
        'schema': Namespace('http://schema.org/'),
        'scsv': Namespace('http://purl.org/NET/schema-org-csv#'),
        'dc': Namespace('http://purl.org/dc/terms/'),
        'owl': Namespace('http://www.w3.org/2002/07/owl#'),
        'rdfs': Namespace('http://www.w3.org/2000/01/rdf-schema#'),
        # ROWS_QUERY uses the rdf: prefix; declare it explicitly instead of
        # depending on whatever default prefixes the query processor adds.
        'rdf': Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#'),
    }

    # (config key fragment, class attribute) pairs applied by load_config()
    _CLASS_SETTINGS = (
        ('templates_dir', 'TEMPLATES_DIR'),
        ('mappings_dir', 'MAPPINGS_DIR'),
        ('output_dir', 'OUTPUT_DIR'),
        ('base_template', 'BASE_TEMPLATE'),
        ('base_style_file', 'BASE_STYLE_FILE'),
    )

    # config query:
    CONFIG_QUERY = """SELECT ?config ?key ?value WHERE { ?config ?key ?value .}"""

    # content creation queries:
    HEADER_QUERY = """SELECT ?cell ?colTitle WHERE {
?table a scsv:Table ;
scsv:row ?row .
?row a scsv:HeaderRow ;
scsv:cell ?cell .
?cell dc:title ?colTitle .
}"""

    ROWS_QUERY = """SELECT ?row ?cell ?cellType ?val WHERE {
?table a scsv:Table ;
scsv:row ?row .
?row a scsv:Row ;
scsv:cell ?cell .
?cell a ?cellType ;
rdf:value ?val .
}"""

    MATCHED_TERMS_QUERY = """SELECT ?match ?prop WHERE {
?match ?prop <%s> .
}"""

    def __init__(self):
        """Set up empty state and make sure the output directory exists."""
        self.g = None
        self.config = None
        self.doc_url = ""
        self.base_uri = ""
        self.schema_matching = [InstantWebDataPublisher.DBPEDIA2SCHEMA]
        self.matches = {}
        # Initialised here so render()/load_mappings() do not raise
        # AttributeError when called before parse().
        self.dbpedia2schema = None
        self.table_header = []
        self.table_rows = []
        self.terms = []
        self._ensure_output_dir()

    @staticmethod
    def _debug(template, *args):
        """Print a '[web.instata:DEBUG]' line when DEBUG is enabled.

        ``args`` are applied to ``template`` as a tuple, so a result row
        (itself tuple-like) is formatted as one value instead of being
        unpacked into several format arguments.
        """
        if InstantWebDataPublisher.DEBUG:
            print('[web.instata:DEBUG] ' + template % args)

    @staticmethod
    def _ensure_output_dir():
        """Create OUTPUT_DIR if it does not exist yet."""
        if not os.path.exists(InstantWebDataPublisher.OUTPUT_DIR):
            os.makedirs(InstantWebDataPublisher.OUTPUT_DIR)

    @staticmethod
    def _dataset_name(doc_url):
        """Return the last path segment of ``doc_url`` up to its first dot."""
        return doc_url.split('/')[-1].split('.')[0]

    @staticmethod
    def _static_assets():
        """Return the CSS/JS file names to copy, read at call time.

        Looked up on every call because load_config() may have replaced
        BASE_STYLE_FILE on the class.
        """
        cls = InstantWebDataPublisher
        return [cls.BASE_STYLE_FILE, cls.JQUERY_UI_CSS, cls.DTABLE_CSS,
                cls.DTABLE_JUI_CSS, cls.JQUERY, cls.JQUERY_DTABLE]

    def load_config(self, config_file_name):
        """Read settings from an N3 config file.

        Every triple in the file is inspected; the predicate is matched by
        substring against the known setting names. ``csv_input``,
        ``output_base_uri`` and ``schema_matching`` are stored on the
        instance, the remaining settings replace the class-level defaults.
        Loading a config always clears the default schema matching list.
        """
        self.config = ConjunctiveGraph("IOMemory")
        self.config.parse(location=config_file_name, format="n3")
        res = self.config.query(InstantWebDataPublisher.CONFIG_QUERY,
                                initNs=InstantWebDataPublisher.NAMESPACES)
        self.schema_matching = []  # make sure to reset the default (DBpedia if config is evaluated)
        # ?config ?key ?value
        for r in res:
            key = str(r[1])
            val = str(r[2])
            if 'csv_input' in key:
                self.doc_url = val
            if 'output_base_uri' in key:
                self.base_uri = val
            if 'schema_matching' in key:
                self.schema_matching.append(val)
            for fragment, attr in InstantWebDataPublisher._CLASS_SETTINGS:
                if fragment in key:
                    setattr(InstantWebDataPublisher, attr, val)
        if InstantWebDataPublisher.DEBUG:
            print('csv_input = %s' % (self.doc_url,))
            print('output_base_uri = %s' % (self.base_uri,))
            print('schema_matching = %s' % (self.schema_matching,))
            print('templates_dir = %s' % (InstantWebDataPublisher.TEMPLATES_DIR,))
            print('mappings_dir = %s' % (InstantWebDataPublisher.MAPPINGS_DIR,))
            print('output_dir = %s' % (InstantWebDataPublisher.OUTPUT_DIR,))
            print('base_template = %s' % (InstantWebDataPublisher.BASE_TEMPLATE,))
            print('base_style_file = %s' % (InstantWebDataPublisher.BASE_STYLE_FILE,))

    def parse(self, doc_url, base_uri):
        """Parse the CSV at ``doc_url`` and collect header, rows and cell types.

        Fills ``self.table_header`` with (cell, column title) tuples,
        ``self.table_rows`` with (row, cell, cell type, value) tuples and
        ``self.terms`` with the distinct cell types in first-seen order.
        """
        self.doc_url = doc_url
        self.base_uri = base_uri
        self.g = ConjunctiveGraph("IOMemory")
        self.g.parse(location=doc_url, format="schemaorg_csv", csv_file_URI=base_uri)
        # construct the table header and body out of the input data
        self._debug('querying input data ...')
        res = self.g.query(InstantWebDataPublisher.HEADER_QUERY,
                           initNs=InstantWebDataPublisher.NAMESPACES)
        self.table_header = []
        # ?cell ?colTitle
        for r in res:
            self._debug('row: %s', r)
            self.table_header.append((r[0], r[1]))
        self.table_rows = []
        self.terms = []
        res = self.g.query(InstantWebDataPublisher.ROWS_QUERY,
                           initNs=InstantWebDataPublisher.NAMESPACES)
        # ?row ?cell ?cellType ?val
        for r in res:
            self._debug('row: %s', r)
            self.table_rows.append((r[0], r[1], r[2], r[3]))
            if r[2] not in self.terms:
                self.terms.append(r[2])

    def load_mappings(self):
        """Load every configured mapping file and record matches for the cell types.

        Each mapping is queried right after it is loaded, so that all
        configured mappings contribute to ``self.matches`` (a later mapping
        overrides an earlier match for the same term). ``self.dbpedia2schema``
        holds the most recently loaded mapping graph afterwards.
        """
        # defaults to DBpedia; can be overwritten by c:schema_matching in the config file
        for m in self.schema_matching:
            # plain concatenation: the location is handed to rdflib as-is
            mapping_location = InstantWebDataPublisher.MAPPINGS_DIR + m
            print('[web.instata] loading %s mapping ...' % (mapping_location,))
            self.dbpedia2schema = ConjunctiveGraph("IOMemory")
            self.dbpedia2schema.parse(location=mapping_location)
            print('[web.instata] got %s mapping!' % (mapping_location,))
            self._match_terms(self.dbpedia2schema)
        print('[web.instata] match(es) found: %s' % (self.matches,))

    def _match_terms(self, mapping_graph):
        """Record, for every cell type, the triples in ``mapping_graph`` that point at it."""
        # going through the cell types from the parse and render steps to find matching terms:
        for t in self.terms:
            term = str(t)
            print('[web.instata] trying to find a match for %s' % (term,))
            res = mapping_graph.query(InstantWebDataPublisher.MATCHED_TERMS_QUERY % term,
                                      initNs=InstantWebDataPublisher.NAMESPACES)
            # ?match ?prop
            for r in res:
                self._debug('row: %s', r)
                self.matches[term] = (str(r[1]), str(r[0]))

    def render(self):
        """Render the base template with the parsed table and return the result.

        The header is sorted by cell, the rows by their cell component, and
        the current UTC time (without microseconds) is passed as the
        last-update stamp.
        """
        tpl = SimpleTemplate(name=os.path.join(InstantWebDataPublisher.TEMPLATES_DIR,
                                               InstantWebDataPublisher.BASE_TEMPLATE))
        wi_last_update = datetime.datetime.utcnow().replace(microsecond=0)
        return tpl.render(ds_name=self._dataset_name(self.doc_url),
                          theader=sorted(self.table_header, key=lambda cell: cell[0]),
                          trows=sorted(self.table_rows, key=lambda cell: cell[1]),
                          aliases=self.matches,
                          wi_last_update=str(wi_last_update) + ' (UTC)')

    def instata(self, doc_url, base_uri):
        """Parse, match, render and write the HTML page; return its file name.

        The page is written to OUTPUT_DIR under the dataset name with an
        ``.html`` suffix, and the CSS/JS assets are copied from TEMPLATES_DIR
        into OUTPUT_DIR.
        """
        # get the data and render it
        self.parse(doc_url, base_uri)
        if self.schema_matching:  # schema matching is enabled
            self.load_mappings()  # try to find matches for Schema.org terms in the mapping files
        render_result = self.render()
        # OUTPUT_DIR may have been changed by load_config() after __init__ ran
        self._ensure_output_dir()
        # output the result
        result_file_name = os.path.join(InstantWebDataPublisher.OUTPUT_DIR,
                                        self._dataset_name(doc_url) + ".html")
        with open(result_file_name, 'w') as result_file:
            result_file.write(render_result)
        # copy style files (CSS and JS)
        for asset in self._static_assets():
            shutil.copy2(os.path.join(InstantWebDataPublisher.TEMPLATES_DIR, asset),
                         os.path.join(InstantWebDataPublisher.OUTPUT_DIR, asset))
        return result_file_name

    def dump_data(self, format='turtle'):
        """Serialize the parsed graph in ``format``; return None if nothing was parsed."""
        # Compare against None explicitly: a graph without triples has length 0
        # and would otherwise be treated like "no graph".
        if self.g is None:
            return None
        for prefix in ('schema', 'scsv', 'dc'):
            self.g.bind(prefix, InstantWebDataPublisher.NAMESPACES[prefix], True)
        return self.g.serialize(format=format)

    def validate(self, doc_url, base_uri):
        """Check every column heading of the CSV against the Schema.org term list.

        Returns ``(summary, detailed_results)`` where ``detailed_results``
        maps each column title to True/False and ``summary`` is True only if
        every heading was found.
        """
        # make sure we have the column heads available:
        self.parse(doc_url, base_uri)
        # load Schema.org terms (datatypes, properties and types):
        with open(InstantWebDataPublisher.SCHEMA_ORG_TERMS_FILE) as terms_file:
            schema_terms = json.load(terms_file)
        detailed_results = {}
        for col in sorted(self.table_header, key=lambda cell: cell[0]):
            test_column = str(col[1])
            detailed_results[test_column] = False
            for toplevel, subdict in schema_terms.items():
                self._debug('scanning all %s ...', toplevel)
                if test_column in subdict:
                    self._debug('%s is a valid Schema.org term.', test_column)
                    detailed_results[test_column] = True
                else:
                    self._debug('%s is not a valid Schema.org term.', test_column)
        return (all(detailed_results.values()), detailed_results)
def usage():
    """Print the command line help for every mode the script accepts."""
    print("Usage: python web.instata.py -p {path to CSV file} {base URI for publishing}")
    print("       python web.instata.py -c {path to config file}")
    print("       python web.instata.py -d {path to CSV file} {base URI for publishing}")
    print("       python web.instata.py -v {path to CSV file} {base URI for publishing}")
    print("       python web.instata.py -h")
    print("Modes: -p/--publish, -c/--config-publish, -d/--dump, -v/--validate, -h/--help")
    print("Example: python web.instata.py -p test/potd_0.csv http://example.org/instata/potd_0")
def _require_args(args, count):
    """Print the usage text and exit with status 2 unless ``args`` has at least ``count`` items."""
    if len(args) < count:
        print("[web.instata] expected %d argument(s) but got %d" % (count, len(args)))
        usage()
        sys.exit(2)


if __name__ == "__main__":
    iwdp = InstantWebDataPublisher()
    # Only the option parsing itself can raise GetoptError, so keep the try
    # block limited to that call.
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hcpdv",
                                   ["help", "config-publish", "publish", "dump", "validate"])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(2)
    if not opts:
        # no mode selected: explain how to call the script instead of doing nothing
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-c", "--config-publish"):
            _require_args(args, 1)
            config_file_name = args[0]
            print("[web.instata] processing config file [%s]" % (config_file_name,))
            iwdp.load_config(config_file_name)
            r = iwdp.instata(iwdp.doc_url, iwdp.base_uri)
            print("[web.instata] result is now available at [%s]" % (r,))
        elif opt in ("-p", "--publish"):
            _require_args(args, 2)
            (doc_url, base_uri) = (args[0], args[1])
            print("[web.instata] processing [%s] with base URI [%s] " % (doc_url, base_uri))
            r = iwdp.instata(doc_url, base_uri)
            print("[web.instata] result is now available at [%s]" % (r,))
        elif opt in ("-d", "--dump"):
            _require_args(args, 2)
            (doc_url, base_uri) = (args[0], args[1])
            print("[web.instata] processing [%s] with base URI [%s] " % (doc_url, base_uri))
            iwdp.parse(doc_url, base_uri)
            print(iwdp.dump_data())
        elif opt in ("-v", "--validate"):
            _require_args(args, 2)
            (doc_url, base_uri) = (args[0], args[1])
            print("[web.instata] validating schema ...")
            (summary, detailed_results) = iwdp.validate(doc_url, base_uri)
            if summary:
                print("[web.instata] all column headings in the input file %s seem to be valid Schema.org terms :)" % (doc_url,))
            else:
                print("[web.instata] during the validation at least one column heading in input file %s seems not to be a valid Schema.org term :(" % (doc_url,))
                # per-heading outcome, so the invalid heading(s) can be located
                print(detailed_results)