Permalink
Browse files

implemented config-based publishing

  • Loading branch information...
1 parent 6bb192e commit c1d0cf1c36bbc70b8db87c14fc5156075191a06b mhausenblas committed Aug 1, 2011
Showing with 126 additions and 22 deletions.
  1. +40 −1 README.md
  2. +22 −0 web.instata.config
  3. +64 −21 web.instata.py
View
@@ -17,7 +17,7 @@ Turn your plain old tabular data (POTD) into Web data with web.instata: it takes
## Usage
-### Publishing
+### Simple publishing
In order to publish a HTML+microdata document from a CSV file:
@@ -65,6 +65,45 @@ The generated HTML document, [potd_0.html](https://raw.github.com/mhausenblas/we
</tbody>
</table>
+### Configuration-based publishing
+
+A more flexible but also slightly more complex case is that of using a web.instata configuration file to specify input and output as well as schema matching options. The syntax of the web.instata configuration file is [Turtle](http://www.w3.org/TeamSubmission/turtle/).
+
+In order to publish a HTML+microdata document from a CSV file using a configuration file:
+
+ python web.instata.py -c {path to configuration file}
+
+Example:
+
+ python web.instata.py -c web.instata.config
+
+... where a configuration file looks as follows:
+
+ @prefix dc: <http://purl.org/dc/terms/> .
+ @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+ @prefix c: <#> .
+
+ c:default-config
+ # publishing options
+ c:csv_input "test/potd_0.csv" ;
+ c:output_base_uri <http://example.org/instata/potd_0> ;
+ c:schema_matching "dbpedia-2011-07-31.rdf" ;
+
+ # directory and file options
+ c:templates_dir "templates/" ;
+ c:mappings_dir "mappings/" ;
+ c:output_dir "output/" ;
+ c:base_template "base.tpl" ;
+ c:base_style_file "web.instata-style.css" ;
+
+ # metadata about the config file
+ dc:title "The default configuration for web.instata" ;
+ dc:modified "2011-08-01"^^xsd:date ;
+ dc:creator <http://sw-app.org/mic.xhtml#i> ;
+ .
+
+Note that in the configuration file you can specify one or more schema matchings (via `c:schema_matching`) as well as customise the output (`c:base_template` as well as `c:base_style_file`). The last block (metadata) is for completeness purposes and currently not used by web.instata - you may remove it if you want.
+
### Data dump
In order to get a RDF/Turtle data dump from a CSV file:
View
@@ -0,0 +1,22 @@
+@prefix dc: <http://purl.org/dc/terms/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix c: <#> .
+
+c:default-config
+ # publishing options
+ c:csv_input "test/potd_0.csv" ;
+ c:output_base_uri <http://example.org/instata/potd_0> ;
+ c:schema_matching "dbpedia-2011-07-31.rdf" ;
+
+ # directory and file options
+ c:templates_dir "templates/" ;
+ c:mappings_dir "mappings/" ;
+ c:output_dir "output/" ;
+ c:base_template "base.tpl" ;
+ c:base_style_file "web.instata-style.css" ;
+
+ # metadata about the config file
+ dc:title "The default configuration for web.instata" ;
+ dc:modified "2011-08-01"^^xsd:date ;
+ dc:creator <http://sw-app.org/mic.xhtml#i> ;
+.
View
@@ -35,7 +35,7 @@ class InstantWebDataPublisher(object):
OUTPUT_DIR = 'output/'
DBPEDIA2SCHEMA = 'dbpedia-2011-07-31.rdf'
- BASE_TEMPLATE = TEMPLATES_DIR + 'base.tpl'
+ BASE_TEMPLATE = 'base.tpl'
BASE_STYLE_FILE = 'web.instata-style.css'
JQUERY_UI_CSS = 'jquery-ui-1.8.4.custom.css'
DTABLE_CSS = 'd_table.css'
@@ -44,12 +44,18 @@ class InstantWebDataPublisher(object):
JQUERY_DTABLE = 'jquery.dataTables.min.js'
# internal stuff
- NAMESPACES = { 'schema' : Namespace('http://schema.org/'),
+ NAMESPACES = { 'c' : Namespace('#'),
+ 'schema' : Namespace('http://schema.org/'),
'scsv' : Namespace('http://purl.org/NET/schema-org-csv#'),
'dc' : Namespace('http://purl.org/dc/terms/'),
'owl' : Namespace('http://www.w3.org/2002/07/owl#'),
'rdfs' : Namespace('http://www.w3.org/2000/01/rdf-schema#')
}
+
+ # config query:
+ CONFIG_QUERY = """SELECT ?config ?key ?value WHERE { ?config ?key ?value .}"""
+
+
# content creation queries:
HEADER_QUERY = """SELECT ?cell ?colTitle WHERE {
@@ -77,10 +83,40 @@ class InstantWebDataPublisher(object):
def __init__(self):
self.g = None
+ self.config = None
self.doc_url = ""
self.base_uri = ""
+ self.schema_matching = [InstantWebDataPublisher.DBPEDIA2SCHEMA]
+ self.matches = {}
if not os.path.exists(InstantWebDataPublisher.OUTPUT_DIR): # make sure output dir exists
os.makedirs(InstantWebDataPublisher.OUTPUT_DIR)
+
+ def load_config(self, config_file_name):
+ self.config = ConjunctiveGraph("IOMemory")
+ self.config.parse(location=config_file_name, format="n3")
+ res = self.config.query(InstantWebDataPublisher.CONFIG_QUERY, initNs=InstantWebDataPublisher.NAMESPACES)
+ self.schema_matching = [] # make sure to reset the default (DBpedia if config is evaluated)
+ # ?config ?key ?value
+ for r in res:
+ key = str(r[1])
+ val = str(r[2])
+ if 'csv_input' in key: self.doc_url = val
+ if 'output_base_uri' in key: self.base_uri = val
+ if 'schema_matching' in key: self.schema_matching.append(val)
+ if 'templates_dir' in key: InstantWebDataPublisher.TEMPLATES_DIR = val
+ if 'mappings_dir' in key: InstantWebDataPublisher.MAPPINGS_DIR = val
+ if 'output_dir' in key: InstantWebDataPublisher.OUTPUT_DIR = val
+ if 'base_template' in key: InstantWebDataPublisher.BASE_TEMPLATE = val
+ if 'base_style_file' in key: InstantWebDataPublisher.BASE_STYLE_FILE = val
+ if InstantWebDataPublisher.DEBUG:
+ print('csv_input = %s' %self.doc_url)
+ print('output_base_uri = %s' %self.base_uri)
+ print('schema_matching = %s' %self.schema_matching)
+ print('templates_dir = %s' %InstantWebDataPublisher.TEMPLATES_DIR)
+ print('mappings_dir = %s' %InstantWebDataPublisher.MAPPINGS_DIR)
+ print('output_dir = %s' %InstantWebDataPublisher.OUTPUT_DIR)
+ print('base_template = %s' %InstantWebDataPublisher.BASE_TEMPLATE)
+ print('base_style_file = %s' %InstantWebDataPublisher.BASE_STYLE_FILE)
def parse(self, doc_url, base_uri):
self.doc_url = doc_url
@@ -108,25 +144,25 @@ def parse(self, doc_url, base_uri):
self.terms.append(r[2])
def load_mappings(self):
- print('[web.instata] loading DBpedia2Schema.org mapping ...')
- self.dbpedia2schema = ConjunctiveGraph("IOMemory")
- self.dbpedia2schema.parse(location=InstantWebDataPublisher.MAPPINGS_DIR + InstantWebDataPublisher.DBPEDIA2SCHEMA)
- print('[web.instata] got DBpedia2Schema.org mapping!')
+ for m in self.schema_matching: # defaults to DBpedia; can be overwritten by c:schema_matching in the config file
+ print('[web.instata] loading %s mapping ...' %(InstantWebDataPublisher.MAPPINGS_DIR + m))
+ self.dbpedia2schema = ConjunctiveGraph("IOMemory")
+ self.dbpedia2schema.parse(location=InstantWebDataPublisher.MAPPINGS_DIR + m)
+ print('[web.instata] got %s mapping!' %(InstantWebDataPublisher.MAPPINGS_DIR + m))
- # going through the cell types from the parse and render steps to find matching terms in DBpedia:
- self.matches = {}
- for t in self.terms:
- q = (InstantWebDataPublisher.MATCHED_TERMS_QUERY %(str(t)))
- print('[web.instata] trying to find a match for %s' %(str(t)))
- res = self.dbpedia2schema.query(q, initNs=InstantWebDataPublisher.NAMESPACES)
- # ?match ?prop
- for r in res:
- if InstantWebDataPublisher.DEBUG: print('[web.instata:DEBUG] row: %s' %(r))
- self.matches[str(t)] = (str(r[1]), str(r[0]))
- print('[web.instata] match(es) found: %s' %(self.matches))
+ # going through the cell types from the parse and render steps to find matching terms in DBpedia:
+ for t in self.terms:
+ q = (InstantWebDataPublisher.MATCHED_TERMS_QUERY %(str(t)))
+ print('[web.instata] trying to find a match for %s' %(str(t)))
+ res = self.dbpedia2schema.query(q, initNs=InstantWebDataPublisher.NAMESPACES)
+ # ?match ?prop
+ for r in res:
+ if InstantWebDataPublisher.DEBUG: print('[web.instata:DEBUG] row: %s' %(r))
+ self.matches[str(t)] = (str(r[1]), str(r[0]))
+ print('[web.instata] match(es) found: %s' %(self.matches))
def render(self):
- tpl = SimpleTemplate(name=InstantWebDataPublisher.BASE_TEMPLATE)
+ tpl = SimpleTemplate(name=InstantWebDataPublisher.TEMPLATES_DIR + InstantWebDataPublisher.BASE_TEMPLATE)
wi_last_update = datetime.datetime.utcnow().replace(microsecond = 0)
return tpl.render(ds_name=self.doc_url.split('/')[-1].split('.')[0],
theader=sorted(self.table_header, key=lambda cell: cell[0]),
@@ -137,9 +173,10 @@ def render(self):
def instata(self, doc_url, base_uri):
# get the data and render it
self.parse(doc_url, base_uri)
- self.load_mappings() # try to find matches for Schema.org terms in the mapping files
+ if len(self.schema_matching) > 0 : # schema matching is enabled
+ self.load_mappings() # try to find matches for Schema.org terms in the mapping files
render_result = self.render()
-
+
# output the result
result_file_name = InstantWebDataPublisher.OUTPUT_DIR + doc_url.split('/')[-1].split('.')[0] + ".html"
result_file = open(result_file_name, 'w')
@@ -173,11 +210,17 @@ def usage():
iwdp = InstantWebDataPublisher()
try:
- opts, args = getopt.getopt(sys.argv[1:], "hpd", ["help", "publish", "dump"])
+ opts, args = getopt.getopt(sys.argv[1:], "hcpd", ["help", "config-publish", "publish", "dump"])
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
+ elif opt in ("-c", "--config-publish"):
+ config_file_name = args[0]
+ print("[web.instata] processing config file [%s]" %(config_file_name))
+ iwdp.load_config(config_file_name)
+ r = iwdp.instata(iwdp.doc_url, iwdp.base_uri)
+ print("[web.instata] result is now available at [%s]" %(r))
elif opt in ("-p", "--publish"):
(doc_url, base_uri) = (args[0], args[1])
print("[web.instata] processing [%s] with base URI [%s] " %(doc_url, base_uri))

0 comments on commit c1d0cf1

Please sign in to comment.