# SPARQL queries of CWLProv provenance files
This document provides an overview of different SPARQL queries, together with their (expected) result.

## Import modules & queries

In [1]:
import os
import rdflib
from pathlib import Path
from rdflib.plugins.sparql import prepareQuery
from rdflib.namespace import Namespace
import pandas as pd

In [2]:
cwd = Path(os.getcwd())
queries_dir = cwd.parent / 'queries'

In [3]:
SCHEMA = Namespace("http://schema.org/")
WFDESC = Namespace("http://purl.org/wf4ever/wfdesc#")

## Functions

In [4]:
def run_query(rdf_file, query_file, namespaces):
    """
    rdf_file = RDF file; query_file = path to sparql query file.
    """
    g = rdflib.Graph()
    g.parse(rdf_file)
    with open(query_file, 'r')  as f:
        query_string = f.read()
        query = prepareQuery(
            queryString = query_string,
            initNs = namespaces,
        )

    print(f"SPARQL QUERY IS:\n{query}")
    
    qres = g.query(query)
    
    results = pd.DataFrame(qres.bindings).map(str).rename(columns=str)
    return results

In [5]:
def extract_wf_namespace(rdf_file):
    """
    Function which extracts namespace from CWLProv RDF provenance graph.
    """
    g = rdflib.Graph()
    g.parse(rdf_file)
    namespaces = list(g.namespaces())
    wf_namespace = ""
    for ns in namespaces:
        (prefix, namespace) = ns
        if prefix == "wf":
            wf_namespace = namespace

    return wf_namespace

## SPARQL queries

Return the doc, label, and intent fields of the main workflow.

In [6]:
provenance_file = cwd.parent / "data/scenario1/ro/metadata/provenance/primary.cwlprov.ttl" 

wf_namespace = extract_wf_namespace(provenance_file)

namespaces = {"wf": wf_namespace, 
              "wfdesc": WFDESC,
              "schema": SCHEMA }
extract_wf_doc_query = queries_dir / "wf_metadata_fields.sparql"
response = run_query(provenance_file, extract_wf_doc_query, namespaces)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x118c048f0>
      doc                                  intent     label  \
0  WF_doc  http://edamontology.org/operation_0004  WF_label   

                                                  wf  
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  


Return doc and label fields of every workflow step.

In [13]:
namespaces = {"wf": wf_namespace, 
              "wfdesc": WFDESC,
              "schema": SCHEMA }
extract_wf_step_doc_query = queries_dir / "wf_step_metadata_fields.sparql"
response = run_query(provenance_file, extract_wf_step_doc_query, namespaces)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x11926f950>
           doc          label  \
0  wf_step_doc  wf_step_label   

                                             main_wf  \
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...   

                                        main_wf_step  
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  


Return doc, label, and intent fields of every command-line tool/nested workflow that is run by each of the steps.

In [7]:
namespaces = {"wf": wf_namespace, 
              "wfdesc": WFDESC,
              "schema": SCHEMA }
extract_clt_doc_query = queries_dir / "clt_nested_wf_metadata_fields.sparql"
response = run_query(provenance_file, extract_clt_doc_query, namespaces)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x107fab410>
       doc                                  intent      label  \
0  CLT_doc  http://edamontology.org/operation_0004  CLT_label   

                                             main_wf  
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  


List doc, label, format fields of all input parameters of main workflow.

In [8]:
namespaces = {"wf": wf_namespace, 
              "wfdesc": WFDESC,
              "schema": SCHEMA }
extract_wf_inputs_query = queries_dir / "wf_input_params_metadata_fields.sparql"
response = run_query(provenance_file, extract_wf_inputs_query, namespaces)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x11926ed50>
                  doc                                             format  \
0  wf_input_param_doc  https://www.iana.org/assignments/media-types/t...   

                  label                                                 wf  
0  wf_input_param_label  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  


List doc, label, format fields of all output parameters of main workflow.

In [9]:
namespaces = {"wf": wf_namespace, 
              "wfdesc": WFDESC,
              "schema": SCHEMA }
extract_wf_outputs_query = queries_dir / "wf_output_params_metadata_fields.sparql"
response = run_query(provenance_file, extract_wf_outputs_query, namespaces)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x10a14c5c0>
                   doc                                             format  \
0  wf_output_param_doc  https://www.iana.org/assignments/media-types/t...   

                   label                                                 wf  
0  wf_output_param_label  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  


List doc, label, format fields of all input parameters of nested workflows/commandlinetools.

In [10]:
namespaces = {"wf": wf_namespace, 
              "wfdesc": WFDESC,
              "schema": SCHEMA }
extract_wf_inputs_query = queries_dir / "clt_nested_wf_input_params_metadata_fields.sparql" 
response = run_query(provenance_file, extract_wf_inputs_query, namespaces)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x118c050d0>
                                                 clt  \
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...   

                                             main_wf            param_doc  \
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  CLT_input_param_doc   

                                        param_format            param_label  
0  https://www.iana.org/assignments/media-types/t...  CLT_input_param_label  


List doc, label, format fields of all output parameters of nested workflows/commandlinetools.

In [11]:
namespaces = {"wf": wf_namespace, 
              "wfdesc": WFDESC,
              "schema": SCHEMA }
extract_wf_outputs_query = queries_dir / "clt_nested_wf_output_params_metadata_fields.sparql" 
response = run_query(provenance_file, extract_wf_outputs_query, nam espaces)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x1192e2690>
                                                 clt  \
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...   

                                             main_wf             param_doc  \
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  CLT_output_param_doc   

                                        param_format             param_label  
0  https://www.iana.org/assignments/media-types/t...  CLT_output_param_label  
