In [1]:
#!pip install gldb[tutorial]

In [2]:
import gldb

import pandas as pd
import pathlib
import rdflib

## Data Stores

The database shall interact with "**data stores**" (here used as a more generic word for database). Through them, data can be accessed (uploaded and queried).

They can be databases for raw data or for metadata.

Let's write concrete implementations for a CSV store and an in-memory RDF store.

In [5]:
from gldb.query import Query, QueryResult
from gldb.stores import DataStore, RDFStore
import pathlib

import pandas as pd
import rdflib

#!pip install gldb[tutorial]

In [17]:
import gldb


## Data Stores

The database shall interact with "**data stores**" (here used as a more generic word for database). Through them, data can be accessed (uploaded and queried).

They can be databases for raw data or for metadata.

Let's write concrete implementations for a CSV store and an in-memory RDF store.

In [8]:
from gldb.query import Query, QueryResult
from gldb.stores import DataStore, RDFStore

In [None]:
class CSVQuery(Query):
    """Query that filters one table of a store using a query expression."""

    def __init__(self, query, table):
        """Remember what to filter and where.

        Parameters
        ----------
        query
            Expression that is later passed to the table's ``query`` method.
        table
            Key of the table inside ``store.tables``.
        """
        self._table = table
        self._query = query

    def execute(self, store: DataStore, *args, **kwargs):
        """Apply the stored expression to the selected table of ``store``.

        The table is looked up in ``store.tables`` and filtered through its
        ``query`` method. Extra positional and keyword arguments are accepted
        but not used.

        Returns
        -------
        QueryResult
            Result whose ``query`` is this object and whose ``data`` is the
            filtered table.
        """
        selected_table = store.tables[self._table]
        filtered = selected_table.query(self._query)
        return QueryResult(query=self, data=filtered)

In [9]:
class CSVDataStore(DataStore):
    """Data store that keeps CSV files in memory as pandas DataFrames.

    Each uploaded file is read once with ``pandas.read_csv`` and stored in
    ``tables`` under the file's stem (its name without the extension).
    """

    def __init__(self, filenames=None):
        """Create the store and load any initially supplied CSV files.

        Parameters
        ----------
        filenames : iterable of path-like, optional
            CSV files to upload right away. ``None`` creates an empty store.
        """
        self._filenames = []
        self.tables = {}
        # Send constructor-supplied files through upload_file so they are
        # really read. Only recording them would make a later upload_file call
        # consider them loaded although `tables` has no entry for them.
        for filename in filenames or []:
            self.upload_file(filename)

    def upload_file(self, filename: pathlib.Path) -> bool:
        """Read a CSV file into ``tables``; repeated uploads are ignored.

        Parameters
        ----------
        filename : pathlib.Path or str
            Location of the CSV file.

        Returns
        -------
        bool
            Always ``True``, also when the file had been uploaded before.

        Raises
        ------
        Exception
            Whatever ``pandas.read_csv`` raises for a missing or unreadable
            file. In that case the file is not recorded as uploaded.
        """
        path = pathlib.Path(filename)
        # resolve() already yields an absolute path, so one call is enough.
        resolved = path.resolve()
        if resolved in self._filenames:
            return True
        # Read before recording: a failed read must not mark the file as
        # uploaded, otherwise a retry would silently do nothing.
        self.tables[path.stem] = pd.read_csv(path)
        self._filenames.append(resolved)
        return True

In [None]:
class InMemoryRDFDataStore(RDFStore):
    """RDF store that parses registered files into in-memory rdflib graphs.

    Files are only registered by ``upload_file``; they are parsed the first
    time ``graph`` is accessed and the parsed graph is cached per file.
    """

    def __init__(self):
        """Create a store with no registered files and no cached graphs."""
        self._filenames = []
        self._graphs = {}

    def upload_file(self, filename: pathlib.Path) -> bool:
        """Register an RDF file for later parsing.

        Parameters
        ----------
        filename : pathlib.Path or str
            Location of the RDF file.

        Returns
        -------
        bool
            Always ``True``, also when the file was registered before.
        """
        # resolve() already yields an absolute path, so one call is enough.
        resolved = pathlib.Path(filename).resolve()
        # Registering a file twice would only repeat work in `graph`.
        if resolved not in self._filenames:
            self._filenames.append(resolved)
        return True

    @staticmethod
    def _as_uri(node):
        """Turn a blank node into an ``https://example.org/`` URI; return other nodes as they are."""
        if isinstance(node, rdflib.BNode):
            return rdflib.URIRef(f"https://example.org/{node}")
        return node

    @property
    def graph(self) -> rdflib.Graph:
        """Return a new graph containing the triples of all registered files.

        Blank nodes in subject or object position are replaced by URIs under
        ``https://example.org/`` when a file is parsed for the first time.
        """
        combined_graph = rdflib.Graph()
        for filename in self._filenames:
            g = self._graphs.get(filename)
            # Test for None explicitly: a graph without triples is falsy, so
            # `if not g` would parse an empty file again on every access.
            if g is None:
                g = rdflib.Graph()
                g.parse(filename)
                # Loop over a snapshot because the body edits the graph.
                for s, p, o in list(g):
                    new_s = self._as_uri(s)
                    new_o = self._as_uri(o)
                    # Only triples that contained a blank node need replacing.
                    if new_s is not s or new_o is not o:
                        g.remove((s, p, o))
                        g.add((new_s, p, new_o))
                self._graphs[filename] = g
            combined_graph += g
        return combined_graph

## The Database instance

The central class is `GenericLinkedDatabase`. It is created from a dictionary that maps a name to each store instance:

In [11]:
# Bundle both stores in one database object. The dictionary keys are the names
# under which the stores are reached later (db.stores.csv, db.stores.rdf).
db = gldb.GenericLinkedDatabase(
    {
        "csv": CSVDataStore(),
        "rdf": InMemoryRDFDataStore()
    }
)

Populate the stores with data:

In [None]:
# Register every JSON-LD file found in ./data with the RDF store.
# sorted() fixes the upload order, because Path.glob yields its matches in
# arbitrary, platform-dependent order.
for filename in sorted(pathlib.Path("data").glob('*.jsonld')):
    db.stores.rdf.upload_file(filename)

In [12]:
# Load every CSV file found in ./data into the CSV store.
# sorted() fixes the upload order, because Path.glob yields its matches in
# arbitrary, platform-dependent order.
for filename in sorted(pathlib.Path("data").glob('*.csv')):
    db.stores.csv.upload_file(filename)

## Query the RDF store

Every store has a property `query` which returns the query object of the store. In the case of the already implemented `RDFStore`, it is the `SparqlQuery` (also implemented by `gldb`):

Let's formulate the SPARQL query string:

In [14]:
from gldb.query import SparqlQuery

In [15]:
# Select every dcat:Dataset together with its dcterm:creator, keeping only
# creators that are typed as foaf:Person.
query = SparqlQuery("""
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX dcterm: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT * WHERE {
    ?file a dcat:Dataset .
    ?file dcterm:creator ?person .
    ?person a foaf:Person .
}
""")

Unnamed: 0,file,person
0,file:temperature.csv,https://example.org/N769b3ec6e77a40cb9a36f4d0f...


Perform the query:

In [18]:
# Run the SPARQL query against the RDF store of the database.
res = query.execute(db.stores.rdf)

In [None]:
# Last expression of the cell: display the data returned by the SPARQL query.
res.data

## Query the CSV store

In [None]:
# First argument: the filter expression; second argument: the table name.
# CSVDataStore names each table after the stem of its CSV file.
csv_query = CSVQuery("temperature > 23.0", "temperature")

In [34]:
# Run the filter against the CSV store and display the matching rows.
csv_res = csv_query.execute(db.stores.csv)
csv_res.data

# Federated queries

Since raw temperature data is stored in a different database (store) than the metadata, there is the need to combine the data.

This must currently be done using custom functions that return a `FederatedQueryResult`:

In [38]:
def fetch_temperature_dataset(query, table) -> gldb.query.FederatedQueryResult:
    """Combine rows from the CSV store with triples from the RDF store.

    Both stores are taken from the notebook-level ``db`` object.

    Parameters
    ----------
    query
        Filter expression forwarded to ``CSVQuery``.
    table
        Name of the CSV table; also used to build the dataset IRI
        ``file:<table>.csv`` inside the SPARQL query.

    Returns
    -------
    gldb.query.FederatedQueryResult
        ``data`` holds the CSV query result, ``metadata`` the SPARQL result.
    """
    table_result = CSVQuery(query, table).execute(db.stores.csv)

    # NOTE(review): `?s ?p ?o` is not linked to the dataset IRI, so this
    # returns every triple of the graph whenever the dataset triple exists,
    # not only the metadata of the dataset itself. Confirm this is intended.
    sparql_text = f"""
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    SELECT * WHERE {{
        <file:{table}.csv> a dcat:Dataset .
        ?s ?p ?o
    }}
    """
    metadata_result = SparqlQuery(sparql_text).execute(db.stores.rdf)

    return gldb.query.FederatedQueryResult(
        data=table_result.data,
        metadata=metadata_result.data,
    )

Unnamed: 0,s,p,o
0,http://example.org/john_doe,http://xmlns.com/foaf/0.1/name,John Doe
1,file:temperature.csv,http://www.w3.org/ns/dcat#keyword,weather data
2,file:temperature.csv,http://purl.org/dc/terms/created,2024-01-01
3,file:temperature.csv,http://www.w3.org/ns/dcat#keyword,time series
4,https://example.org/N769b3ec6e77a40cb9a36f4d0f...,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://xmlns.com/foaf/0.1/Person
5,file:temperature.csv,http://purl.org/dc/terms/description,CSV file containing temperature data over time...
6,http://example.org/jane_smith,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://xmlns.com/foaf/0.1/Person
7,file:temperature.csv,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/ns/dcat#Dataset
8,http://example.org/john_doe,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://xmlns.com/foaf/0.1/Person
9,http://example.org/jane_smith,http://xmlns.com/foaf/0.1/knows,http://example.org/john_doe


In [None]:
# Fetch the rows of the "temperature" table above 23.0 together with the RDF metadata.
federated_result = fetch_temperature_dataset(query="temperature > 23.0", table="temperature")

In [None]:
# Data part of the federated result: the output of the CSV query.
federated_result.data

In [None]:
# Metadata part of the federated result: the output of the SPARQL query.
federated_result.metadata