# Tutorial

In this tutorial we create a database using two custom data stores:
- raw data store: `CSVDatabase`
- RDF data store: `InMemoryRDFDatabase`

In [None]:
#!pip install gldb[tutorial]

In [None]:
import pathlib
from typing import Dict, List, Union

import pandas as pd
import rdflib

from gldb import RawDataStore, RDFStore, GenericLinkedDatabase
from gldb.query import Query
from gldb.query.rdfstorequery import SparqlQuery
from gldb.stores import DataStoreManager

## Data Stores

The concept defines **data stores**, which are interfaces to databases. They can be RDF databases or traditional databases such as SQL or NoSQL databases.

Let's first write a concrete implementation for each of the two types:

In [None]:
class CSVDatabase(RawDataStore):
    """In-memory raw data store that keeps CSV files as pandas DataFrames.

    Every uploaded file is parsed with ``pandas.read_csv`` and stored in
    ``self.tables`` under the file's stem (file name without suffix).
    """

    def __init__(self):
        # Resolved paths of the files uploaded so far; used to skip re-uploads.
        self._filenames = []
        # Maps table name (file stem) -> DataFrame holding the file's content.
        self.tables = {}
        self._expected_file_extensions = {".csv", }

    @property
    def expected_file_extensions(self):
        """Return the set of file suffixes this store expects (``.csv``)."""
        return self._expected_file_extensions

    def upload_file(self, filename: Union[str, pathlib.Path]) -> bool:
        """Load a CSV file into ``self.tables``.

        Uploading the same (resolved) path again is a no-op.

        Args:
            filename: Path of the CSV file; a plain string is accepted too.

        Returns:
            Always ``True``; failures surface as the exception raised by
            ``pandas.read_csv``.
        """
        path = pathlib.Path(filename)
        resolved = path.resolve()
        if resolved in self._filenames:
            return True
        # Parse first and register afterwards: if reading fails, the file
        # must not be remembered as uploaded, otherwise a retry would
        # silently return True without ever loading the table.
        table = pd.read_csv(path)
        self.tables[path.stem] = table
        self._filenames.append(resolved)
        return True

    def execute_query(self, query: Query):
        """Run ``query`` against the table mapping and return its result."""
        return query.execute(self.tables)


In [None]:
class InMemoryRDFDatabase(RDFStore):
    """In-memory RDF store that merges all uploaded files into one graph.

    Files are only registered on upload; they are parsed the first time
    ``graph`` is accessed and the parsed graph is cached per file.
    """

    def __init__(self):
        # Resolved paths of the registered RDF files, in upload order.
        self._filenames = []
        # Cache: resolved path -> parsed rdflib.Graph (blank nodes replaced).
        self._graphs = {}
        self._expected_file_extensions = {".ttl", ".rdf", ".jsonld"}

    @property
    def expected_file_extensions(self):
        """Return the set of file suffixes this store expects."""
        return self._expected_file_extensions

    def execute_query(self, query: SparqlQuery):
        """Run ``query`` against the combined graph and return its result."""
        return query.execute(self.graph)

    def upload_file(self, filename: Union[str, pathlib.Path]) -> bool:
        """Register an RDF file; parsing is deferred to ``graph``.

        Registering the same (resolved) path again is a no-op, matching
        the behaviour of ``CSVDatabase.upload_file``.

        Returns:
            Always ``True``.
        """
        resolved = pathlib.Path(filename).resolve()
        if resolved not in self._filenames:
            self._filenames.append(resolved)
        return True

    @staticmethod
    def _replace_bnode(node):
        """Return a URIRef under https://example.org/ for a blank node, else ``node``."""
        if isinstance(node, rdflib.BNode):
            return rdflib.URIRef(f"https://example.org/{node}")
        return node

    def _load_graph(self, filename) -> rdflib.Graph:
        """Parse ``filename`` and replace blank-node subjects/objects by URIRefs."""
        g = rdflib.Graph()
        g.parse(filename)
        # Snapshot the triples first: the graph is modified inside the loop,
        # and mutating a graph while iterating over it is unsafe.
        for s, p, o in list(g):
            new_s = self._replace_bnode(s)
            new_o = self._replace_bnode(o)
            # Only touch triples that actually contain a blank node.
            if new_s is not s or new_o is not o:
                g.remove((s, p, o))
                g.add((new_s, p, new_o))
        return g

    @property
    def graph(self) -> rdflib.Graph:
        """Return a new graph holding the triples of all registered files."""
        combined_graph = rdflib.Graph()
        for filename in self._filenames:
            g = self._graphs.get(filename)
            # Compare against None explicitly: an empty Graph is falsy, so a
            # truthiness test would re-parse empty files on every access.
            if g is None:
                g = self._load_graph(filename)
                self._graphs[filename] = g
            combined_graph += g
        return combined_graph

## Core

The core part is a concrete implementation of `GenericLinkedDatabase`:

In [None]:
class GenericLinkedDatabaseImpl(GenericLinkedDatabase):
    """Linked database backed by one RDF store and one CSV store.

    The stores are registered with a ``DataStoreManager`` under the names
    ``"rdf_database"`` and ``"csv_database"``.
    """

    def __init__(self):
        manager = DataStoreManager()
        manager.add_store("rdf_database", InMemoryRDFDatabase())
        manager.add_store("csv_database", CSVDatabase())
        self._store_manager = manager

    @property
    def store_manager(self) -> DataStoreManager:
        """Return the manager holding the registered data stores."""
        return self._store_manager

    def linked_upload(self, filename: Union[str, pathlib.Path]):
        """Not supported by this tutorial implementation.

        Raises:
            NotImplementedError: Always.
        """
        # ``NotImplemented`` is a non-callable constant meant for binary
        # operator methods; calling it fails with a confusing TypeError.
        # The exception class is ``NotImplementedError``.
        raise NotImplementedError("linked_upload not implemented")

## Application

Now, let's instantiate the database and upload the data:

In [None]:
# Create the linked database; its constructor registers the "rdf_database"
# and "csv_database" stores.
db = GenericLinkedDatabaseImpl()

**Upload the semantic metadata**

In [None]:
# Pass every *.jsonld file found directly inside the local "data" directory
# to the store registered as "rdf_database".
# NOTE(review): db[...] presumably looks the store up by name via the gldb
# base class (not shown here) - confirm.
for filename in pathlib.Path("data").glob('*.jsonld'):
    db["rdf_database"].upload_file(filename)

**Upload the raw data**

In [None]:
# Pass every *.csv file found directly inside the local "data" directory
# to the store registered as "csv_database".
# NOTE(review): db[...] presumably looks the store up by name via the gldb
# base class (not shown here) - confirm.
for filename in pathlib.Path("data").glob('*.csv'):
    db["csv_database"].upload_file(filename)

### Query metadata

Get all persons from the metadata:

In [None]:
# SPARQL query that selects every resource typed as foaf:Person.
select_all_persons = SparqlQuery("""
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT * WHERE {
    ?person a foaf:Person .
}
""")

In [None]:
# Run the person query against the store registered as "rdf_database".
res = db.execute_query("rdf_database", select_all_persons)

In [None]:
# Show the bindings of the query result (as the last expression of the cell,
# the notebook displays its value).
res.bindings

### Query data

To query the CSV data, we first need to implement a query class `CSVQuery`:

In [None]:
class CSVQuery(Query):
    """Query that filters one named table with a pandas query expression."""

    def __init__(self, table_name: str, query: str):
        """Store the target table and the filter expression.

        Args:
            table_name: Key of the table to query in the table mapping.
            query: Expression passed to ``pandas.DataFrame.query``.
        """
        self.table_name = table_name
        self.query = query

    def execute(self, tables: Dict[str, pd.DataFrame]):
        """Apply the stored expression to the selected table.

        Args:
            tables: Mapping of table name to DataFrame.

        Returns:
            The result of ``DataFrame.query`` on the selected table.

        Raises:
            KeyError: If ``table_name`` is not a key of ``tables``.
        """
        try:
            table = tables[self.table_name]
        except KeyError:
            # Same exception type as the plain lookup, but say which tables
            # do exist so a typo in the table name is easy to spot.
            raise KeyError(
                f"Unknown table {self.table_name!r}; "
                f"available tables: {sorted(tables)}"
            ) from None
        return table.query(self.query)

Find all values of column "temperature" above 23 in table "temperature":

In [None]:
# Filter expression 'temperature > 23.0' targeting the table named "temperature".
temperatures_above_23 = CSVQuery(table_name="temperature",  query='temperature > 23.0')

In [None]:
# Run the CSV query against the store registered as "csv_database".
db.execute_query("csv_database", temperatures_above_23)