[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/p-lod/p-lod-cookbook/blob/main/p-lod-local-rdf.ipynb)

In [None]:
# rdflib isn't in Colab's default set of modules. The try block will install if import fails.
import sys
try:
    import rdflib
except ImportError:
    !{sys.executable} -m pip install -q rdflib
    import rdflib

import gzip
import matplotlib
import pandas as pd
import requests

from string import Template

In [None]:
# URL of the gzip'd N-Triples (.nt) file. It lives in a separate GitHub repo set up just for
# availability of large files.
url = "https://github.com/p-lod/downloads/releases/download/latest/p-lod-rdf-graph.nt.gz"


In [None]:
# Stream the gzip-compressed .nt file from `url` and parse it directly into an rdflib graph `g`,
# without saving the download to disk first.
#
# This cell usually takes between 25 and 90 seconds to run locally. Longer in Google Colab. It seems
# to depend on external factors such as any at-the-moment caching by the GitHub CDN and overall
# connection speed. Various setups were tested. Streamed loading of a gzip-compressed .nt file
# seemed fastest. GitHub releases wasn't the absolute fastest CDN but is straightforward and not
# much slower than Cloudflare R2.
#
# The (connect, read) timeout makes the cell raise an error instead of hanging forever if the
# server stops responding. The read timeout bounds each wait for more data on the socket, not the
# total download time, so a slow-but-steady download is not cut off.

with requests.get(url, stream=True, allow_redirects=True, timeout=(10, 120)) as r:
    # Fail early on HTTP errors (e.g. 404) rather than trying to gunzip an error page.
    r.raise_for_status()
    # r.raw is the undecoded byte stream; GzipFile decompresses it on the fly for the parser.
    with gzip.GzipFile(fileobj=r.raw) as f:
        g = rdflib.Graph().parse(f, format='nt')

print(f"Graph loaded from URL contains {len(g)} triples.")

In [None]:
# Add urn:p-lod:id: as the default (empty) prefix in g. The SPARQL queries in the cells below
# use short names such as :depicts without declaring a PREFIX of their own.
g.bind("", "urn:p-lod:id:")

In [None]:
# Count, for every concept reachable via one or more :broader steps from the chosen identifier,
# how many components depict it. Should run pretty quickly: under a second.

identifier = "olympian_deity"

# string.Template only touches $identifier, so the SPARQL braces can be written as-is.
query_template = Template("""
SELECT ?depicted (COUNT(?depicted) as ?count) WHERE {
    ?component :depicts ?depicted .
    ?depicted :broader+ :$identifier .
} GROUP BY ?depicted
""")
q = query_template.substitute(identifier=identifier)

results = g.query(q)

# Use the query's variable names as the DataFrame column headers.
column_names = list(map(str, results.vars))
df = pd.DataFrame(results, columns=column_names)

# Make sure the count column is numeric so it can be sorted and plotted.
df['count'] = pd.to_numeric(df['count'])

df.head()


In [None]:
# Simple visualization: one bar per depicted concept, highest count first.

ranked_df = df.sort_values(by="count", ascending=False)
ranked_df.plot.bar(x="depicted")

In [None]:
# A more complex query that associates space characterizations (basically room function)
# with depicted concepts (basically motifs such as "garland" or "ariadne").
# Usually runs in under 5 seconds on an M4 MacBook with 16 gig of RAM. Longer on Colab.

# The characterization is OPTIONAL, so spaces without one still appear in the result.
space_concept_query = """SELECT DISTINCT ?space ?characterization ?feature ?concept WHERE {
  ?component :depicts ?concept .
  ?concept a :concept .

  ?component :is-part-of+/:created-on-surface-of ?feature .
  ?feature :spatially-within ?space .
  OPTIONAL { ?space :has-space-characterization ?characterization}

} ORDER BY ?space ?feature ?concept """

results = g.query(space_concept_query)

# Column headers come from the query's variable names.
sfc_columns = list(map(str, results.vars))
sfc_df = pd.DataFrame(results, columns=sfc_columns)


display(sfc_df)

In [None]:
# Utility cell: write the graph g back out as a gzip-compressed N-Triples file.
# This is only one half of the process and doesn't really belong here. Ignore for now.

output_path = "p-lod-rdf-graph.nt.gz"
with gzip.open(output_path, mode="wb") as gz_out:
    g.serialize(destination=gz_out, format="nt")
print("RDF graph saved as 'p-lod-rdf-graph.nt.gz'")