# Running the Calcite SPARQL adapter using Apache Arrow

Inspired by https://uwekorn.com/2020/12/30/fast-jdbc-revisited.html

In [None]:
# Install pyarrow for the current user.
# NOTE(review): jpype is imported in the next cell but is not installed here;
# presumably it is expected to be available already — confirm.
!python3 -m pip install --user pyarrow

In [2]:
import jpype.imports
import os
import sys

from pyarrow.jvm import record_batch

In [3]:
# Build calcite-sparql-core first ("mvn package") so that this JAR exists.
jar = "../../core/target/calcite-sparql-core-0.0.1-SNAPSHOT.jar"

# These environment settings are not always needed, but most of the time they
# are, because otherwise the JVM is started without the needed JAR file (see
# for instance https://github.com/baztian/jaydebeapi/issues/85).
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk/",
        "CLASSPATH": jar,
    }
)

# JAVA_HOME is set before the JVM path is looked up; the class path is also
# handed to the JVM explicitly as a startup option.
jvm_path = jpype.getDefaultJVMPath()
args = f"-Djava.class.path={jar}"
jpype.startJVM(jvm_path, args)

In [4]:
# Java dependencies

from java.lang import Class
from java.sql import DriverManager

from org.apache.arrow.adapter.jdbc import JdbcToArrowUtils, JdbcToArrowConfigBuilder
from org.apache.arrow.memory import RootAllocator
from org.apache.arrow.vector import VectorSchemaRoot

In [None]:
# Make sure the drivers are loaded. Without these lines, getConnection may fail.
# Class.forName looks each driver class up by its fully qualified name;
# presumably loading the class is what registers it with DriverManager — TODO confirm.

Class.forName("org.apache.calcite.avatica.remote.Driver")
Class.forName("org.apache.jena.jdbc.remote.RemoteEndpointDriver")

In [6]:
# Assemble the JDBC-to-Arrow configuration that run_query() reads as a global.
config_builder = JdbcToArrowConfigBuilder()

# NOTE(review): sys.maxsize is presumably the allocator's memory limit,
# i.e. effectively unbounded — confirm against the Arrow memory docs.
ra = RootAllocator(sys.maxsize)
config_builder.setAllocator(ra)

# Calendar obtained from JdbcToArrowUtils.getUtcCalendar().
calendar = JdbcToArrowUtils.getUtcCalendar()
config_builder.setCalendar(calendar)

# NOTE(review): -1 presumably disables the batch size limit so that a whole
# result set lands in one batch — confirm against the Arrow JDBC adapter docs.
config_builder.setTargetBatchSize(-1)

pyarrow_jdbc_config = config_builder.build()

In [7]:
def run_query(query):
    """Run *query* over the JDBC connection and return the result via Arrow.

    The function reads two module-level globals defined in other cells:
    ``connection`` (the JDBC connection) and ``pyarrow_jdbc_config`` (the
    JDBC-to-Arrow configuration, which also supplies the allocator).

    Args:
        query: Query text passed unchanged to ``Statement.executeQuery``.

    Returns:
        The value of ``record_batch(root).to_pandas()`` for the loaded
        result set (presumably a pandas DataFrame — see the pyarrow docs).

    Raises:
        Any exception raised by the JDBC driver or the Arrow adapter is
        propagated unchanged; the statement is closed in every case.
    """
    stmt = connection.createStatement()
    try:
        # Everything after createStatement() sits inside the try so that a
        # failing query or schema conversion cannot leak the open statement.
        result_set = stmt.executeQuery(query)

        root = VectorSchemaRoot.create(
            JdbcToArrowUtils.jdbcToArrowSchema(
                result_set.getMetaData(),
                pyarrow_jdbc_config,
            ),
            pyarrow_jdbc_config.getAllocator(),
        )

        try:
            JdbcToArrowUtils.jdbcToArrowVectors(result_set, root, pyarrow_jdbc_config)
            return record_batch(root).to_pandas()
        finally:
            # Ensure that we clear the JVM memory
            root.clear()
    finally:
        # Runs after root.clear() (same order as before), and also when
        # root.clear() itself raises.
        stmt.close()

In [8]:
# Open the JDBC connection that run_query() reads as a module-level global.
# The URL names a Calcite model file by relative path and sets lex=JAVA.
# NOTE(review): the relative path presumably resolves against the notebook's
# working directory — confirm.
connection = DriverManager.getConnection("jdbc:calcite:model=../java/src/main/resources/modelClassRemote.json;lex=JAVA")

In [None]:
# Sample query: the listed columns of the Company table, limited to 10 rows.
query = """
SELECT w3_label, w3_comment, w3_sameAs, xmlns_name, xmlns_homepage, xmlns_depiction
FROM Company
LIMIT 10
"""

# Kept as the last expression of the cell so the notebook shows the returned value.
run_query(query)