## Practice OBDA for Parquet files 

---
This Jupiter Notebook creates a parquet data base, creates and ontology and then constructs an OBDA map to SPARQL query the Parquet database
---

Create Parquet schema and instances and write parquet file

In [1]:
# # import Parquet libraries
# import pandas as pd
# import pyarrow as pa
# import pyarrow.parquet as pq

# # Define a schema and print
# schema = pa.schema([
#     ('Name', pa.string()),
#     ('Age', pa.int32()),
#     ('City', pa.string())
# ])
# # print(schema)

# # Sample data
# data = {
#     'Name': ['Alice', 'Bob', 'Charlie', 'David'],
#     'Age': [25, 30, 35,40],
#     'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
# }

# # Create a Pandas DataFrame
# df = pd.DataFrame(data)

# # Convert the Pandas DataFrame to a PyArrow Table
# table = pa.Table.from_pandas(df, schema=schema)


# # Write the PyArrow Table to a Parquet file
# parquet_file = 'my_Parquet-file.parquet'
# pq.write_table(table, parquet_file)

# # Read the Parquet file back into a PyArrow Table
# table_read = pq.read_table(parquet_file)

# # Convert the PyArrow Table back to a Pandas DataFrame
# df_read = table_read.to_pandas()

# # Display the DataFrame
# print("Data read from Parquet file with schema:")
# print(df_read)


Create Ontology from Parquet schema and perform a SPARQL query

In [3]:
# === Step 1: Create and read the Parquet file ===
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Define the schema for the Parquet file
schema = pa.schema([
    ('Name', pa.string()),
    ('Age', pa.int32()),
    ('City', pa.string())
])

# Sample data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
}
df = pd.DataFrame(data)

# Create a PyArrow Table and write to a Parquet file
table = pa.Table.from_pandas(df, schema=schema)
parquet_file = 'my_Parquet-file.parquet'
pq.write_table(table, parquet_file)

# Read the Parquet file back into a Pandas DataFrame
table_read = pq.read_table(parquet_file)
df_read = table_read.to_pandas()

print("Data read from Parquet file:")
print(df_read)

# === Step 2: Create an ontology based solely on the Parquet file ===
from owlready2 import get_ontology, Thing, DataProperty, FunctionalProperty

# Create an ontology (the base IRI can be any URI you choose)
onto = get_ontology("http://example.org/parquet_ontology.owl#")

with onto:
    # Define a generic class for the records; here we call it "Record"
    class Record(Thing):
        pass

    # Define properties for each column in the DataFrame
    class hasName(DataProperty, FunctionalProperty):
        domain = [Record]
        range = [str]

    class hasAge(DataProperty, FunctionalProperty):
        domain = [Record]
        range = [int]

    class hasCity(DataProperty, FunctionalProperty):
        domain = [Record]
        range = [str]

    # Create individuals (instances of Record) from each row in the DataFrame
    for index, row in df_read.iterrows():
        # Use a unique identifier for each individual (e.g., "Record_0", "Record_1", etc.)
        individual = Record(f"Record_{index}")
        individual.hasName = row['Name']
        individual.hasAge = row['Age']
        individual.hasCity = row['City']

# Save the ontology to an OWL file (RDF/XML format)
ontology_file = "parquet_ontology.owl"
onto.save(file=ontology_file, format="rdfxml")
print(f"\nOntology saved to {ontology_file}")

# === Step 3: Load the ontology with rdflib and run a SPARQL query ===
from rdflib import Graph, Namespace
from rdflib.namespace import XSD

# Load the saved ontology file
g = Graph()
g.parse(ontology_file)

# Define a namespace for the ontology
EX = Namespace("http://example.org/parquet_ontology.owl#")
g.bind("ex", EX)

# SPARQL query to mimic the DataFrame filter: select records with Age > 36
sparql_query = """
PREFIX ex: <http://example.org/parquet_ontology.owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?record ?name ?age ?city
WHERE {
    ?record a ex:Record .
    ?record ex:hasName ?name .
    ?record ex:hasAge ?age .
    ?record ex:hasCity ?city .
    FILTER(xsd:integer(?age) > 20)
}
"""

results = g.query(sparql_query)

print("\nSPARQL query results (Records with Age > 36):")
for row in results:
    print(f"Record: {row.record}, Name: {row.name}, Age: {row.age}, City: {row.city}")

Data read from Parquet file:
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston

Ontology saved to parquet_ontology.owl

SPARQL query results (Records with Age > 36):
Record: http://example.org/parquet_ontology.owl#Record_0, Name: Alice, Age: 25, City: New York
Record: http://example.org/parquet_ontology.owl#Record_1, Name: Bob, Age: 30, City: Los Angeles
Record: http://example.org/parquet_ontology.owl#Record_2, Name: Charlie, Age: 35, City: Chicago
Record: http://example.org/parquet_ontology.owl#Record_3, Name: David, Age: 40, City: Houston


In [None]:
# from owlready2 import *
# from rdflib import Graph

# # Step 1: Create the ontology using owlready2
# def create_ontology_with_owlready2():
#     # Create a new ontology
#     onto = get_ontology("http://example.org/ontology#")

#     # Define the Person class
#     # class Person(Thing):
#         namespace = onto

#     # Define properties
#     class hasName(DataProperty, FunctionalProperty):
#         namespace = onto
#         domain = [Person]
#         range = [str]

#     class hasAge(DataProperty, FunctionalProperty):
#         namespace = onto
#         domain = [Person]
#         range = [int]

#     class hasCity(DataProperty, FunctionalProperty):
#         namespace = onto
#         domain = [Person]
#         range = [str]

#     # Sample data
#     data = {
#         'Name': ['Alice', 'Bob', 'Charlie', 'David'],
#         'Age': [25, 30, 35, 40],
#         'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
#     }

#     # Add individuals to the ontology
#     for i in range(len(data['Name'])):
#         person = Person(name=data['Name'][i])  # Create an individual
#         person.hasName = data['Name'][i]       # Assign hasName property
#         person.hasAge = data['Age'][i]         # Assign hasAge property
#         person.hasCity = data['City'][i]       # Assign hasCity property

#     # Save the ontology to an OWL file
#     onto.save(file="my_Parquet_ontology.owl", format="rdfxml")
#     print("Ontology created and saved to 'my_Parquet_ontology.owl'")

# # Step 2: Query the ontology using rdflib
# def query_ontology_with_rdflib():
#     # Load the ontology into an RDF graph
#     g = Graph()
#     g.parse("my_Parquet_ontology.owl", format="xml")

#     # Define a SPARQL query to retrieve all persons and their details
#     sparql_query = """
#     PREFIX ex: <http://example.org/ontology#>
#     SELECT ?person ?name ?age ?city
#     WHERE {
#         ?person a ex:Person .
#         ?person ex:hasName ?name .
#         ?person ex:hasAge ?age .
#         ?person ex:hasCity ?city .
#     }
#     """

#     # Execute the SPARQL query
#     results = g.query(sparql_query)

#     # Print the results
#     print("\nResults from SPARQL query:")
#     for row in results:
#         print(f"Person: {row.person}, Name: {row.name}, Age: {row.age}, City: {row.city}")

# # Main program
# if __name__ == "__main__":
#     # Step 1: Create the ontology
#     create_ontology_with_owlready2()

#     # Step 2: Query the ontology
#     query_ontology_with_rdflib()