## Download data

In [0]:
- Download the file from the Google Drive link provided in the [article](https://medium.com/@ianormy/microsoft-graphrag-with-an-rdf-knowledge-graph-part-2-d8d291a39ed1#2383)


## Prepare Graph DB


1. Create a project named `msft-graphrag` in your workspace.
2. Import `msft-graphrag.owl` and `msft-graphrag-300-chunk-size.ttl` from the `msft-graphrag-ttl.zip` file.
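3. Obtain the Project URL, Username, and Password for the project and set them in `00_config`, which must define the `host`, `repository`, `username`, and `password` variables used by the cells below.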

In [0]:
%pip install SPARQLWrapper openai -q
dbutils.library.restartPython()

In [0]:
%run ./00_config

In [0]:
import pandas as pd
from io import StringIO

from SPARQLWrapper import SPARQLWrapper, CSV

def sparql_query(sparql_conn, query: str):
    """Execute a SPARQL query and return the result set as a pandas DataFrame.

    Args:
        sparql_conn: Connection object exposing ``setQuery``, ``setReturnFormat``
            and ``query`` (a ``SPARQLWrapper`` instance in this notebook). Its
            query text and return format are overwritten by this call.
        query: SPARQL query text to send.

    Returns:
        DataFrame parsed from the CSV payload returned by the endpoint.
    """
    sparql_conn.setQuery(query)
    # Ask for CSV so the response can be handed straight to pandas.
    sparql_conn.setReturnFormat(CSV)
    raw_csv = sparql_conn.query().convert()
    # The converted payload is decoded as UTF-8 before parsing.
    csv_text = raw_csv.decode("utf-8")
    return pd.read_csv(StringIO(csv_text), sep=",")

In [0]:
# `host` and `repository` are not defined in this notebook; presumably they are
# provided by ./00_config (run above) — verify there.
endpoint = f"{host}/repositories/{repository}"     # ← GraphDB query endpoint

In [0]:
# Smoke test: open an authenticated connection to the repository endpoint and
# fetch an arbitrary handful of triples to confirm that the import worked.
sparql_conn = SPARQLWrapper(endpoint)
sparql_conn.setCredentials(username, password)

# Any ten triples are enough to prove connectivity and that data is present.
query = """
select * where {
    ?s ?p ?o .
} limit 10
"""

display(sparql_query(sparql_conn, query))

## Databricks Setup

In [0]:
%pip install databricks-vectorsearch -q
dbutils.library.restartPython()

In [0]:
%run ./00_config

In [0]:
# Create the catalog named by CATALOG unless it already exists
# (IF NOT EXISTS lets this cell be re-run without failing).
# The result is bound to `_` instead of being left as the cell's last expression.
_ = spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}")

In [0]:
# Create the schema {CATALOG}.{SCHEMA} unless it already exists
# (IF NOT EXISTS lets this cell be re-run without failing).
_ = spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")

In [0]:
## Create Volume and Prepare Source Data

In [0]:
# Name of the volume (inside {CATALOG}.{SCHEMA}) that this notebook creates and
# later reads the source parquet file from.
VOLUME = "volume_01"

In [0]:
# Create the volume {CATALOG}.{SCHEMA}.{VOLUME} unless it already exists
# (IF NOT EXISTS lets this cell be re-run without failing).
_ = spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.{VOLUME}")

## Create Mosaic AI Vector Search endpoint

In [0]:
# Create a Mosaic AI Vector Search endpoint. This may take about 10 minutes.
from databricks.vector_search.client import VectorSearchClient

client = VectorSearchClient()

# Only the existence probe is guarded. A failure while *waiting* for an endpoint
# that already exists must surface as-is; previously it was caught by the same
# handler, misread as "endpoint missing", and followed by a create call against
# an endpoint that already exists, hiding the original error.
try:
    client.get_endpoint(name=ENDPOINT_NAME)
except Exception as lookup_error:
    # Any lookup failure is still treated as "endpoint missing" (as before), but
    # the cause is printed so that auth or network problems are not silently hidden.
    print(f"[VS] endpoint '{ENDPOINT_NAME}' が未作成。作成します…")
    print(f"[VS] get_endpoint failed with: {lookup_error!r}")
    client.create_endpoint_and_wait(
        name=ENDPOINT_NAME,
        endpoint_type="STANDARD",
    )
else:
    # The endpoint exists: block until it is ready to serve.
    print(f"[VS] endpoint '{ENDPOINT_NAME}' は既存。ONLINE待機…")
    client.wait_for_endpoint(name=ENDPOINT_NAME)

## Create Mosaic AI Vector Search Index

In [0]:
import os
import pandas as pd
import json

from databricks.vector_search.client import VectorSearchClient
from pyspark.sql.functions import col, expr
from pyspark.sql import types as T

In [0]:
# Directory of the volume created earlier; the entities parquet file is read from here.
graphrag_folder = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}"
# Name and fully qualified name of the Delta table the entities are written to.
SOURCE_TABLE = "create_final_entities"
source_table_fqn = f"{CATALOG}.{SCHEMA}.{SOURCE_TABLE}"

In [0]:
# Read the entities parquet file from the volume directory.
# Prerequisite: create_final_entities.parquet must already have been uploaded to
# `graphrag_folder`; nothing in this notebook copies it there.
sdf = spark.read.parquet(
    os.path.join(graphrag_folder, "create_final_entities.parquet")
)
sdf.display()

In [0]:
# Materialise the entities as a Delta table. Overwrite mode together with
# overwriteSchema replaces both the data and the schema when the cell is re-run.
(
    sdf.write.format("delta")
       .mode("overwrite")
       .option("overwriteSchema", "true")
       .saveAsTable(source_table_fqn)
)

# Turn on Change Data Feed for the table; per the Databricks documentation a
# Delta Sync index requires it on its source table.
spark.sql(f"ALTER TABLE {source_table_fqn} SET TBLPROPERTIES (delta.enableChangeDataFeed=true)")

In [0]:
# Fully qualified name of the vector search index, in the same catalog and schema
# as the source table. INDEX_NAME is presumably defined in ./00_config — verify there.
index_fqn = f"{CATALOG}.{SCHEMA}.{INDEX_NAME}"

In [0]:
# Create the index
# Builds a Delta Sync index on ENDPOINT_NAME over the source table and keeps the
# returned index handle in `index` for the metadata check that follows.
# NOTE(review): unlike the endpoint cell, this call is not guarded by an existence
# check; re-running it once the index exists will presumably raise — confirm.
vsc = VectorSearchClient()
index = vsc.create_delta_sync_index_and_wait(
    endpoint_name=ENDPOINT_NAME,
    source_table_name=source_table_fqn,
    index_name=index_fqn,
    pipeline_type="TRIGGERED",
    primary_key="id",
    # EMBED_DIM is presumably defined in ./00_config and is assumed to match the
    # length of the vectors stored in `description_embedding` — TODO confirm.
    embedding_dimension=EMBED_DIM,
    # Embedding vectors are taken from this existing column of the source table.
    embedding_vector_column="description_embedding",
    # Only the key and the embedding vector are synced into the index.
    columns_to_sync=["id", "description_embedding"],
)

In [0]:
# Check index metadata (such as count and status)
desc = index.describe()
print("Index description:", json.dumps(desc, ensure_ascii=False, indent=2))

In [0]:
# end