In [2]:
# %pip install --upgrade --quiet  langchain langchain-community langchain-openai langchain-experimental neo4j wikipedia tiktoken yfiles_jupyter_graphs

In [3]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Tuple, List, Optional
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_community.graphs import Neo4jGraph
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.runnables import ConfigurableField, RunnableParallel, RunnablePassthrough

try:
  import google.colab
  from google.colab import output
  output.enable_custom_widget_manager()
except:
  pass


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
import os
import pandas as pd
from dotenv import load_dotenv
from neo4j import GraphDatabase
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.retrievers import VectorRetriever
from neo4j_graphrag.generation import GraphRAG
from neo4j_graphrag.embeddings import OpenAIEmbeddings
import numpy as np

In [5]:
load_dotenv()
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or os.getenv("GRAPHRAG_LLM_API_KEY")

# ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_DIR = r"/home/sinju/Documents/Nextwave/RAG/test_rag/output"
INDEX_NAME = "text_units"

In [6]:

# Setup driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

# Setup embedder
embedder = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model="text-embedding-3-small")

In [7]:

def save_to_neo4j():
    print("\n--- Loading Graph into Neo4j ---")

    # Load parquet files
    entities_path = os.path.join(OUTPUT_DIR, "entities.parquet")
    rels_path = os.path.join(OUTPUT_DIR, "relationships.parquet")

    if not os.path.exists(entities_path) or not os.path.exists(rels_path):
        print("Error: Graph files not found in /output. Run the indexing pipeline first.")
        return

    entities_df = pd.read_parquet(entities_path).dropna(subset=["title"])
    rels_df = pd.read_parquet(rels_path).dropna(subset=["source", "target"])

    # --- Normalize titles for matching ---
    def normalize_text(s):
        if pd.isna(s):
            return None
        return str(s).strip().upper()  # remove spaces, uppercase

    entities_df['title'] = entities_df['title'].apply(normalize_text)
    rels_df['source'] = rels_df['source'].apply(normalize_text)
    rels_df['target'] = rels_df['target'].apply(normalize_text)

    # --- Convert array fields to list for Neo4j ---
    for col in ['text_unit_ids']:
        if col in rels_df.columns:
            rels_df[col] = rels_df[col].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

    entities_records = entities_df.astype(object).where(pd.notna(entities_df), None).to_dict("records")
    rels_records = rels_df.astype(object).where(pd.notna(rels_df), None).to_dict("records")

    with driver.session() as session:
        # Create unique constraint on title
        session.run("CREATE CONSTRAINT entity_title IF NOT EXISTS FOR (e:Entity) REQUIRE e.title IS UNIQUE")

        # Ingest entities
        print(f"Ingesting {len(entities_records)} entities...")
        session.run(
            """
            UNWIND $rows AS row
            MERGE (e:Entity {title: row.title})
            SET e += apoc.map.clean(row, ['title'], [])
            """,
            rows=entities_records
        )

        # Ingest relationships using title
        print(f"Ingesting {len(rels_records)} relationships...")
        failed_rels = []
        for row in rels_records:
            try:
                result = session.run(
                    """
                    MATCH (s:Entity {title: $source})
                    MATCH (t:Entity {title: $target})
                    MERGE (s)-[r:RELATIONSHIP {id: $id}]->(t)
                    SET r += apoc.map.clean($props, ['id', 'source', 'target'], [])
                    RETURN r
                    """,
                    {"source": row["source"], "target": row["target"], "id": row["id"], "props": row}
                )
                if result.single() is None:
                    failed_rels.append(row)
            except Exception as e:
                failed_rels.append({"row": row, "error": str(e)})

        if failed_rels:
            print(f"Warning: {len(failed_rels)} relationships failed to insert:")
            for f in failed_rels[:5]:
                print(f)
        else:
            print("--- All relationships ingested successfully ---")





In [8]:

# --- Save text units ---
def save_text_units(parquet_file="output/text_units.parquet"):
    df = pd.read_parquet(parquet_file)
    with driver.session() as session:
        for _, row in df.iterrows():
            embedding = embedder.embed_query(row["text"]) if embedder else []
            session.run(
                """
                MERGE (t:TextUnit {id: $id})
                SET t.text = $text, t.embedding = $embedding
                """,
                {"id": row["id"], "text": row["text"], "embedding": embedding}
            )
    print(f"--- Ingested {len(df)} text units into Neo4j ---")


In [9]:

# --- Create vector index ---
def create_vector_index(index_name="text_units"):
    with driver.session() as session:
        session.run(f"""
            CREATE VECTOR INDEX {index_name} IF NOT EXISTS
            FOR (t:TextUnit) ON (t.embedding)
        """)
    print(f"--- Vector index '{index_name}' created (or already exists) ---")



In [10]:

def run_query_debug(query="Whats the age of elizabeth called"):
    retriever = VectorRetriever(driver, INDEX_NAME, embedder)
    llm = OpenAILLM(model_name="gpt-4o-mini", api_key=OPENAI_API_KEY)
    rag = GraphRAG(retriever=retriever, llm=llm)

    # Perform RAG search
    response = rag.search(query_text=query, retriever_config={"top_k": 5})

    # LLM answer
    print("\n--- LLM-Generated Answer ---")
    print(response.answer)

    # Print the context nodes used by the LLM
    if hasattr(response, "source_nodes") and response.source_nodes:
        print("\n--- Context Passed to LLM ---")
        for i, node in enumerate(response.source_nodes, 1):
            # each node is a dictionary containing at least 'text'
            text = node.get("text") if isinstance(node, dict) else str(node)
            print(f"{i}. {text}\n")



In [11]:
save_to_neo4j()
save_text_units("output/text_units.parquet")
create_vector_index(INDEX_NAME)
run_query_debug()
driver.close()


--- Loading Graph into Neo4j ---
Ingesting 326 entities...
Ingesting 350 relationships...
--- All relationships ingested successfully ---
--- Ingested 27 text units into Neo4j ---
--- Vector index 'text_units' created (or already exists) ---

--- LLM-Generated Answer ---
The age of Elizabeth I is referred to as the "Elizabethan era."


In [15]:
# directly show the graph resulting from the given Cypher query
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t"

def showGraph(cypher: str = default_cypher):
    # create a neo4j session to run queries
    driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))
    session = driver.session()
    widget = GraphWidget(graph = session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    # display(widget)
    return widget

showGraph()

GraphWidget(layout=Layout(height='800px', width='100%'))

In [12]:
import requests
from datetime import datetime, timezone, timedelta


date = (datetime.now(timezone.utc) - timedelta(days=1)).date()

url = f"https://api.openai.com/v1/usage?date={date}"

headers = {
    "Authorization": f"Bearer {OPENAI_API_KEY}"
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    usage_data = response.json()
    print(f"API Usage for {date}:")
    print(usage_data)
else:
    print("Error:", response.status_code, response.text)

API Usage for 2025-08-23:
{'object': 'list', 'data': [{'organization_id': 'org-XOIaTvOT7o7SoUAZ0MxFItSW', 'organization_name': 'THE NYCG', 'aggregation_timestamp': 1755946800, 'n_requests': 1, 'operation': 'completion', 'snapshot_id': 'gpt-4o-mini-2024-07-18', 'n_context_tokens_total': 470, 'n_generated_tokens_total': 115, 'email': None, 'api_key_id': None, 'api_key_name': None, 'api_key_redacted': None, 'api_key_type': None, 'project_id': None, 'project_name': None, 'request_type': '', 'n_cached_context_tokens_total': 0, 'n_context_audio_tokens_total': 0, 'n_generated_audio_tokens_total': 0}], 'ft_data': [], 'dalle_api_data': [], 'whisper_api_data': [], 'tts_api_data': [], 'assistant_code_interpreter_data': [], 'retrieval_storage_data': []}
