# Basic Vector Search and Agents

In [2]:
#env setup
import getpass
import os
from dotenv import load_dotenv

#get env setup
load_dotenv('nb.env', override=True)

if not os.environ.get('NEO4J_URI'):
    os.environ['NEO4J_URI'] = getpass.getpass('NEO4J_URI:\n')
if not os.environ.get('NEO4J_USERNAME'):
    os.environ['NEO4J_USERNAME'] = getpass.getpass('NEO4J_USERNAME:\n')
if not os.environ.get('NEO4J_PASSWORD'):
    os.environ['NEO4J_PASSWORD'] = getpass.getpass('NEO4J_PASSWORD:\n')

NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

In [4]:
#read pdfs into text with unique id
from person import get_short_id
from pypdf import PdfReader


#read pdfs into text with unique id
def read_resumes_from_directory(directory="resume-pdfs"):
    """
    Read all PDFs from a directory and return a list of text strings
    """
    resumes = []

    # Check if directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return resumes

    # Get all PDF files from the directory
    pdf_files = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]

    if not pdf_files:
        print(f"No PDF files found in '{directory}'.")
        return resumes

    # Process each PDF file
    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory, pdf_file)

        try:
            # Extract text from PDF
            text = ""
            # IMPORTANT Create a unique id for each person.
            # In this case we can do it by file name but may vary by use case
            # It is important to think about how things are identified in entity extraction
            # so that they get properly resolved in the graph
            # Usually you do not want to use names.
            person_id = get_short_id(pdf_file)
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text()

            resumes.append({'id': person_id, 'text': text})
            print(f"Processed: {pdf_file} ({len(text)} characters)")
        except Exception as e:
            print(f"Error with {pdf_file}: {str(e)}")

    print(f"Total resumes loaded: {len(resumes)}")
    return resumes

resumes = read_resumes_from_directory()
resumes[0]

Processed: resume_9_amanda_foster.pdf (1878 characters)
Processed: resume_24_fatima_al_zahra.pdf (1800 characters)
Processed: resume_29_kai_wong.pdf (1962 characters)
Processed: resume_17_miguel_santos.pdf (1864 characters)
Processed: resume_14_emily_chen.pdf (1849 characters)
Processed: resume_30_omar_ibrahim.pdf (1923 characters)
Processed: resume_1_sarah_chen.pdf (1485 characters)
Processed: resume_15_ahmed_hassan.pdf (1695 characters)
Processed: resume_8_monica_garcia.pdf (1702 characters)
Processed: resume_7_robert_johnson.pdf (1682 characters)
Processed: resume_25_yuki_matsuda.pdf (1748 characters)
Processed: resume_28_isabella_rossi.pdf (1865 characters)
Processed: resume_5_david_kim.pdf (1548 characters)
Processed: resume_13_kenji_tanaka.pdf (1561 characters)
Processed: resume_3_jennifer_park.pdf (1575 characters)
Processed: resume_4_alex_thompson.pdf (1439 characters)
Processed: resume_10_james_mitchell.pdf (2065 characters)
Processed: resume_26_elena_popov.pdf (1771 character

{'id': 'UhZn6uYW',
 'text': 'Dr. Amanda Foster\nResearch Scientist\nEmail: amanda.foster@email.com\nLocation: Cambridge, MA\nExperience: 8 years\nProfessional Summary\nAI research scientist with 8 years experience in computer vision and natural language processing. PhD\nin Computer Science with focus on deep learning applications.\nProfessional Experience\nPrincipal Research Scientist | AI Research Institute | 2021 - Present\n- Published 15 peer-reviewed papers on computer vision and NLP, cited 500+ times in academic\nliterature\n- Built state-of-the-art image recognition model achieving top-3 accuracy on ImageNet benchmark\n- Led research team of 6 PhD students and postdocs investigating multimodal AI systems\nResearch Scientist | Tech Giant Research Lab | 2018 - 2021\n- Developed novel neural network architecture for language understanding, open-sourced for research\ncommunity\n- Shipped research prototype to production, serving millions of users with real-time image analysis\n- Coll

In [5]:
import pandas as pd
from tqdm import tqdm
from typing import List, Dict
#add embeddings
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model='text-embedding-ada-002')


def chunks(xs, n=10):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]

def batch_embed_resumes(resumes:List[Dict[str,str]], embedding_model, chunk_size=10) -> List[Dict[str,str]]:
    df = pd.DataFrame(resumes)
    embeddings = []
    # Use tqdm to show progress during embedding generation
    for chunk in tqdm(chunks(df['text'], n=chunk_size), desc="Processing embedding chunks"):
        # Generate embeddings for each chunk and extend the embeddings list
        embeddings.extend(embedding_model.embed_documents(chunk))

    # combine and output
    df['embedding'] = embeddings

    #print("[Embedding] Process completed successfully.")
    return df.to_dict('records')

resumes_with_embeddings = batch_embed_resumes(resumes, embedding_model)
resumes_with_embeddings[0]

Processing embedding chunks: 100%|██████████| 3/3 [00:01<00:00,  1.86it/s]


{'id': 'UhZn6uYW',
 'text': 'Dr. Amanda Foster\nResearch Scientist\nEmail: amanda.foster@email.com\nLocation: Cambridge, MA\nExperience: 8 years\nProfessional Summary\nAI research scientist with 8 years experience in computer vision and natural language processing. PhD\nin Computer Science with focus on deep learning applications.\nProfessional Experience\nPrincipal Research Scientist | AI Research Institute | 2021 - Present\n- Published 15 peer-reviewed papers on computer vision and NLP, cited 500+ times in academic\nliterature\n- Built state-of-the-art image recognition model achieving top-3 accuracy on ImageNet benchmark\n- Led research team of 6 PhD students and postdocs investigating multimodal AI systems\nResearch Scientist | Tech Giant Research Lab | 2018 - 2021\n- Developed novel neural network architecture for language understanding, open-sourced for research\ncommunity\n- Shipped research prototype to production, serving millions of users with real-time image analysis\n- Coll

In [6]:
from neo4j import GraphDatabase

# load into People nodes in Neo4j

#instantiate driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

#test neo4j connection
driver.execute_query("MATCH(n) RETURN count(n)")

EagerResult(records=[<Record count(n)=342>], summary=<neo4j._work.summary.ResultSummary object at 0x11c102a90>, keys=['count(n)'])

In [7]:
from neo4j import RoutingControl

#create uniqueness constraint if not exists
driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

#load
for chunk in chunks(resumes_with_embeddings):
    records = driver.execute_query(
        """
        UNWIND $records AS rec
        MERGE(n:Person {id:rec.id})
        SET n.text = rec.text
        WITH n, rec
        CALL db.create.setNodeVectorProperty(n, 'embedding', rec.embedding)
        RETURN count(rec) AS records_upserted
        """,
        #database_=DATABASE,
        routing_=RoutingControl.WRITE,
        result_transformer_= lambda r: r.data(),
        records = chunk
    )
    print(records)

# vector index
driver.execute_query('''
CREATE VECTOR INDEX text_embeddings IF NOT EXISTS FOR (n:Person) ON (n.embedding)
OPTIONS {indexConfig: {
 `vector.dimensions`: toInteger($dimension),
 `vector.similarity_function`: 'cosine'
}}
''', dimension=len(resumes_with_embeddings[0]["embedding"]))

# wait for index to come online
driver.execute_query('CALL db.awaitIndex("text_embeddings", 300)')

[{'records_upserted': 10}]
[{'records_upserted': 10}]
[{'records_upserted': 10}]


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x11c12fb50>, keys=[])

In [8]:
from typing import Any
from langchain_neo4j import Neo4jVector

# build langgraph agent with vector search

#define tool
vector_store = Neo4jVector.from_existing_graph(
    embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="text_embeddings",
    node_label="Person",
    text_node_properties=["text"],
    embedding_node_property="embedding",
)

def search_documents(search_prompt:str) -> List[Dict[str, Any]]:
    """
    Retrieval knowledge by searching people resumes
    """
    try:
        results = vector_store.similarity_search(search_prompt, k=5)
        return results
    except Exception as e:
        return [{"error":str(e)}]
#test
search_documents("AI Research")[:2]

[Document(metadata={'current_title': 'Senior AI Ethics Researcher', 'level': 'Senior', 'location': 'Rome, Italy', 'email': 'isabella.rossi@email.com', 'department': 'Data Science', 'name': 'Isabella Rossi', 'years_experience': 4}, page_content='\ntext: Isabella Rossi\nAI Ethics Researcher\nEmail: isabella.rossi@email.com\nLocation: Rome, Italy\nExperience: 4 years\nProfessional Summary\nAI ethics researcher with 4 years experience studying fairness and bias in machine learning systems.\nExpert in Python analysis, research methodology, and AI governance.\nProfessional Experience\nSenior AI Ethics Researcher | European AI Institute | 2022 - Present\n- Led research on algorithmic bias detection using Python and statistical analysis across 50+ AI\nsystems\n- Published 8 papers on AI fairness and ethics at top-tier AI conferences (FAccT, AIES, ICML)\n- Built open-source bias evaluation toolkit adopted by 100+ AI research teams globally\nAI Ethics Researcher | Tech Ethics Lab | 2021 - 2022\n

In [9]:
# build adk agent with neo4j mcp
from google.adk.models.lite_llm import LiteLlm
from google.adk.agents import Agent
from google.adk.runners import InMemoryRunner
from google.genai.types import Part, UserContent

database_agent = Agent(
    name="graph_database_agent",
    # model="gemini-2.0-flash-exp",
    model=LiteLlm(model="openai/gpt-4.1"),
    # model=LiteLlm(model="anthropic/claude-sonnet-4-20250514"),
    description="""
    Agent to access knowledge graph stored in graph database
    """,
    instruction="""You are an assistant with access to people data for the new AI tech startup SkyNet.""",
    tools=[search_documents]
)

APP_NAME = 'Database Agent'
USER_ID = 'Zach Blumenfeld'


runner = InMemoryRunner(app_name=APP_NAME, agent=database_agent)

session = await runner.session_service.create_session( app_name=runner.app_name, user_id=USER_ID)

async def run_prompt(new_message: str):
  content = UserContent(parts=[Part(text=new_message)])
  result = None
  async for event in runner.run_async(user_id=session.user_id, session_id=session.id, new_message=content):
    for part in event.content.parts:
      print(part.text, part.function_call, part.function_response)
      if part.text:
        result = part.text
  return result

In [10]:
from IPython.display import Markdown, display

res = await run_prompt("Who is a good Python developer for my next AI chatbot project?")
print("\n\n\n\nFinal Response:")
display(Markdown(res))

None id='call_mKTHCBnwjLvje0fWiXTGLezU' args={'search_prompt': 'Python developer with experience in AI chatbot'} name='search_documents' None
None None will_continue=None scheduling=None id='call_mKTHCBnwjLvje0fWiXTGLezU' name='search_documents' response={'result': [Document(metadata={'current_title': 'Full-Stack AI Engineer', 'level': 'Senior', 'location': 'Barcelona, Spain', 'email': 'lucas.martinez@email.com', 'department': 'Engineering', 'name': 'Lucas Martinez', 'years_experience': 4}, page_content='\ntext: Lucas Martinez\nFull-Stack AI Engineer\nEmail: lucas.martinez@email.com\nLocation: Barcelona, Spain\nExperience: 4 years\nProfessional Summary\nFull-stack engineer with 4 years experience building end-to-end AI applications. Expert in Python,\nJavaScript, and integrating machine learning with web applications.\nProfessional Experience\nSenior Full-Stack AI Engineer | EdTech Startup | 2022 - Present\n- Built personalized learning platform using Python Flask backend and React fro

Here are some excellent Python developers at SkyNet who would be great candidates for your next AI chatbot project:

1. Aisha Patel – Principal NLP Research Scientist (Bangalore, India)
   - 7 years of Python & NLP experience
   - Built conversational AI systems using Python, PyTorch, and transformer architectures
   - Led multilingual chatbot initiatives and published 12+ papers in the field
   - Email: aisha.patel@email.com

2. Kenji Tanaka – AI Research Engineer (Tokyo, Japan)
   - 4 years experience specializing in Python-based machine learning and NLP
   - Developed and deployed large-scale conversational AI systems using Python, PyTorch, and transformers
   - Experience bridging academic research with production systems
   - Email: kenji.tanaka@email.com

3. Lucas Martinez – Full-Stack AI Engineer (Barcelona, Spain)
   - 4 years of full-stack engineering, expert in Python, NLP, and integrating ML with web apps
   - Built an AI tutoring chatbot with Python backend and NLP
   - Email: lucas.martinez@email.com

Each of these developers has strong Python skills and direct experience with AI and chatbots. If you'd like more information on any of them, let me know!

In [11]:
res = await run_prompt("How many Python developers do I have?")
print("\n\n\n\nFinal Response:")
display(Markdown(res))

None id='call_zQYdVm1tICXFWtkERrG11cfr' args={'search_prompt': 'Python developer'} name='search_documents' None
None None will_continue=None scheduling=None id='call_zQYdVm1tICXFWtkERrG11cfr' name='search_documents' response={'result': [Document(metadata={'current_title': 'Full-Stack AI Engineer', 'level': 'Senior', 'location': 'Barcelona, Spain', 'email': 'lucas.martinez@email.com', 'department': 'Engineering', 'name': 'Lucas Martinez', 'years_experience': 4}, page_content='\ntext: Lucas Martinez\nFull-Stack AI Engineer\nEmail: lucas.martinez@email.com\nLocation: Barcelona, Spain\nExperience: 4 years\nProfessional Summary\nFull-stack engineer with 4 years experience building end-to-end AI applications. Expert in Python,\nJavaScript, and integrating machine learning with web applications.\nProfessional Experience\nSenior Full-Stack AI Engineer | EdTech Startup | 2022 - Present\n- Built personalized learning platform using Python Flask backend and React frontend\n- Developed AI tutoring

You currently have at least five Python developers at SkyNet, including:

1. Lucas Martinez – Full-Stack AI Engineer
2. Miguel Santos – Backend Engineering Manager
3. Kai Wong – Database Performance Engineer
4. Yuki Matsuda – Data Platform Engineer
5. Aisha Patel – NLP Research Scientist

Let me know if you want details or a full list of their profiles and skill sets!

In [12]:
res = await run_prompt("Who should be on our new AI tiger team where we will use Google ADK and Langchain to make a chatbot? What Are the Skill Gaps?")
print("\n\n\n\nFinal Response:")
display(Markdown(res))

None id='call_H0mYTGAfMinqMFMkn0AZhUVu' args={'search_prompt': 'experience with Google ADK'} name='search_documents' None
None id='call_qR4ECDO6ezq0C6wQ9bKuWPm4' args={'search_prompt': 'experience with Langchain'} name='search_documents' None
None None will_continue=None scheduling=None id='call_H0mYTGAfMinqMFMkn0AZhUVu' name='search_documents' response={'result': [Document(metadata={'current_title': 'Mobile Platform Architect', 'level': 'Senior', 'location': 'Toronto, ON', 'email': 'ahmed.hassan@email.com', 'department': 'Engineering', 'name': 'Ahmed Hassan', 'years_experience': 9}, page_content='\ntext: Ahmed Hassan\nMobile Platform Architect\nEmail: ahmed.hassan@email.com\nLocation: Toronto, ON\nExperience: 9 years\nProfessional Summary\nMobile platform architect with 9 years experience building cross-platform mobile applications and\nbackend systems. Expert in iOS, Android, and mobile infrastructure.\nProfessional Experience\nMobile Platform Architect | Banking Corp | 2020 - Presen

Based on available expertise, here’s who should be on your AI tiger team to build a chatbot with Google ADK and Langchain:

Core Recommended Members:
- Kenji Tanaka (AI Research Engineer): Extensive experience building conversational AI systems in Python; strong NLP and ML background. (No explicit Langchain/Google ADK but strong relevant domain.)
- Aisha Patel (Principal NLP Research Scientist): Led multilingual conversational AI and NLP projects; expert in Python, PyTorch, and transformers.
- Ahmed Hassan (Mobile Platform Architect): Senior-level experience integrating Python with Android; skilled in backend and mobile systems (Google ADK/Android expertise).
- Yuki Matsuda (Data Platform Engineer): Strong Python developer with real-time, cloud, and data pipeline expertise (will help if chatbot requires backend/data infra).
- David Kim (DevOps Platform Engineer): Significant experience in infrastructure, automations, and keeping production AI systems reliable (helpful for deploying chatbot at scale).

Skill Gaps:
- Langchain: No direct mention of Langchain proficiency among current staff, so you may need external expertise or upskilling in this Python framework for LLM-based chatbot orchestration.
- Google ADK: Ahmed Hassan brings mobile/Android/Google-related background, but specific experience with the latest Google ADK for conversational AI is not directly indicated. Some upskilling/training may be needed unless a more specialized ADK professional can be brought in.
- No explicit mention of UX/UI conversational design or prompt engineering for LLMs.
- If you require voice integration or rich media chatbot features, you may need additional frontend or media processing talent.

Summary:
You have top-tier AI, NLP, Python, and backend/cloud engineering, but will likely need to close gaps on direct Langchain expertise, hands-on experience with Google ADK for bots, and possibly conversational UI/UX. Consider professional development or temporary external hires for these areas.

In [13]:
res = await run_prompt("Who shares the most similar types of accomplishments to our PhDs but isn't themselves a PhD?")
print("\n\n\n\nFinal Response:")
display(Markdown(res))

None id='call_oyML7UaVaBkFZ4Py8zb415a1' args={'search_prompt': 'accomplishments similar to PhD researchers but without a PhD'} name='search_documents' None
None None will_continue=None scheduling=None id='call_oyML7UaVaBkFZ4Py8zb415a1' name='search_documents' response={'result': [Document(metadata={'current_title': 'Principal Research Scientist', 'level': 'Principal', 'location': 'Cambridge, MA', 'email': 'amanda.foster@email.com', 'department': 'Data Science', 'name': 'Dr. Amanda Foster', 'years_experience': 8}, page_content='\ntext: Dr. Amanda Foster\nResearch Scientist\nEmail: amanda.foster@email.com\nLocation: Cambridge, MA\nExperience: 8 years\nProfessional Summary\nAI research scientist with 8 years experience in computer vision and natural language processing. PhD\nin Computer Science with focus on deep learning applications.\nProfessional Experience\nPrincipal Research Scientist | AI Research Institute | 2021 - Present\n- Published 15 peer-reviewed papers on computer vision and

Among SkyNet staff, Emily Chen (Data Science Manager) shares the most similar types of accomplishments to your PhDs, despite not holding a PhD herself. Her key qualifications and comparables:

- 8 years of data science and machine learning experience, with responsibility for managing a team and leading AI-powered product development.
- Led the development of advanced AI systems (e.g., fraud detection preventing $50M+ in losses, personalized recommendation engines).
- Published applied research on deep learning at leading analytics conferences, mirroring the academic publishing track record typical for PhDs.
- Recognized with industry awards, such as the "Rising Star Award" for analytical skills and impact.
- Deep technical skills in Python, machine learning, and analytics, and leadership roles aligning with academic research/mentorship experience.

While Emily does not have a doctorate, her real-world innovation, publication record, and team leadership strongly mirror the profile and accomplishments of your leading PhD researchers.