In [1]:
#env setup
import getpass
import os
from dotenv import load_dotenv

#get env setup
load_dotenv('nb.env', override=True)

if not os.environ.get('NEO4J_URI'):
    os.environ['NEO4J_URI'] = getpass.getpass('NEO4J_URI:\n')
if not os.environ.get('NEO4J_USERNAME'):
    os.environ['NEO4J_USERNAME'] = getpass.getpass('NEO4J_USERNAME:\n')
if not os.environ.get('NEO4J_PASSWORD'):
    os.environ['NEO4J_PASSWORD'] = getpass.getpass('NEO4J_PASSWORD:\n')

NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

In [2]:
#read pdfs into text with unique id
from person import get_short_id
from pypdf import PdfReader


#read pdfs into text with unique id
def read_resumes_from_directory(directory="resume-pdfs"):
    """
    Read all PDFs from a directory and return a list of text strings
    """
    resumes = []

    # Check if directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return resumes

    # Get all PDF files from the directory
    pdf_files = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]

    if not pdf_files:
        print(f"No PDF files found in '{directory}'.")
        return resumes

    # Process each PDF file
    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory, pdf_file)

        try:
            # Extract text from PDF
            text = ""
            # IMPORTANT Create a unique id for each person.
            # In this case we can do it by file name but may vary by use case
            # It is important to think about how things are identified in entity extraction
            # so that they get properly resolved in the graph
            # Usually you do not want to use names.
            person_id = get_short_id(pdf_file)
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text()

            resumes.append({'id': person_id, 'text': text})
            print(f"Processed: {pdf_file} ({len(text)} characters)")
        except Exception as e:
            print(f"Error with {pdf_file}: {str(e)}")

    print(f"Total resumes loaded: {len(resumes)}")
    return resumes

resumes = read_resumes_from_directory()
resumes[0]

Processed: resume_9_amanda_foster.pdf (1878 characters)
Processed: resume_24_fatima_al_zahra.pdf (1800 characters)
Processed: resume_29_kai_wong.pdf (1962 characters)
Processed: resume_17_miguel_santos.pdf (1864 characters)
Processed: resume_14_emily_chen.pdf (1849 characters)
Processed: resume_30_omar_ibrahim.pdf (1923 characters)
Processed: resume_1_sarah_chen.pdf (1485 characters)
Processed: resume_15_ahmed_hassan.pdf (1695 characters)
Processed: resume_8_monica_garcia.pdf (1702 characters)
Processed: resume_7_robert_johnson.pdf (1682 characters)
Processed: resume_25_yuki_matsuda.pdf (1748 characters)
Processed: resume_28_isabella_rossi.pdf (1865 characters)
Processed: resume_5_david_kim.pdf (1548 characters)
Processed: resume_13_kenji_tanaka.pdf (1561 characters)
Processed: resume_3_jennifer_park.pdf (1575 characters)
Processed: resume_4_alex_thompson.pdf (1439 characters)
Processed: resume_10_james_mitchell.pdf (2065 characters)
Processed: resume_26_elena_popov.pdf (1771 character

{'id': 'UhZn6uYW',
 'text': 'Dr. Amanda Foster\nResearch Scientist\nEmail: amanda.foster@email.com\nLocation: Cambridge, MA\nExperience: 8 years\nProfessional Summary\nAI research scientist with 8 years experience in computer vision and natural language processing. PhD\nin Computer Science with focus on deep learning applications.\nProfessional Experience\nPrincipal Research Scientist | AI Research Institute | 2021 - Present\n- Published 15 peer-reviewed papers on computer vision and NLP, cited 500+ times in academic\nliterature\n- Built state-of-the-art image recognition model achieving top-3 accuracy on ImageNet benchmark\n- Led research team of 6 PhD students and postdocs investigating multimodal AI systems\nResearch Scientist | Tech Giant Research Lab | 2018 - 2021\n- Developed novel neural network architecture for language understanding, open-sourced for research\ncommunity\n- Shipped research prototype to production, serving millions of users with real-time image analysis\n- Coll

In [3]:
import pandas as pd
from tqdm import tqdm
from typing import List, Dict
#add embeddings
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model='text-embedding-ada-002')


def chunks(xs, n=10):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]

def batch_embed_resumes(resumes:List[Dict[str,str]], embedding_model, chunk_size=10) -> List[Dict[str,str]]:
    df = pd.DataFrame(resumes)
    embeddings = []
    # Use tqdm to show progress during embedding generation
    for chunk in tqdm(chunks(df['text'], n=chunk_size), desc="Processing embedding chunks"):
        # Generate embeddings for each chunk and extend the embeddings list
        embeddings.extend(embedding_model.embed_documents(chunk))

    # combine and output
    df['embedding'] = embeddings

    #print("[Embedding] Process completed successfully.")
    return df.to_dict('records')

resumes_with_embeddings = batch_embed_resumes(resumes, embedding_model)
resumes_with_embeddings[0]

Processing embedding chunks: 100%|██████████| 3/3 [00:00<00:00,  3.27it/s]


{'id': 'UhZn6uYW',
 'text': 'Dr. Amanda Foster\nResearch Scientist\nEmail: amanda.foster@email.com\nLocation: Cambridge, MA\nExperience: 8 years\nProfessional Summary\nAI research scientist with 8 years experience in computer vision and natural language processing. PhD\nin Computer Science with focus on deep learning applications.\nProfessional Experience\nPrincipal Research Scientist | AI Research Institute | 2021 - Present\n- Published 15 peer-reviewed papers on computer vision and NLP, cited 500+ times in academic\nliterature\n- Built state-of-the-art image recognition model achieving top-3 accuracy on ImageNet benchmark\n- Led research team of 6 PhD students and postdocs investigating multimodal AI systems\nResearch Scientist | Tech Giant Research Lab | 2018 - 2021\n- Developed novel neural network architecture for language understanding, open-sourced for research\ncommunity\n- Shipped research prototype to production, serving millions of users with real-time image analysis\n- Coll

In [4]:
from neo4j import GraphDatabase

# load into People nodes in Neo4j

#instantiate driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

#test neo4j connection
driver.execute_query("MATCH(n) RETURN count(n)")

EagerResult(records=[<Record count(n)=0>], summary=<neo4j._work.summary.ResultSummary object at 0x11f36c110>, keys=['count(n)'])

In [5]:
from neo4j import RoutingControl

#create uniqueness constraint if not exists
driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

#load
for chunk in chunks(resumes_with_embeddings):
    records = driver.execute_query(
        """
        UNWIND $records AS rec
        MERGE(n:Person {id:rec.id})
        SET n.text = rec.text
        WITH n, rec
        CALL db.create.setNodeVectorProperty(n, 'embedding', rec.embedding)
        RETURN count(rec) AS records_upserted
        """,
        #database_=DATABASE,
        routing_=RoutingControl.WRITE,
        result_transformer_= lambda r: r.data(),
        records = chunk
    )
    print(records)

# vector index
driver.execute_query('''
CREATE VECTOR INDEX text_embeddings IF NOT EXISTS FOR (n:Person) ON (n.embedding)
OPTIONS {indexConfig: {
 `vector.dimensions`: toInteger($dimension),
 `vector.similarity_function`: 'cosine'
}}
''', dimension=len(resumes_with_embeddings[0]["embedding"]))

# wait for index to come online
driver.execute_query('CALL db.awaitIndex("text_embeddings", 300)')

[{'records_upserted': 10}]
[{'records_upserted': 10}]
[{'records_upserted': 10}]


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x11f38ba10>, keys=[])

In [6]:
from typing import Any
from langchain_neo4j import Neo4jVector

# build langgraph agent with vector search

#define tool
vector_store = Neo4jVector.from_existing_graph(
    embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="text_embeddings",
    node_label="Person",
    text_node_properties=["text"],
    embedding_node_property="embedding",
)

def search_documents(search_prompt:str) -> List[Dict[str, Any]]:
    """
    Retrieval knowledge by searching people resumes
    """
    try:
        results = vector_store.similarity_search(search_prompt, k=5)
        return results
    except Exception as e:
        return [{"error":str(e)}]
#test
search_documents("AI Research")[:2]

[Document(metadata={}, page_content='\ntext: Isabella Rossi\nAI Ethics Researcher\nEmail: isabella.rossi@email.com\nLocation: Rome, Italy\nExperience: 4 years\nProfessional Summary\nAI ethics researcher with 4 years experience studying fairness and bias in machine learning systems.\nExpert in Python analysis, research methodology, and AI governance.\nProfessional Experience\nSenior AI Ethics Researcher | European AI Institute | 2022 - Present\n- Led research on algorithmic bias detection using Python and statistical analysis across 50+ AI\nsystems\n- Published 8 papers on AI fairness and ethics at top-tier AI conferences (FAccT, AIES, ICML)\n- Built open-source bias evaluation toolkit adopted by 100+ AI research teams globally\nAI Ethics Researcher | Tech Ethics Lab | 2021 - 2022\n- Developed fairness metrics for computer vision models using Python and machine learning evaluation\nframeworks\n- Conducted algorithmic audits for government AI procurement identifying bias in 30% of system

In [7]:
# build adk agent with neo4j mcp
from google.adk.models.lite_llm import LiteLlm
from google.adk.agents import Agent
from google.adk.runners import InMemoryRunner
from google.genai.types import Part, UserContent

database_agent = Agent(
    name="graph_database_agent",
    # model="gemini-2.0-flash-exp",
    model=LiteLlm(model="openai/gpt-4.1"),
    # model=LiteLlm(model="anthropic/claude-sonnet-4-20250514"),
    description="""
    Agent to access knowledge graph stored in graph database
    """,
    instruction="""You are an assistant with access to people data for the new AI tech startup SkyNet.""",
    tools=[search_documents]
)

APP_NAME = 'Database Agent'
USER_ID = 'Zach Blumenfeld'


runner = InMemoryRunner(app_name=APP_NAME, agent=database_agent)

session = await runner.session_service.create_session( app_name=runner.app_name, user_id=USER_ID)

async def run_prompt(new_message: str):
  content = UserContent(parts=[Part(text=new_message)])
  result = None
  async for event in runner.run_async(user_id=session.user_id, session_id=session.id, new_message=content):
    for part in event.content.parts:
      print(part.text, part.function_call, part.function_response)
      if part.text:
        result = part.text
  return result

In [8]:
from IPython.display import Markdown, display

res = await run_prompt("Who is a good Python developer for my next AI chatbot project?")
print("\n\n\n\nFinal Response:")
display(Markdown(res))

None id='call_NzWFpAyoL0N48iYzFfs1EwUK' args={'search_prompt': 'Python developer with experience in AI and chatbot projects'} name='search_documents' None
None None will_continue=None scheduling=None id='call_NzWFpAyoL0N48iYzFfs1EwUK' name='search_documents' response={'result': [Document(metadata={}, page_content='\ntext: Lucas Martinez\nFull-Stack AI Engineer\nEmail: lucas.martinez@email.com\nLocation: Barcelona, Spain\nExperience: 4 years\nProfessional Summary\nFull-stack engineer with 4 years experience building end-to-end AI applications. Expert in Python,\nJavaScript, and integrating machine learning with web applications.\nProfessional Experience\nSenior Full-Stack AI Engineer | EdTech Startup | 2022 - Present\n- Built personalized learning platform using Python Flask backend and React frontend\n- Developed AI tutoring system using natural language processing and Python machine learning\n- Shipped adaptive learning algorithm increasing student performance by 50%\nAI Software Engi

Here are some highly qualified Python developers at SkyNet with strong experience in AI and chatbot or conversational systems:

1. Aisha Patel (NLP Research Scientist, Bangalore)
   - 7 years in NLP, Python, and multilingual conversational AI.
   - Built large-scale conversational AI systems using Python and transformer architectures.
   - Led teams developing language models and published extensively at top AI conferences.

2. Kenji Tanaka (AI Research Engineer, Tokyo)
   - 4 years specializing in Python for NLP and chatbot systems.
   - Built conversational AI with transformer models; production experience handling millions of queries daily.
   - Strong background bridging research and product, with a focus on natural language systems.

3. Lucas Martinez (Full-Stack AI Engineer, Barcelona)
   - 4 years in Python-based AI application development.
   - Developed AI tutoring chatbots and adaptive learning platforms using Flask and machine learning.
   - Strong mix of backend, web, and AI skills.

Any of these candidates would be excellent for a Python-based AI chatbot project. If you need recommendations tailored to specific chatbot requirements (multilingual, research focus, production readiness, etc.), let me know!

In [9]:
res = await run_prompt("How many Python developers do I have?")
print("\n\n\n\nFinal Response:")
display(Markdown(res))

None id='call_UeOYRUy4IZLkdKPI4TuZluVR' args={'search_prompt': 'Python developer'} name='search_documents' None
None None will_continue=None scheduling=None id='call_UeOYRUy4IZLkdKPI4TuZluVR' name='search_documents' response={'result': [Document(metadata={}, page_content='\ntext: Lucas Martinez\nFull-Stack AI Engineer\nEmail: lucas.martinez@email.com\nLocation: Barcelona, Spain\nExperience: 4 years\nProfessional Summary\nFull-stack engineer with 4 years experience building end-to-end AI applications. Expert in Python,\nJavaScript, and integrating machine learning with web applications.\nProfessional Experience\nSenior Full-Stack AI Engineer | EdTech Startup | 2022 - Present\n- Built personalized learning platform using Python Flask backend and React frontend\n- Developed AI tutoring system using natural language processing and Python machine learning\n- Shipped adaptive learning algorithm increasing student performance by 50%\nAI Software Engineer | Healthcare Tech | 2021 - 2022\n- Bui

You have at least 5 Python developers at SkyNet:

1. Lucas Martinez – Full-Stack AI Engineer
2. Miguel Santos – Backend Engineering Manager
3. Kai Wong – Database Performance Engineer
4. Yuki Matsuda – Data Platform Engineer
5. Aisha Patel – NLP Research Scientist

Let me know if you’d like details on their specific skills or experience!