In [31]:
#env setup
import getpass
import os
from dotenv import load_dotenv

#get env setup
load_dotenv('nb.env', override=True)

if not os.environ.get('NEO4J_URI'):
    os.environ['NEO4J_URI'] = getpass.getpass('NEO4J_URI:\n')
if not os.environ.get('NEO4J_USERNAME'):
    os.environ['NEO4J_USERNAME'] = getpass.getpass('NEO4J_USERNAME:\n')
if not os.environ.get('NEO4J_PASSWORD'):
    os.environ['NEO4J_PASSWORD'] = getpass.getpass('NEO4J_PASSWORD:\n')

NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

In [32]:
#read pdfs into text with unique id
from person import get_short_id
from pypdf import PdfReader


#read pdfs into text with unique id
def read_resumes_from_directory(directory="resume-pdfs"):
    """
    Read all PDFs from a directory and return a list of text strings
    """
    resumes = []

    # Check if directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return resumes

    # Get all PDF files from the directory
    pdf_files = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]

    if not pdf_files:
        print(f"No PDF files found in '{directory}'.")
        return resumes

    # Process each PDF file
    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory, pdf_file)

        try:
            # Extract text from PDF
            text = ""
            # IMPORTANT Create a unique id for each person.
            # In this case we can do it by file name but may vary by use case
            # It is important to think about how things are identified in entity extraction
            # so that they get properly resolved in the graph
            # Usually you do not want to use names.
            person_id = get_short_id(pdf_file)
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text()

            resumes.append({'id': person_id, 'text': text})
            print(f"Processed: {pdf_file} ({len(text)} characters)")
        except Exception as e:
            print(f"Error with {pdf_file}: {str(e)}")

    print(f"Total resumes loaded: {len(resumes)}")
    return resumes

resumes = read_resumes_from_directory()
resumes[0]

Processed: resume_9_amanda_foster.pdf (1878 characters)
Processed: resume_1_sarah_chen.pdf (1485 characters)
Processed: resume_8_monica_garcia.pdf (1702 characters)
Processed: resume_7_robert_johnson.pdf (1682 characters)
Processed: resume_5_david_kim.pdf (1548 characters)
Processed: resume_3_jennifer_park.pdf (1575 characters)
Processed: resume_4_alex_thompson.pdf (1439 characters)
Processed: resume_10_james_mitchell.pdf (2065 characters)
Processed: resume_2_marcus_rodriguez.pdf (1342 characters)
Processed: resume_6_lisa_wang.pdf (1509 characters)
Total resumes loaded: 10


{'id': 'UhZn6uYW',
 'text': 'Dr. Amanda Foster\nResearch Scientist\nEmail: amanda.foster@email.com\nLocation: Cambridge, MA\nExperience: 8 years\nProfessional Summary\nAI research scientist with 8 years experience in computer vision and natural language processing. PhD\nin Computer Science with focus on deep learning applications.\nProfessional Experience\nPrincipal Research Scientist | AI Research Institute | 2021 - Present\n- Published 15 peer-reviewed papers on computer vision and NLP, cited 500+ times in academic\nliterature\n- Built state-of-the-art image recognition model achieving top-3 accuracy on ImageNet benchmark\n- Led research team of 6 PhD students and postdocs investigating multimodal AI systems\nResearch Scientist | Tech Giant Research Lab | 2018 - 2021\n- Developed novel neural network architecture for language understanding, open-sourced for research\ncommunity\n- Shipped research prototype to production, serving millions of users with real-time image analysis\n- Coll

In [33]:
import pandas as pd
from tqdm import tqdm
from typing import List, Dict
#add embeddings
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model='text-embedding-ada-002')


def chunks(xs, n=10):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]

def batch_embed_resumes(resumes:List[Dict[str,str]], embedding_model, chunk_size=10) -> List[Dict[str,str]]:
    df = pd.DataFrame(resumes)
    embeddings = []
    # Use tqdm to show progress during embedding generation
    for chunk in tqdm(chunks(df['text'], n=chunk_size), desc="Processing embedding chunks"):
        # Generate embeddings for each chunk and extend the embeddings list
        embeddings.extend(embedding_model.embed_documents(chunk))

    # combine and output
    df['embedding'] = embeddings

    #print("[Embedding] Process completed successfully.")
    return df.to_dict('records')

resumes_with_embeddings = batch_embed_resumes(resumes, embedding_model)
resumes_with_embeddings[0]

Processing embedding chunks: 100%|██████████| 1/1 [00:00<00:00,  1.71it/s]


{'id': 'UhZn6uYW',
 'text': 'Dr. Amanda Foster\nResearch Scientist\nEmail: amanda.foster@email.com\nLocation: Cambridge, MA\nExperience: 8 years\nProfessional Summary\nAI research scientist with 8 years experience in computer vision and natural language processing. PhD\nin Computer Science with focus on deep learning applications.\nProfessional Experience\nPrincipal Research Scientist | AI Research Institute | 2021 - Present\n- Published 15 peer-reviewed papers on computer vision and NLP, cited 500+ times in academic\nliterature\n- Built state-of-the-art image recognition model achieving top-3 accuracy on ImageNet benchmark\n- Led research team of 6 PhD students and postdocs investigating multimodal AI systems\nResearch Scientist | Tech Giant Research Lab | 2018 - 2021\n- Developed novel neural network architecture for language understanding, open-sourced for research\ncommunity\n- Shipped research prototype to production, serving millions of users with real-time image analysis\n- Coll

In [34]:
from neo4j import GraphDatabase

# load into People nodes in Neo4j

#instantiate driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

#test neo4j connection
driver.execute_query("MATCH(n) RETURN count(n)")

EagerResult(records=[<Record count(n)=0>], summary=<neo4j._work.summary.ResultSummary object at 0x123510290>, keys=['count(n)'])

In [35]:
from neo4j import RoutingControl

#create uniqueness constraint if not exists
driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

#load
for chunk in chunks(resumes_with_embeddings):
    records = driver.execute_query(
        """
        UNWIND $records AS rec
        MERGE(n:Person {id:rec.id})
        SET n.text = rec.text
        WITH n, rec
        CALL db.create.setNodeVectorProperty(n, 'embedding', rec.embedding)
        RETURN count(rec) AS records_upserted
        """,
        #database_=DATABASE,
        routing_=RoutingControl.WRITE,
        result_transformer_= lambda r: r.data(),
        records = chunk
    )
    print(records)

# vector index
driver.execute_query('''
CREATE VECTOR INDEX text_embeddings IF NOT EXISTS FOR (n:Person) ON (n.embedding)
OPTIONS {indexConfig: {
 `vector.dimensions`: toInteger($dimension),
 `vector.similarity_function`: 'cosine'
}}
''', dimension=len(resumes_with_embeddings[0]["embedding"]))

# wait for index to come online
driver.execute_query('CALL db.awaitIndex("text_embeddings", 300)')

[{'records_upserted': 10}]


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x123519610>, keys=[])

In [36]:
from typing import Any
from langchain_neo4j import Neo4jVector

# build langgraph agent with vector search

#define tool
vector_store = Neo4jVector.from_existing_graph(
    embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="text_embeddings",
    node_label="Person",
    text_node_properties=["text"],
    embedding_node_property="embedding",
)

def search_documents(search_prompt:str) -> List[Dict[str, Any]]:
    """
    Retrieval knowledge by searching people resumes
    """
    try:
        results = vector_store.similarity_search(search_prompt, k=5)
        return results
    except Exception as e:
        return [{"error":str(e)}]
#test
search_documents("AI Research")[:2]



[Document(metadata={}, page_content='\ntext: Dr. Amanda Foster\nResearch Scientist\nEmail: amanda.foster@email.com\nLocation: Cambridge, MA\nExperience: 8 years\nProfessional Summary\nAI research scientist with 8 years experience in computer vision and natural language processing. PhD\nin Computer Science with focus on deep learning applications.\nProfessional Experience\nPrincipal Research Scientist | AI Research Institute | 2021 - Present\n- Published 15 peer-reviewed papers on computer vision and NLP, cited 500+ times in academic\nliterature\n- Built state-of-the-art image recognition model achieving top-3 accuracy on ImageNet benchmark\n- Led research team of 6 PhD students and postdocs investigating multimodal AI systems\nResearch Scientist | Tech Giant Research Lab | 2018 - 2021\n- Developed novel neural network architecture for language understanding, open-sourced for research\ncommunity\n- Shipped research prototype to production, serving millions of users with real-time image 

In [37]:
from langchain_core.messages import HumanMessage
from langgraph.prebuilt import create_react_agent
#define langgraph agent
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model='gpt-4.1', temperature=0)



agent_executor = create_react_agent(llm,
                                    tools=[search_documents],
                                    prompt = """
                                    You are an assistant with access to people data for the new AI tech startup SkyNet.""")
agent_executor.invoke({"messages": [HumanMessage(content="hi!")]})

{'messages': [HumanMessage(content='hi!', additional_kwargs={}, response_metadata={}, id='7636e671-a610-4e1a-ad16-0203eebee249'),
  AIMessage(content='Hello! How can I help you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 67, 'total_tokens': 77, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-2025-04-14', 'system_fingerprint': 'fp_51e1070cf2', 'id': 'chatcmpl-BdiRod5Zr8ubq5CNrgWiM06StZWpl', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--025a2b80-62db-44f9-8fbe-c8e305bab5c7-0', usage_metadata={'input_tokens': 67, 'output_tokens': 10, 'total_tokens': 77, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})]}

In [38]:
# ask some questions
def ask_to_agent(question):
    for step in agent_executor.stream(
        {"messages": [HumanMessage(content=question)]},
        stream_mode="values",
    ):
        step["messages"][-1].pretty_print()

In [39]:
ask_to_agent("Who is a good Python developer for my next AI chatbot project?")


Who is a good Python developer for my next AI chatbot project?
Tool Calls:
  search_documents (call_O1b1MLbhj0RaXBOU5BtGrW5K)
 Call ID: call_O1b1MLbhj0RaXBOU5BtGrW5K
  Args:
    search_prompt: Python developer with experience in AI chatbot projects
Name: search_documents

[Document(metadata={}, page_content='\ntext: Sarah Chen\nSenior AI Engineer\nEmail: sarah.chen@email.com\nLocation: San Francisco, CA\nExperience: 6 years\nProfessional Summary\nExperienced AI engineer with 6 years building production machine learning systems. Currently leading\nAI initiatives at TechCorp, previously at DataFlow and StartupAI.\nProfessional Experience\nSenior AI Engineer | TechCorp | 2022 - Present\n- Built fraud detection system serving 50M+ daily transactions using Python and TensorFlow, reducing\nfalse positives by 40%\n- Led 8-person AI engineering team developing recommendation engine, increasing user engagement\nby 25%\n- Architected ML infrastructure platform supporting 20+ data science teams 

In [40]:
ask_to_agent("How many Python developers do I have?")


How many Python developers do I have?
Tool Calls:
  search_documents (call_inZXv0fMNSjRZeNigf3ZBxi9)
 Call ID: call_inZXv0fMNSjRZeNigf3ZBxi9
  Args:
    search_prompt: number of employees with Python development experience
Name: search_documents

[Document(metadata={}, page_content='\ntext: Jennifer Park\nData Engineering Manager\nEmail: jennifer.park@email.com\nLocation: Seattle, WA\nExperience: 10 years\nProfessional Summary\nResults-driven data engineering leader with 10 years experience building enterprise data platforms.\nExpert in big data technologies and team management.\nProfessional Experience\nData Engineering Manager | CloudData Corp | 2020 - Present\n- Managed 12-person data engineering team building real-time analytics platform processing 1TB+\ndaily\n- Led cloud migration project moving 200+ data sources to AWS, completing 2 months ahead of\nschedule\n- Architected data warehouse serving business intelligence needs for 500+ analysts and executives\nSenior Data Engineer 