In [None]:
# !pip install "unstructured[all-docs]" pillow lxml pillow chromadb tiktoken langchain langchain-community python_dotenv langchain-google-genai langchain-openai google-generativeai ipykernel langchain-groq

In [None]:
import os
os.environ["GOOGLE_API_KEY"] = ""



In [1]:
from unstructured.partition.pdf import partition_pdf

file_path = 'Alexendra_Lopez_Resume.pdf'

# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,
    infer_table_structure=True,            # extract tables
    strategy="hi_res",                     # mandatory to infer tables

    extract_image_block_types=["Image"],   # Add 'Table' to list to extract image of tables
    # image_output_dir_path=output_path,   # if None, images and tables will saved in base64

    extract_image_block_to_payload=True,   # if true, will extract base64 for API usage

    chunking_strategy="by_title",          # or 'basic'
    max_characters=10000,                  # defaults to 500
    combine_text_under_n_chars=2000,       # defaults to 0
    new_after_n_chars=6000,

    # extract_images_in_pdf=True,          # deprecated
)

  from .autonotebook import tqdm as notebook_tqdm




The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [2]:
set([str(type(el)) for el in chunks])

{"<class 'unstructured.documents.elements.CompositeElement'>"}

In [3]:
chunks[0].metadata.orig_elements

[<unstructured.documents.elements.Title at 0x731d262bc470>,
 <unstructured.documents.elements.NarrativeText at 0x731d262bc260>,
 <unstructured.documents.elements.Title at 0x731d262bc740>,
 <unstructured.documents.elements.NarrativeText at 0x731d262bc890>,
 <unstructured.documents.elements.Title at 0x731d262bc860>,
 <unstructured.documents.elements.NarrativeText at 0x731c807f2ab0>,
 <unstructured.documents.elements.NarrativeText at 0x731d26263740>,
 <unstructured.documents.elements.ListItem at 0x731d2629f5f0>,
 <unstructured.documents.elements.NarrativeText at 0x731d2629f860>,
 <unstructured.documents.elements.ListItem at 0x731d2629fef0>,
 <unstructured.documents.elements.ListItem at 0x731d2629fb60>,
 <unstructured.documents.elements.ListItem at 0x731d2629f800>,
 <unstructured.documents.elements.NarrativeText at 0x731d2629ffe0>,
 <unstructured.documents.elements.NarrativeText at 0x731d2629cb30>,
 <unstructured.documents.elements.ListItem at 0x731d2629f350>,
 <unstructured.documents.elem

In [4]:
elements = chunks[3].metadata.orig_elements
chunk_images = [el for el in elements if 'Image' in str(type(el))]
chunk_images[0].to_dict()

IndexError: list index out of range

In [5]:
# separate tables from texts
tables = []
texts = []

for chunk in chunks:
    if "Table" in str(type(chunk)):
        tables.append(chunk)

    if "CompositeElement" in str(type((chunk))):
        texts.append(chunk)

In [6]:
# Get the images from the CompositeElement objects
def get_images_base64(chunks):
    images_b64 = []
    for chunk in chunks:
        if "CompositeElement" in str(type(chunk)):
            chunk_els = chunk.metadata.orig_elements
            for el in chunk_els:
                if "Image" in str(type(el)):
                    images_b64.append(el.metadata.image_base64)
    return images_b64

images = get_images_base64(chunks)

In [8]:
texts

[<unstructured.documents.elements.CompositeElement at 0x731d263206e0>,
 <unstructured.documents.elements.CompositeElement at 0x731d26355c40>]

In [7]:
import base64
from IPython.display import Image, display

def display_base64_image(base64_code):
    # Decode the base64 string to binary
    image_data = base64.b64decode(base64_code)
    # Display the image
    display(Image(data=image_data))

display_base64_image(images[0])

IndexError: list index out of range

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [12]:
# Prompt template
prompt = ChatPromptTemplate.from_template("""
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text.

Respond only with the summary, no additional comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.

Table or text chunk:
{content}
""")

# Model
model = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temperature=0.3)

# Chain
summarize_chain = prompt | model | StrOutputParser()


In [13]:
inputs = [{"content": t} for t in texts]

# Run batch with concurrency control
text_summaries = summarize_chain.batch(inputs, config={"max_concurrency": 3})

In [14]:

# Summarize tables
tables_html = [{"content":table.metadata.text_as_html} for table in tables]
table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 3})

In [15]:
text_summaries

['Alexendra Lopez is a Senior AI Engineer with five years of experience in building and deploying scalable AI/ML systems, natural language processing, and deep learning pipelines. Her career progression includes roles from intern to senior engineer, where she has led teams, architected an LLM-based chatbot for over 50,000 users, optimized inference pipelines, and developed models for predictive analytics, computer vision, and NLP. She is skilled in the end-to-end machine learning lifecycle, including distributed training, model optimization for edge deployment, and creating scalable data pipelines.',
 "A Stanford Data Science Master's and UC Berkeley Computer Science graduate specializing in NLP, LLMs, and Computer Vision. Proficient in Python, SQL, TensorFlow, and PyTorch, with experience across cloud platforms like AWS, GCP, and Azure. Key projects include building a real-time transcription system, an AI healthcare assistant for clinical note summarization, and an autonomous drone vi

In [22]:
table_summaries

[]

In [27]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI

# Prompt
prompt_template = """Describe the image in detail. For context,
the image is part of a research paper explaining the transformers architecture.
Be specific about graphs, such as bar plots."""

# Build prompt messages
messages = [
    (
        "user",
        [
            {"type": "text", "text": prompt_template},
            {
                "type": "image_url",
                "image_url": {"url": "data:image/jpeg;base64,{image}"},
            },
        ],
    )
]

prompt = ChatPromptTemplate.from_messages(messages)

# Gemini model
model = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.3)

# Chain
chain = prompt | model | StrOutputParser()

# Batch process images (list of base64 strings)
inputs = [{"image": img} for img in images]
image_summaries = chain.batch(inputs, config={"max_concurrency": 3})


In [28]:
image_summaries

['The image depicts the architecture of a Transformer model, showcasing the flow of data and operations within both the encoder (left) and decoder (right) stacks.  Each stack is represented within a rounded rectangle.\n\n**Encoder (Left):**\n\n1. **Inputs:**  Raw input data feeds into the model.\n2. **Input Embedding:** The input data is transformed into vector representations.\n3. **Positional Encoding:** Positional information is added to the embeddings, as transformers don\'t inherently understand sequence order. This is represented by a swirling symbol merging with the embedding output via a \'+\' symbol.\n4. **N×:** This indicates that the following block is repeated N times, signifying multiple encoder layers.\n5. **Multi-Head Attention:** This block performs self-attention, allowing the model to weigh the importance of different parts of the input sequence when encoding each word.\n6. **Add & Norm:**  This represents a residual connection followed by layer normalization. The out

In [16]:
import uuid
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="multi_modal_rag",
    embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)


  vectorstore = Chroma(


In [17]:
try:
    # Add texts
    doc_ids = [str(uuid.uuid4()) for _ in texts]
    summary_texts = [
        Document(page_content=summary, metadata={id_key: doc_ids[i]}) for i, summary in enumerate(text_summaries)
    ]
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, texts)))
except Exception as e:
    print(f"Error adding texts: {e}")

try:
    # Add tables
    table_ids = [str(uuid.uuid4()) for _ in tables]
    summary_tables = [
        Document(page_content=summary, metadata={id_key: table_ids[i]}) for i, summary in enumerate(table_summaries)
    ]
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, tables)))
except Exception as e:
    print(f"Error adding tables: {e}")

try:
    # Add image summaries
    img_ids = [str(uuid.uuid4()) for _ in images]
    summary_img = [
        Document(page_content=summary, metadata={id_key: img_ids[i]}) for i, summary in enumerate(image_summaries)
    ]
    retriever.vectorstore.add_documents(summary_img)
    retriever.docstore.mset(list(zip(img_ids, images)))
except Exception as e:
    print(f"Error adding images: {e}")

Error adding tables: Expected Embeddings to be non-empty list or numpy array, got [] in upsert.
Error adding images: name 'image_summaries' is not defined


In [18]:
# Retrieve
docs = retriever.invoke(
    "name of candidate?"
)

In [19]:
for doc in docs:
    print(str(doc) + "\n\n" + "-" * 80)

Education

Master of Science in Data Science – Stanford University | 2017 – 2019 Bachelor of Science in Computer Science – University of California, Berkeley | 2013 – 2017

Technical Skills

Languages: Python, Java, C++, SQL, R ML/DL Frameworks: TensorFlow, PyTorch, Scikit-learn, Hugging Face Specializations: NLP, LLMs, Computer Vision, Reinforcement Learning Tools & Platforms: AWS, GCP, Azure, Docker, Kubernetes, MLflow, Ray Databases: PostgreSQL, MongoDB, BigQuery

Projects

* Multi-Language Real-Time Transcription System - Built streaming ASR pipeline with Whisper + WebRTC.

*Al-Powered Healthcare Assistant - Deployed an NLP system for clinical notes summarization.

«Autonomous Drone Vision Model - Created CNN models for real-time aerial object detection.

--------------------------------------------------------------------------------
Alexendra Lopez

San Francisco, CA | alexendra.lopez@email.com | (123) 456-7890 linkedin.com/in/alexendralopez | github.com/alexendralopez

Professio

In [20]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from base64 import b64decode


def parse_docs(docs):
    """Split base64-encoded images and texts"""
    b64 = []
    text = []
    for doc in docs:
        try:
            b64decode(doc)
            b64.append(doc)
        except Exception:
            text.append(doc)
    return {"images": b64, "texts": text}


def build_prompt(kwargs):
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]

    context_text = ""
    if len(docs_by_type["texts"]) > 0:
        for text_element in docs_by_type["texts"]:
            context_text += text_element.text

    # construct prompt with context (including images)
    prompt_template = f"""
    Answer the question based only on the following context, which can include text, tables, and the below image.
    Context: {context_text}
    Question: {user_question}
    """

    prompt_content = [{"type": "text", "text": prompt_template}]

    if len(docs_by_type["images"]) > 0:
        for image in docs_by_type["images"]:
            prompt_content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image}"},
                }
            )

    return ChatPromptTemplate.from_messages([HumanMessage(content=prompt_content)])


# Use Gemini instead of OpenAI
gemini_model = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.3)

chain = (
    {
        "context": retriever | RunnableLambda(parse_docs),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(build_prompt)
    | gemini_model
    | StrOutputParser()
)

chain_with_sources = (
    {
        "context": retriever | RunnableLambda(parse_docs),
        "question": RunnablePassthrough(),
    }
    | RunnablePassthrough().assign(
        response=(
            RunnableLambda(build_prompt)
            | gemini_model
            | StrOutputParser()
        )
    )
)


In [23]:
response = chain.invoke(
    "name of companies the candidate worked with and their roles?"
)

print(response)

* **TechNova AI Solutions:** Senior AI Engineer
* **InnovaSoft:** AI Engineer
* **NextGen Robotics:** Machine Learning Engineer
* **BrightData Analytics:** AI/ML Engineer
* **CloudSphere Technologies:** Junior Machine Learning Engineer
* **Visionary Labs:** Machine Learning Intern


In [24]:
response = chain_with_sources.invoke(
    "name of companies the candidate worked with and their roles?"
)

print("Response:", response['response'])

print("\n\nContext:")
for text in response['context']['texts']:
    print(text.text)
    print("Page number: ", text.metadata.page_number)
    print("\n" + "-"*50 + "\n")
for image in response['context']['images']:
    display_base64_image(image)

Response: * **TechNova AI Solutions:** Senior AI Engineer
* **InnovaSoft:** AI Engineer
* **NextGen Robotics:** Machine Learning Engineer
* **BrightData Analytics:** AI/ML Engineer
* **CloudSphere Technologies:** Junior Machine Learning Engineer
* **Visionary Labs:** Machine Learning Intern


Context:
Education

Master of Science in Data Science – Stanford University | 2017 – 2019 Bachelor of Science in Computer Science – University of California, Berkeley | 2013 – 2017

Technical Skills

Languages: Python, Java, C++, SQL, R ML/DL Frameworks: TensorFlow, PyTorch, Scikit-learn, Hugging Face Specializations: NLP, LLMs, Computer Vision, Reinforcement Learning Tools & Platforms: AWS, GCP, Azure, Docker, Kubernetes, MLflow, Ray Databases: PostgreSQL, MongoDB, BigQuery

Projects

* Multi-Language Real-Time Transcription System - Built streaming ASR pipeline with Whisper + WebRTC.

*Al-Powered Healthcare Assistant - Deployed an NLP system for clinical notes summarization.

«Autonomous Drone V