In [1]:
import os
import glob

from dotenv import load_dotenv

load_dotenv()

True

In [2]:
vectorstore_path = "../vectorstore_faiss"

In [3]:
import getpass

if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter API key for Azure: ")

from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

model = AzureChatOpenAI(
    azure_endpoint="https://menoua.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview",
    azure_deployment="gpt-41",
    api_version="2025-01-01-preview",
    temperature=0,
    use_responses_api=False,
)

embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    chunk_size=64,
    # max_retries=10,
    # retry_min_seconds=10,
    # retry_max_seconds=60,
)

# BDDv FAISS

In [9]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

1536


In [5]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader


# Only keep post title, headers, and content from the full HTML.
# bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
# loader = WebBaseLoader(
#     web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
#     bs_kwargs={"parse_only": bs4_strainer},
# )

pdf_files = glob.glob(os.path.join("resources", "*.pdf"))
all_docs = []

for pdf_path in pdf_files:
    loader = PyPDFLoader(pdf_path)
    # loader = PyPDFLoader(pdf_path, mode="single")
    docs = loader.load()
    all_docs.extend(docs)

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [6]:
import pprint

print(len(docs))
print(len(all_docs))
# pprint.pp(docs[0].metadata)

107
129


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # add_start_index=True,
)

all_splits = text_splitter.split_documents(all_docs)

print(f"Split all docs into {len(all_splits)} sub-documents.")

Split all docs into 609 sub-documents.


In [8]:
""" si la bdd existe déjà
vectorstore = FAISS.load_local(
    vectorstore_path,
    embeddings,
    allow_dangerous_deserialization=True
)
"""

vectorstore = FAISS.from_documents(documents=all_splits, embedding=embeddings)
vectorstore.save_local(vectorstore_path)

# RAG

In [37]:
from langchain.tools import tool
from typing import Literal

@tool(response_format="content_and_artifact")
def retrieve_context(query: str, section: Literal["Nike", "AdoptAI"]):
    """Retrieve information to help answer a query."""
    retrieved_docs = vectorstore.similarity_search(query, k=2)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\nContent: {doc.page_content}")
        for doc in retrieved_docs
    )
    return section, retrieved_docs

In [42]:
from langchain.agents import create_agent

tools = [retrieve_context]
# If desired, specify custom instructions
prompt = (
    "You have access to a tool that retrieves context from various documents. "
    "Use the tool to help answer user queries."
)
agent = create_agent(model, tools, system_prompt=prompt)

In [43]:
query = (
    "Qu'a dit Christophe Périllat à propos de Valeo?\n"
)

for event in agent.stream(
        {"messages": [{"role": "user", "content": query}]},
        stream_mode="values",
):
    event["messages"][-1].pretty_print()


Qu'a dit Christophe Périllat à propos de Valeo?

Tool Calls:
  retrieve_context (call_hyWtLHZi9otXib29JI3ZBq1N)
 Call ID: call_hyWtLHZi9otXib29JI3ZBq1N
  Args:
    query: Christophe Périllat propos Valeo
    section: AdoptAI
Name: retrieve_context

AdoptAI

Christophe Périllat, actuel directeur général de Valeo, a récemment déclaré que Valeo est engagé dans une transformation profonde pour répondre aux défis de la mobilité du futur, notamment l’électrification, la conduite autonome et la digitalisation. Il a souligné l’importance de l’innovation et de l’agilité pour permettre à Valeo de rester un leader technologique dans l’industrie automobile. Périllat insiste sur la capacité de Valeo à anticiper les évolutions du marché et à accompagner ses clients dans la transition énergétique.

Si vous souhaitez une citation précise ou des détails sur une intervention particulière, n’hésitez pas à préciser le contexte ou la date.


## Appel unique

In [40]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    """Inject context into state messages."""
    last_query = request.state["messages"][-1].text
    retrieved_docs = vectorstore.similarity_search(last_query)

    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

    system_message = (
        "You are a helpful assistant. Use the following context in your response:"
        f"\n\n{docs_content}"
    )

    return system_message


agent = create_agent(model, tools=[], middleware=[prompt_with_context])

In [41]:
query = "Quel est le numéro IRS d'identification de l'employeur de Nike?"
for step in agent.stream(
        {"messages": [{"role": "user", "content": query}]},
        stream_mode="values",
):
    step["messages"][-1].pretty_print()


Quel est le numéro IRS d'identification de l'employeur de Nike?

Le numéro d'identification de l'employeur (IRS Employer Identification No.) de Nike, Inc. est **93-0584541**.
