In [25]:
import getpass
import os
# Never paste the key into the notebook, not even in a comment: notebooks get
# shared and committed. Any key that was previously written here should be
# rotated.
# Prompt only when the variable is missing, so a kernel that already has
# MISTRAL_API_KEY exported can run top to bottom without interaction.
if not os.environ.get("MISTRAL_API_KEY"):
    os.environ["MISTRAL_API_KEY"] = getpass.getpass("Mistral API key: ")

In [26]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
#from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

#### Load the PDF

In [27]:
# Import from langchain_community, matching the WebBaseLoader import earlier in
# this notebook; the `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import PyPDFLoader

# Relative path: the PDF is expected in the notebook's working directory.
pdf_file_path = "ArtificialIntelligenceAct.pdf"

loader = PyPDFLoader(pdf_file_path)

# Read the PDF into a list of LangChain documents used by the cells below.
docs = loader.load()

#### Create the Chunks, generate the embeddings and store them
The embeddings are generated with the ``all-MiniLM-L6-v2`` model from Hugging Face.

Then we stored those embeddings in a vector database (Chroma).


In [28]:
# Break the loaded documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed every chunk with the all-MiniLM-L6-v2 model and index the result in Chroma.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

In [29]:
# Build a retriever over the Chroma store of PDF chunks and pull the RAG prompt
# template "rlm/rag-prompt" from the LangChain Hub.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

# Bare string expression: it is only displayed as this cell's output and is not
# used by the chain. Presumably a hand-copied reference of the pulled template;
# verify against `prompt` if the exact wording matters.
"""Prompt template:
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

    Question: {question} 

    Context: {context} 

    Answer:
"""



"Prompt template:\n    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\n\n    Question: {question} \n\n    Context: {context} \n\n    Answer:\n"

#### Model: MistralAI via LangChain

In [30]:
from langchain_mistralai import ChatMistralAI

# Chat model used for generation; the key is read from the environment variable
# rather than being written in the notebook.
llm = ChatMistralAI(
    model="open-mistral-7b",
    api_key=os.environ["MISTRAL_API_KEY"],
)

def format_docs(docs):
    """Concatenate the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined with a blank line between them
        (an empty string when ``docs`` is empty).
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# RAG pipeline: the incoming question is fed to the retriever (whose results
# are flattened into one string by format_docs) and also passed through
# unchanged; both values fill the prompt, which goes to the LLM, and the
# model's reply is parsed into a string.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

# Other questions that can be swapped in for `question`:
#   "What are the main values enshrined in the AI Act, according to Union law?"
#   "What are the primary objectives of the Artificial Intelligence Act?"
question = "How does the AI Act aim to regulate high-risk AI systems?"
rag_chain.invoke(question)

"The AI Act aims to regulate high-risk AI systems by requiring operators to comply with its requirements by specific deadlines. Operators of high-risk AI systems intended for public use must comply by six years after the Regulation's entry into force. Providers of these systems are encouraged to start complying during the transitional period. For systems placed on the market or put into service before certain deadlines, compliance is required if significant changes occur."