### Imports

In [1]:
from src.models.azure_openai import (
    get_openai_chat_llm_gpt4o,
    get_openai_chat_llm_gpt4omini,
)
from src.models.aws_models import (
    get_aws_chat_sonnet,
)
from src.rag.rag_pipeline import (
    get_chat_chain_rerank,
    get_retriever_parent_child,
    get_qa_chain_rerank,
)

In [2]:
# Build the two retrievers used throughout this notebook: `retriever` (used for
# semantic search below) and `bm25_retriever` (used for keyword search below).
# The arguments are presumably the document folder and the embedding model name
# -- confirm against src.rag.rag_pipeline.
# Embedding models tried:
#   Alibaba-NLP/gte-multilingual-base, BAAI/llm-embedder, BAAI/bge-small-en-v1.5
(retriever, bm25_retriever) = get_retriever_parent_child(
    "./data",
    "BAAI/bge-small-en-v1.5",
)

### Vector Store Retriever

In [8]:
# Sample comparative question passed to each retriever in the retrieval cells
# of this section (semantic, keyword, hybrid, and hybrid + re-ranker).
query = "What are the key similarities between GDPR and Brazil's LGPD?"

In [None]:
# Semantic Search
# Query the vector-store retriever on its own. `invoke` is the Runnable entry
# point for retrievers; `get_relevant_documents` is deprecated in recent
# langchain-core releases and emits a deprecation warning.

retriever.invoke(query)

[Document(metadata={'source': 'data/OJ_L_202401689_EN_TXT.pdf'}, page_content='6. National market surveillance authorities and the national data protection authorities of Member States that have been notified of the use of ‘real-time’ remote biometric identification systems in publicly accessible spaces for law enforcement purposes pursuant to paragraph 4 shall submit to the Commission annual reports on such use. For that purpose, the Commission shall provide Member States and national market surveillance and data protection authorities with a template, including information on the number of the decisions taken by competent judicial authorities or an independent administrative authority whose decision is binding upon requests for authorisations in accordance with paragraph 3 and their result.'),
 Document(metadata={'source': 'data/OJ_L_202401689_EN_TXT.pdf'}, page_content='4.\n\nEntry/Exit System\n\nRegulation (EU) 2017/2226 of the European Parliament and of the Council of 30 November 

In [None]:
# Keyword Search
# Query the BM25 retriever on its own. `invoke` replaces the deprecated
# `get_relevant_documents` and returns the same list of documents.

bm25_retriever.invoke(query)



In [None]:
from langchain.retrievers import EnsembleRetriever

# Hybrid Search
# Fuse the keyword (BM25) and vector-store rankings; the weights are given in
# the same order as `retrievers`, so both retrievers count equally here.
# The former `search_kwargs={"k": 3}` argument was dropped: EnsembleRetriever
# declares no such field, so the value was silently ignored.
# NOTE(review): the number of hits is governed by the underlying retrievers --
# confirm where `k` is configured in get_retriever_parent_child.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever],
    weights=[1 - 0.5, 0.5],
)
# `invoke` replaces the deprecated `get_relevant_documents`.
ensemble_retriever.invoke(query)

 Document(metadata={'source': 'data/OJ_L_202401689_EN_TXT.pdf'}, page_content='6. National market surveillance authorities and the national data protection authorities of Member States that have been notified of the use of ‘real-time’ remote biometric identification systems in publicly accessible spaces for law enforcement purposes pursuant to paragraph 4 shall submit to the Commission annual reports on such use. For that purpose, the Commission shall provide Member States and national market surveillance and data protection authorities with a template, including information on the number of the decisions taken by competent judicial authorities or an independent administrative authority whose decision is binding upon requests for authorisations in accordance with paragraph 3 and their result.'),
 Document(metadata={'source': 'data/OJ_L_202401689_EN_TXT.pdf'}, page_content='4.\n\nEntry/Exit System\n\nRegulation (EU) 2017/2226 of the European Parliament and of the Council of 30 November 

In [None]:
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers import ContextualCompressionRetriever


# Hybrid Search with Re-Ranker
# The hybrid (ensemble) retriever supplies candidates; a cross-encoder then
# re-scores them against the query.
# Cross-encoder models tried:
#   mixedbread-ai/mxbai-rerank-base-v1, BAAI/bge-reranker-base

model = HuggingFaceCrossEncoder(model_name="mixedbread-ai/mxbai-rerank-base-v1")
# `top_n` is the number of re-ranked documents the compressor keeps.
compressor = CrossEncoderReranker(model=model, top_n=10)
# The former `search_kwargs={"k": 3}` argument was dropped:
# ContextualCompressionRetriever declares no such field, so it was silently
# ignored. Adjust `top_n` above to change how many documents come back.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=ensemble_retriever,
)
# `invoke` replaces the deprecated `get_relevant_documents`.
compression_retriever.invoke(query)

[Document(metadata={'source': 'data/OJ_L_202401689_EN_TXT.pdf'}, page_content='6. National market surveillance authorities and the national data protection authorities of Member States that have been notified of the use of ‘real-time’ remote biometric identification systems in publicly accessible spaces for law enforcement purposes pursuant to paragraph 4 shall submit to the Commission annual reports on such use. For that purpose, the Commission shall provide Member States and national market surveillance and data protection authorities with a template, including information on the number of the decisions taken by competent judicial authorities or an independent administrative authority whose decision is binding upon requests for authorisations in accordance with paragraph 3 and their result.'),
 Document(metadata={'source': 'data/OJ_L_202401689_EN_TXT.pdf'}, page_content='4.\n\nEntry/Exit System\n\nRegulation (EU) 2017/2226 of the European Parliament and of the Council of 30 November 

### Chat Chain

In [None]:
# Assemble the conversational chain from both retrievers and the Sonnet chat model.
# NOTE(review): 0.5 is presumably the hybrid-search weight, and (0, True) are
# presumably temperature and a streaming flag -- confirm against src.
chat_llm = get_aws_chat_sonnet(0, True)
chat_chain = get_chat_chain_rerank(retriever, bm25_retriever, 0.5, chat_llm)

In [None]:
# Smoke-test the chat chain: a plain greeting with an empty conversation history.
chat_inputs = {"question": "Hello", "chat_history": []}
result = chat_chain.invoke(chat_inputs)
result

### Question & Answer Chain

In [3]:
# Assemble the single-turn question-answering chain from both retrievers and
# the Sonnet chat model.
# NOTE(review): 0.5 is presumably the hybrid-search weight, and (0, True) are
# presumably temperature and a streaming flag -- confirm against src.
qa_llm = get_aws_chat_sonnet(0, True)
qa_chain = get_qa_chain_rerank(retriever, bm25_retriever, 0.5, qa_llm)

In [4]:
# Smoke-test the QA chain with a plain greeting; the cell displays the full
# response dictionary.
qa_inputs = {"query": "Hello"}
result = qa_chain.invoke(qa_inputs)
result

{'query': 'Hello',
 'result': "Hello! I'm here to help you understand the EU AI Act. Is there something specific about the regulation you'd like to learn more about?",
  Document(metadata={'source': 'data/OJ_L_202401689_EN_TXT.pdf'}, page_content='(101) Providers of general-purpose AI models have a particular role and responsibility along the AI value chain, as the models they provide may form the basis for a range of downstream systems, often provided by downstream providers that necessitate a good understanding of the models and their capabilities, both to enable the integration of such models into their products, and to fulfil their obligations under this or other regulations. Therefore, proportionate transparency measures should be laid down, including the drawing up and keeping up to date of documentation, and the provision of information on the general-purpose AI model for its usage by the downstream providers. Technical documentation should be prepared and kept up to date by'),


### Queries

Fact-Based

1. What is the maximum fine for a GDPR violation?
2. What does GDPR say about the right to be forgotten?
3. Under what conditions can personal data be transferred outside the EU?
4. What is the role of a Data Protection Officer (DPO)?
5. What constitutes a ‘data breach’ under GDPR?
6. What are the lawful bases for processing personal data under GDPR?
7. How long can organizations retain personal data under GDPR?
8. What are the key rights of individuals regarding their personal data?
9. What constitutes valid consent under GDPR?
10. What obligations do data processors have under GDPR?

Abstract

11. How does GDPR define data minimization?
12. Why is GDPR considered a landmark regulation in data privacy?
13. What are the key differences between explicit and implicit consent under GDPR?
14. How does GDPR affect AI-based data processing?

Reasoning-Based

15. If a company stores customer data without informing users, which GDPR articles does it violate?
16. Can a company use personal data without consent if they anonymize it?
17. If a user deletes their account, does GDPR require their data to be erased immediately?

Comparative

18. How does GDPR differ from the California Consumer Privacy Act (CCPA)?
19. How does GDPR handle children’s data protection compared to COPPA?
20. What are the key similarities between GDPR and Brazil’s LGPD?


In [5]:
# Fact-based query #1 from the list above; only the generated answer text is printed.
fine_question = "What is the maximum fine for a GDPR violation?"
result = qa_chain.invoke({"query": fine_question})
print(result["result"])

According to Article 99 of the EU AI Act, the maximum administrative fines for violations are:

1. Up to €35 million or 7% of total worldwide annual turnover (whichever is higher) for violations of the prohibited AI practices under Article 5

2. Up to €15 million or 3% of total worldwide annual turnover (whichever is higher) for violations of other obligations by operators or notified bodies

3. Up to €7.5 million or 1% of total worldwide annual turnover (whichever is higher) for supplying incorrect/incomplete/misleading information to notified bodies or authorities

However, I notice you specifically asked about GDPR violations, not AI Act violations. The AI Act text does not specify the maximum GDPR fines. For an accurate answer about GDPR fines specifically, you would need to consult the GDPR regulation itself.

I aim to be precise and only answer based on the information contained in the provided text. Since your question is about GDPR fines and not AI Act fines, I should acknowled

In [6]:
# Show the text of every source document returned with the last answer, each
# followed by a dashed rule so that chunk boundaries are easy to spot.
separator = "--------------------------------------------------------------------------------------------------------"
for doc in result["source_documents"]:
    print(doc.page_content, separator, sep="\n")

(b) whether administrative fines have already been applied by other market surveillance authorities to the same operator for

the same infringement;

(c) whether administrative fines have already been applied by other authorities to the same operator for infringements of other Union or national law, when such infringements result from the same activity or omission constituting a relevant infringement of this Regulation;

(d) the size, the annual turnover and market share of the operator committing the infringement;

(e) any other aggravating or mitigating factor applicable to the circumstances of the case, such as financial benefits gained,

or losses avoided, directly or indirectly, from the infringement;

(f)
--------------------------------------------------------------------------------------------------------
national public authorities or bodies referred to in Article 77(1). Directive (EU) 2016/943 of the European Parliament and of the Council of 8 June 2016 on the protection of 