In [6]:
# Install required packages
!pip install -q llama-index llama-index-packs-raptor llama-index-vector-stores-chroma nest-asyncio pyyaml chromadb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m628.3/628.3 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00

In [7]:
!pip install tenacity



In [8]:
import yaml
import os
import logging
from typing import Dict, Any

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def load_config(config_path: str = 'config.yaml') -> Dict[str, Any]:
    """Load configuration from YAML file."""
    try:
        with open(config_path, 'r') as file:
            config = yaml.safe_load(file)
        return config
    except FileNotFoundError:
        logger.error(f"Configuration file not found: {config_path}")
        raise
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML configuration: {e}")
        raise

# Define configuration directly in the notebook
config = {
    "openai_api_key": os.getenv("OPENAI_API_KEY"),
    "models": {
        "embedding": "text-embedding-3-small",
        "llm": "gpt-4o-mini"
    },
    "chunk_size": 400,
    "chunk_overlap": 50,
    "similarity_top_k": 2,
    "mode": "tree_traversal",
    "temperature": 0
}

# Set OpenAI API key
os.environ["OPENAI_API_KEY"] = config['openai_api_key']

In [11]:
# Download the RAPTOR paper
!wget -q https://moitt.gov.pk/SiteImage/Misc/files/Final%20Draft%20Personal%20Data%20Protection%20Bill%20May%202023.pdf -O ./DataLaw2.pdf
logger.info("RAPTOR paper downloaded successfully.")

In [12]:
import nest_asyncio
from llama_index.core import SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

# Apply nest_asyncio to allow asynchronous operations in Jupyter notebooks
nest_asyncio.apply()

def load_document(file_path: str):
    """Load document from file."""
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    logger.info(f"Document loaded successfully: {file_path}")
    return documents

def setup_vector_store(db_path: str, collection_name: str):
    """Set up ChromaDB vector store."""
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(collection_name)
    vector_store = ChromaVectorStore(chroma_collection=collection)
    logger.info(f"Vector store set up successfully: {collection_name}")
    return vector_store

# Load document
documents = load_document("./DataLaw2.pdf")

# Setup vector store
vector_store = setup_vector_store("./DataLaw2_db", "raptor")

In [13]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.packs.raptor import RaptorPack
from tenacity import retry, wait_exponential, stop_after_attempt

# Add retry decorator to OpenAIEmbedding
@retry(wait=wait_exponential(multiplier=1, min=1, max=10), stop=stop_after_attempt(6))
async def _aget_text_embedding_with_retry(self, text: str):
    """Get text embedding for a single text."""
    return await self._aget_text_embeddings([text])[0]

# Monkey patch the original method
OpenAIEmbedding._aget_text_embedding = _aget_text_embedding_with_retry


def create_raptor_pack(documents, vector_store, config):
    """Create and configure RAPTOR pack."""
    pack = RaptorPack(
        documents,
        embed_model=OpenAIEmbedding(model=config['models']['embedding']),
        llm=OpenAI(model=config['models']['llm'], temperature=config['temperature']),
        vector_store=vector_store,
        similarity_top_k=config['similarity_top_k'],
        mode=config['mode'],
        transformations=[SentenceSplitter(chunk_size=config['chunk_size'], chunk_overlap=config['chunk_overlap'])],
    )
    logger.info("RAPTOR pack created successfully.")
    return pack

# Create RAPTOR pack
raptor_pack = create_raptor_pack(documents, vector_store, config)

Generating embeddings for level 0.
Performing clustering for level 0.
Generating summaries for level 0 with 15 clusters.
Level 0 created summaries/clusters: 15
Generating embeddings for level 1.
Performing clustering for level 1.
Generating summaries for level 1 with 3 clusters.
Level 1 created summaries/clusters: 3
Generating embeddings for level 2.
Performing clustering for level 2.
Generating summaries for level 2 with 1 clusters.
Level 2 created summaries/clusters: 1


### Notes:
- Replace `your_api_key_here` with your OpenAI API key.
- The RAPTOR paper is downloaded and stored locally in the notebook.
- The script initializes and sets up the RAPTOR pack using the `llama-index` library.

In [14]:
from llama_index.packs.raptor import RaptorRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from typing import List, Tuple

def create_raptor_retriever(vector_store, config):
    """Create RAPTOR retriever."""
    retriever = RaptorRetriever(
        [],
        embed_model=OpenAIEmbedding(model=config['models']['embedding']),
        llm=OpenAI(model=config['models']['llm'], temperature=config['temperature']),
        vector_store=vector_store,
        similarity_top_k=config['similarity_top_k'],
        mode=config['mode'],
    )
    logger.info("RAPTOR retriever created successfully.")
    return retriever

def create_query_engine(retriever, config):
    """Create query engine."""
    query_engine = RetrieverQueryEngine.from_args(
        retriever,
        llm=OpenAI(model=config['models']['llm'], temperature=config['temperature'])
    )
    logger.info("Query engine created successfully.")
    return query_engine

def run_multiple_queries(query_engine, queries: List[str]) -> List[Tuple[str, str]]:
    """Run multiple queries and return results."""
    results = []
    for query in queries:
        response = query_engine.query(query)
        results.append((query, str(response)))
        logger.info(f"Query processed successfully: {query}")
    return results

# Create retriever and query engine
retriever = create_raptor_retriever(vector_store, config)
query_engine = create_query_engine(retriever, config)


In [15]:
# Example usage
queries = [
    """
    Extract and list the compliance points that an application's privacy policy should address according to the Pakistan Data Protection Law. Focus on identifying distinct categories or principles. list these compliance points in terms of priority and importance.

    Provide the results in the following structured format:
    - [Category Name 1]: Brief explanation of the principle.
    - [Category Name 2]: Brief explanation of the principle.
    - [Category Name 3]: Brief explanation of the principle.

    Ensure the categories are clearly defined and separate, so they can be used for follow-up queries.
    """
]

results = run_multiple_queries(query_engine, queries)

# Process results to generate follow-up prompts
follow_up_prompts = []

for query, response in results:
    print(f"Query: {query}")
    print(f"Response: {response}")
    print("-" * 50)

    # Extract categories dynamically based on the expected structured format
    categories = []
    for line in response.splitlines():
        if line.startswith("- "):  # Match the expected structure of categories
            category = line.split(":", 1)[0].strip("- ").strip()
            categories.append(category)

    # Generate follow-up prompts for each extracted category
    for category in categories:
        follow_up_prompts.append(
            f"""
            Retrieve the exact legal requirements for '{category}' as outlined in the Pakistan Data Protection Law.
            Provide specific details, including definitions, obligations, and any associated penalties or compliance measures.
            """
        )

# Print the follow-up prompts
for i, prompt in enumerate(follow_up_prompts, start=1):
    print(f"Follow-up Prompt {i}:")
    print(prompt.strip())
    print("-" * 50)


Query: 
    Extract and list the compliance points that an application's privacy policy should address according to the Pakistan Data Protection Law. Focus on identifying distinct categories or principles. list these compliance points in terms of priority and importance.

    Provide the results in the following structured format:
    - [Category Name 1]: Brief explanation of the principle.
    - [Category Name 2]: Brief explanation of the principle.
    - [Category Name 3]: Brief explanation of the principle.

    Ensure the categories are clearly defined and separate, so they can be used for follow-up queries.
    
Response: - **Personal Data Collection and Use**: The application must inform data subjects at the time of data collection about the purpose for which their personal data is being collected and ensure that it is not used for any other purpose without consent.

- **Consent for Disclosure**: Personal data should not be disclosed without the explicit consent of the data subje

In [16]:
results = run_multiple_queries(query_engine, follow_up_prompts)

# Print results
for query, response in results:
    print(f"Query: {query}")
    print(f"Response: {response}")
    print("-" * 50)

Query: 
            Retrieve the exact legal requirements for '**Personal Data Collection and Use**' as outlined in the Pakistan Data Protection Law.
            Provide specific details, including definitions, obligations, and any associated penalties or compliance measures.
            
Response: The legal requirements for personal data collection and use as outlined in the Pakistan Data Protection Law include the following key points:

1. **Grounds for Processing Personal Data**:
   - Personal data must be collected, processed, and disclosed lawfully and fairly, in compliance with the provisions of the Act.
   - Data collection must be for specified, explicit, and legitimate purposes, and further processing must not be incompatible with these purposes. The data collected should be adequate, relevant, and limited to the purposes for which it is processed.

2. **Registration Obligations**:
   - Data controllers and data processors operating within Pakistan must register with the Commi

In [17]:
# Step 1: Collect all responses into a single report prompt
report_prompt = """
Using the following compliance details, generate a comprehensive report in the form of a questionnaire.
The report should include sections for each compliance category, with a list of detailed questions based on the provided responses.

Structure:
- Section Title: <Compliance Category Name>
- Brief Description: A short summary of the category based on the response.
- Questions:
  - A list of questions based on the detailed information in the response.

Responses:
"""

for query, response in results:
    report_prompt += f"""
Query: {query.strip()}
Response: {response.strip()}
"""
report_prompt += "\n"

# Step 2: Add final instructions for the LLM to structure the report
report_prompt += """
Ensure the generated report is clearly formatted and suitable for presenting to a privacy policy agent. Each section should be titled with the compliance category and should include a concise summary and a detailed list of questions.
"""

# Step 3: Pass the prompt to the LLM
llm_generated_report = query_engine.query(report_prompt)

# Step 4: Print the generated report
print("\nGenerated Report Questionnaire:")
print(llm_generated_report)



Generated Report Questionnaire:
# Compliance Report Questionnaire

## Section Title: Personal Data Collection and Use
### Brief Description:
This section outlines the legal requirements for the lawful and fair collection, processing, and disclosure of personal data, emphasizing the need for specified purposes and registration obligations for data controllers and processors.

### Questions:
1. What are the specified, explicit, and legitimate purposes for which personal data is collected?
2. How does the data controller ensure that the data collected is adequate, relevant, and limited to these purposes?
3. What is the process for registering with the Commission, and what information is required?
4. How is a "significant" data controller or processor defined, and what criteria are used to determine this status?
5. What measures are in place to ensure compliance with the Act for data controllers or processors operating outside Pakistan?

---

## Section Title: Consent for Disclosure
### B

In [18]:
# prompt: save the generate llm_generated_report into text

# Step 4: Save the generated report to a text file
with open("llm_generated_report.txt", "w") as file:
    file.write(str(llm_generated_report))

In [None]:
# # Step 1: Read Questions from the Text File
# questions_file_path = "llm_generated_report.txt"

# with open(questions_file_path, "r") as file:
#     questions = file.readlines()

# # Step 2: Prepare Questions for Querying RAG
# # Clean and filter out any empty lines or unnecessary content
# questions = [q.strip() for q in questions if q.strip()]  # Remove blank lines and strip whitespace

# # Debug: Print the list of questions to ensure correctness
# print("\nQuestions to be Queried:")
# for i, question in enumerate(questions, start=1):
#     print(f"{i}. {question}")
# print("-" * 50)

# # Step 3: Query Each Question Using Raptor RAG
# responses = []
# for question in questions:
#     response = query_engine.query(question)  # Query the question against the RAG system
#     responses.append((question, response.text.strip()))  # Collect the question and response

# # Step 4: Print the Results
# print("\nResults from Raptor RAG:")
# for i, (question, response) in enumerate(responses, start=1):
#     print(f"Question {i}: {question}")
#     print(f"Response {i}: {response}")
#     print("=" * 50)

# # Step 5: Save the Results to a Text File
# output_file_path = "data_protection_requirements.txt"

# with open(output_file_path, "w") as file:
#     for question, response in responses:
#         file.write(f"Question: {question}\n")
#         file.write(f"Response: {response}\n")
#         file.write("=" * 50 + "\n")

# print(f"\nResults have been saved to: {output_file_path}")


In [19]:
# Step 1: Read Questions from the Text File
questions_file_path = "llm_generated_report.txt"

with open(questions_file_path, "r") as file:
    questions = file.readlines()

# Step 2: Prepare Questions for Querying RAG
# Clean and filter out any empty lines or unnecessary content
questions = [q.strip() for q in questions if q.strip()]  # Remove blank lines and strip whitespace

# Debug: Print the list of questions to ensure correctness
print("\nQuestions to be Queried:")
for i, question in enumerate(questions, start=1):
    print(f"{i}. {question}")
print("-" * 50)

# Step 3: Query Each Question Using Raptor RAG
responses = []
for question in questions:
    response = query_engine.query(question)  # Query the question against the RAG system
    # Access the response content using the 'response' attribute instead of 'text'
    responses.append((question, response.response.strip()))  # Collect the question and response

# Step 4: Print the Results
print("\nResults from Raptor RAG:")
for i, (question, response) in enumerate(responses, start=1):
    print(f"Question {i}: {question}")
    print(f"Response {i}: {response}")
    print("=" * 50)

# Step 5: Save the Results to a Text File
output_file_path = "data_protection_requirements.txt"

with open(output_file_path, "w") as file:
    for question, response in responses:
        file.write(f"Question: {question}\n")
        file.write(f"Response: {response}\n")
        file.write("=" * 50 + "\n")

print(f"\nResults have been saved to: {output_file_path}")


Questions to be Queried:
1. # Compliance Report Questionnaire
2. ## Section Title: Personal Data Collection and Use
3. ### Brief Description:
4. This section outlines the legal requirements for the lawful and fair collection, processing, and disclosure of personal data, emphasizing the need for specified purposes and registration obligations for data controllers and processors.
5. ### Questions:
6. 1. What are the specified, explicit, and legitimate purposes for which personal data is collected?
7. 2. How does the data controller ensure that the data collected is adequate, relevant, and limited to these purposes?
8. 3. What is the process for registering with the Commission, and what information is required?
9. 4. How is a "significant" data controller or processor defined, and what criteria are used to determine this status?
10. 5. What measures are in place to ensure compliance with the Act for data controllers or processors operating outside Pakistan?
11. ---
12. ## Section Title: 

In [20]:
# Step 1: Query the RAG System (using previously defined steps)
responses = []
for question in questions:
    response = query_engine.query(question)  # Query each question against RAG
    # Access the response content using the 'response' attribute instead of 'text'
    responses.append((question, response.response.strip()))  # Store question and response

# Step 2: Generate a Prompt for the LLM to Create a Checklist
llm_prompt = """
Using the following detailed responses, generate a checklist that outlines the compliance requirements an application's privacy policy must meet under the Pakistan Data Protection Law.

The checklist should:
- Be concise and clear.
- Use bullet points or numbered items.
- Group related requirements together under appropriate headings (e.g., "Lawful Processing", "Purpose Limitation").

Responses:
"""
for question, response in responses:
    llm_prompt += f"""
Question: {question.strip()}
Response: {response.strip()}
"""
llm_prompt += "\n"

llm_prompt += """
Ensure the checklist is practical and actionable, suitable for developers or legal teams to ensure compliance.
"""

# Step 3: Pass the Prompt to the LLM
checklist = query_engine.query(llm_prompt)  # Use the LLM to generate the checklist

# Step 4: Save the Checklist to a File
checklist_file_path = "privacy_policy_checklist.txt"


In [21]:

with open(checklist_file_path, "w") as file:
    # Access the response content using the 'response' attribute
    file.write(checklist.response.strip())  # Adjust attribute if needed for your query_engine

print(f"Checklist has been saved to: {checklist_file_path}")

# Step 5: Print the Checklist
print("\nGenerated Checklist:")
# Access the response content using the 'response' attribute
print(checklist.response.strip())

Checklist has been saved to: privacy_policy_checklist.txt

Generated Checklist:
### Compliance Checklist for Privacy Policy under Pakistan Data Protection Law

#### 1. **Lawful Processing**
- Obtain explicit consent from data subjects before collecting or processing personal data.
- Ensure that personal data is collected for specified, explicit, and legitimate purposes.
- Maintain records of all applications, notices, and requests related to personal data processing.

#### 2. **Purpose Limitation**
- Clearly define and communicate the purposes for which personal data is collected.
- Ensure that personal data is not processed in a manner incompatible with the stated purposes.

#### 3. **Data Subject Rights**
- Inform data subjects of their rights, including the right to access, correction, and erasure of their personal data.
- Provide a mechanism for data subjects to request access to their personal data and receive a copy in an intelligible form.

#### 4. **Data Security Measures**
- I

In [22]:
# Step 1: Query the RAG System (using previously defined steps)
responses = []
for question in questions:
    response = query_engine.query(question)  # Query each question against RAG
    # Access the response content using the 'response' attribute instead of 'text'
    responses.append((question, response.response.strip()))  # Store question and response

# Step 2: Generate a Prompt for the LLM to Create a Checklist
llm_prompt = """
Using the following detailed responses, generate a checklist that outlines the compliance requirements an application's privacy policy must meet under the Pakistan Data Protection Law.

The checklist should:
- Be concise and clear.
- Use bullet points or numbered items.
- Group related requirements together under appropriate headings (e.g., "Lawful Processing", "Purpose Limitation").

Responses:
"""
for question, response in responses:
    llm_prompt += f"""
Question: {question.strip()}
Response: {response.strip()}
"""
llm_prompt += "\n"

llm_prompt += """
Ensure the checklist is practical and actionable, suitable for developers or legal teams to ensure compliance.
"""

# Step 3: Pass the Prompt to the LLM
checklist = query_engine.query(llm_prompt)  # Use the LLM to generate the checklist

# Step 4: Save the Checklist to a File
checklist_file_path = "privacy_policy_checklist.txt"

# Ensure the file is created and content is written before proceeding to the next step
with open(checklist_file_path, "w") as file:
    # Access the response content using the 'response' attribute
    file.write(checklist.response.strip())  # Adjust attribute if needed for your query_engine

print(f"Checklist has been saved to: {checklist_file_path}")

# Step 5: Print the Checklist
print("\nGenerated Checklist:")
# Access the response content using the 'response' attribute
print(checklist.response.strip())

Checklist has been saved to: privacy_policy_checklist.txt

Generated Checklist:
### Compliance Checklist for Privacy Policy under Pakistan Data Protection Law

#### 1. **Lawful Processing**
- Obtain explicit consent from data subjects before collecting or processing personal data.
- Ensure that personal data is collected for specified, explicit, and legitimate purposes.
- Maintain records of all applications, notices, and requests related to personal data processing.

#### 2. **Purpose Limitation**
- Clearly define and communicate the purposes for which personal data is collected.
- Limit data collection to what is adequate, relevant, and necessary for the stated purposes.

#### 3. **Data Subject Rights**
- Inform data subjects of their rights, including the right to access, correction, and erasure of their personal data.
- Provide a mechanism for data subjects to request access to their personal data and to correct inaccuracies.

#### 4. **Consent Management**
- Implement processes fo