In [None]:
%pip install chromadb langchain pdfplumber sentence-transformers requests python-dotenv


In [6]:
%pip uninstall transformers tensorflow


^C
Note: you may need to restart the kernel to use updated packages.


In [20]:
import os
import torch
import numpy as np
import fitz  # PyMuPDF
import faiss
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
import requests
import re

# Load environment variables
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "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")

# Initialize embedding models - only load what we need
print("Loading embedding models...")
multilingual_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
print("Embedding models loaded.")

# Document processing functions
def extract_text_from_pdf(pdf_path):
    """Extract text from PDF using PyMuPDF (fitz)"""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join([page.get_text("text") for page in doc])
        return text
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_path}: {e}")
        return ""

def load_documents(folder_path):
    """Load documents from a folder"""
    docs = []
    
    # Check if folder exists
    if not os.path.exists(folder_path):
        print(f"Warning: Folder {folder_path} does not exist!")
        return docs
    
    print(f"Looking for PDF files in {folder_path}...")
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        if filename.endswith(".pdf"):
            print(f"Processing PDF: {filename}")
            text = extract_text_from_pdf(file_path)
            if text:
                docs.append(Document(page_content=text, metadata={"source": filename}))
        
    print(f"Loaded {len(docs)} documents")
    return docs

def split_documents(documents, chunk_size=500, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    split_docs = []
    for doc in documents:
        chunks = text_splitter.split_text(doc.page_content)
        for chunk in chunks:
            split_docs.append(Document(page_content=chunk, metadata=doc.metadata))
    return split_docs

# Build vector storage with hybrid retrieval
import os

class HybridRetriever:
    def __init__(self, documents=None, use_faiss=True, faiss_index_path="faiss_index.bin", chroma_path="chroma_store"):
        self.faiss_index_path = faiss_index_path
        self.chroma_path = chroma_path

        if documents:
            self.documents = documents
            self.document_texts = [doc.page_content for doc in documents]
            self.document_sources = [doc.metadata for doc in documents]

            print("Building BM25 index...")
            self.tokenized_corpus = [text.lower().split() for text in self.document_texts]
            self.bm25 = BM25Okapi(self.tokenized_corpus)

            print("Encoding document embeddings...")
            self.embeddings = multilingual_model.encode(self.document_texts, convert_to_numpy=True)

            if use_faiss:
                self._build_faiss_index()

            print("Building Chroma vectorstore...")
            self._build_chroma_vectorstore()
        else:
            print("Loading existing indices...")
            self.documents = []
            self.document_texts = []
            self.document_sources = []
            self.bm25 = None
            self.embeddings = None
            self.index = None
            self.chroma = None

            if os.path.exists(self.faiss_index_path):
                self._load_faiss_index()

            if os.path.exists(self.chroma_path):
                self._load_chroma_vectorstore()

    def _build_faiss_index(self):
        print("Building FAISS index...")
        d = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(d)
        self.index.add(self.embeddings)
        faiss.write_index(self.index, self.faiss_index_path)
        print(f"FAISS index saved to {self.faiss_index_path}")

    def _load_faiss_index(self):
        print(f"Loading FAISS index from {self.faiss_index_path}...")
        self.index = faiss.read_index(self.faiss_index_path)
        print("FAISS index loaded.")

    def _build_chroma_vectorstore(self):
        embedding_function = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        )
        self.chroma = Chroma.from_documents(documents=self.documents, embedding=embedding_function, persist_directory=self.chroma_path)
        self.chroma.persist()
        print(f"Chroma vectorstore saved to {self.chroma_path}")

    def _load_chroma_vectorstore(self):
        print(f"Loading Chroma vectorstore from {self.chroma_path}...")
        embedding_function = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        )
        self.chroma = Chroma(persist_directory=self.chroma_path, embedding=embedding_function)
        print("Chroma vectorstore loaded.")

    def search(self, query, top_k=5, hybrid_weight=0.7):
        if not self.documents:
            return []

        query_embedding = multilingual_model.encode([query])[0]

        # BM25 retrieval
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        bm25_scores = bm25_scores / (np.max(bm25_scores) + 1e-8)  # Normalize

        # Semantic retrieval
        semantic_scores = cosine_similarity([query_embedding], self.embeddings)[0]

        # Combine scores
        combined_scores = hybrid_weight * semantic_scores + (1 - hybrid_weight) * bm25_scores

        # Get top-k indices and scores
        top_indices = np.argsort(combined_scores)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            results.append({
                "content": self.document_texts[idx],
                "metadata": self.document_sources[idx],
                "score": float(combined_scores[idx])
            })

        # Additional retrieval from Chroma
        chroma_results = self.chroma.similarity_search_with_score(query, k=top_k)
        chroma_docs = [{"content": doc.page_content, "metadata": doc.metadata, "score": score}
                       for doc, score in chroma_results]

        # Merge results
        all_contents = set([r["content"] for r in results])
        for doc in chroma_docs:
            if doc["content"] not in all_contents:
                results.append(doc)
                all_contents.add(doc["content"])

        results = results[:top_k]
        return results

def query_openrouter_llm(context, question, language="fr"):
    """Query LLM through OpenRouter API"""
    if language == "fr":
        prompt = f"""
        En vous basant sur les extraits juridiques suivants, répondez à la question en français.
        Si les extraits ne contiennent pas d'information pertinente pour répondre, indiquez-le clairement.
        
        Extraits juridiques:
        {context}
        
        Question: {question}
        """
        
        model = "deepseek/deepseek-chat-v3-0324"
    else:
        prompt = f"""
        Based on the following legal extracts, answer the question in English.
        If the extracts don't contain relevant information to answer, clearly state this.
        
        Legal extracts:
        {context}
        
        Question: {question}
        """
        
        model = "anthropic/claude-3-sonnet-20240229"
    
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}]
    }
    
    response = requests.post("https://openrouter.ai/api/v1/chat/completions", 
                          headers=headers, json=data)
    
    if response.status_code != 200:
        raise Exception(f"API Error: {response.status_code} - {response.text}")
    
    res_json = response.json()
    if "choices" not in res_json:
        raise Exception(f"Unexpected API response: {res_json}")
    
    return res_json["choices"][0]["message"]["content"].strip()

def query_rag(question, retriever, top_k=5, language="fr"):
    """Run RAG pipeline with improved retrieval"""
    # Get document chunks relevant to the question
    retrieved_docs = retriever.search(question, top_k=top_k)
    
    if not retrieved_docs:
        if language == "fr":
            return "Aucun document pertinent trouvé pour cette question.", []
        else:
            return "No relevant documents found for this question.", []
    
    # Create context from retrieved documents
    context = "\n\n---\n\n".join([doc["content"][:1500] for doc in retrieved_docs])
    print(context)
    try:
        # Query LLM with context and question
        answer = query_openrouter_llm(context, question, language)
        
        # Extract sources for citation
        sources = [doc["metadata"] for doc in retrieved_docs]
        
        return answer, sources
    
    except Exception as e:
        print(f"Error querying LLM: {e}")
        if language == "fr":
            return f"Erreur lors de la génération de la réponse: {str(e)}", []
        else:
            return f"Error generating response: {str(e)}", []

# This class helps integrate the RAG system into a Jupyter notebook
class LegalRAGSystem:
    def __init__(self, documents_folder="documents"):
        self.documents_folder = documents_folder
        self.retriever = None
        self.loaded = False
    
    def load_documents(self, folder_path=None):
        """Load documents from specified folder or use default"""
        if folder_path:
            self.documents_folder = folder_path
        
        print(f"Loading documents from {self.documents_folder}")
        documents = load_documents(self.documents_folder)
        
        if documents:
            chunked_docs = split_documents(documents)
            self.retriever = HybridRetriever(chunked_docs)
            self.loaded = True
            return f"Loaded {len(documents)} documents, created {len(chunked_docs)} chunks"
        else:
            print("No documents found. Please check the path and try again.")
            return "No documents loaded"
    
    def query(self, question, language="fr", top_k=5):
        """Query the system with a question"""
        if not self.loaded:
            return "System not loaded. Please load documents first."
        
        answer, sources = query_rag(question, self.retriever, top_k, language)
        
        # Format sources for display
        source_info = []
        for src in sources:
            source_name = src.get('source', 'Unknown')
            chunk_num = src.get('chunk', 'Unknown')
            total_chunks = src.get('chunk_of', 'Unknown')
            source_info.append(f"{source_name} (chunk {chunk_num}/{total_chunks})")
        
        return {
            "answer": answer,
            "sources": source_info
        }

Loading embedding models...
Embedding models loaded.


In [21]:
# First execute the code above

# Initialize the system
rag_system = LegalRAGSystem()

# Load documents from a specific folder

document = Document(page_content=extract_text_from_pdf(r"C:\Users\mouni\tun_law_project\documents\Constitution_fr.pdf"), metadata={"source": "Constitution_fr.pdf"})
chunked_docs = split_documents([document])
rag_system.retriever = HybridRetriever(chunked_docs)
rag_system.loaded = True

# Query the system
result = rag_system.query("citez l'article 20")

# Display the result
print("Answer:")
print(result["answer"])
print("\nSources:")
for source in result["sources"]:
    print(f"- {source}")

Building BM25 index...
Encoding document embeddings...
Building FAISS index...
FAISS index saved to faiss_index.bin
Building Chroma vectorstore...
Chroma vectorstore saved to chroma_store
constituante. 
Demeurent en vigueur, jusqu’à l’élection du Président de la 
République conformément aux dispositions de l’article 74 et suivants 
de la Constitution, les dispositions des articles 7, 9 à 14 et de l’article 
26 de l’Organisation provisoire des pouvoirs publics.  
Demeurent en vigueur, jusqu’à ce que le premier Gouvernement 
obtienne la confiance de l’Assemblée des représentants du peuple, les 
articles 17 à 20 de l’Organisation
)
1
(  provisoire des pouvoirs publics.

---

9
1 à 20
Chapitre I : Des principes généraux……..…..…...
13
21 à 49
Chapitre II : Des droits et libertés……….......…...
19
50 à 70
Chapitre III : Du pouvoir législatif……….……... 
27
71 à 101
Chapitre IV : Du pouvoir exécutif ………..……...
27
72 à 88
Section I : Du Président de la République………... 
34
89 à 101
Section II : D

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 427)