# 🏦 Bank Information System RAG Chatbot (Weaviate + Azure OpenAI, LangChain)

This notebook demonstrates a full Retrieval-Augmented Generation (RAG) workflow tailored to banking information system documentation, using **Weaviate** as the vector database and **Azure OpenAI** for the LLM and embeddings, with French-language prompts.

In [ ]:
# -- 1. Install required packages (run only once)
!pip install python-dotenv langchain langchain-openai langchain-community weaviate-client[embedded] unstructured[all-docs] pypdf sentence-transformers rank_bm25

## 2. Environment & Config
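Set up your secrets before running the notebook:
- Create a `.env` file in your project with your Azure OpenAI credentials (`AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME`, `AZURE_OPENAI_DEPLOYMENT_NAME`). Weaviate credentials are only needed when connecting to a remote instance.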

In [ ]:
import os
from dotenv import load_dotenv

# Read the local .env file before checking the environment.
load_dotenv()

# Azure OpenAI settings that later cells read through os.getenv.
REQUIRED_VARS = [
    'AZURE_OPENAI_API_KEY',
    'AZURE_OPENAI_ENDPOINT',
    'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME',
    'AZURE_OPENAI_DEPLOYMENT_NAME',
]

# Fail fast on the first variable that is unset or empty.
for var in REQUIRED_VARS:
    if os.getenv(var):
        continue
    raise EnvironmentError(f"Missing env variable: {var}")
# Optionally set Weaviate config if connecting to a remote instance. For embedded test, skip.

## 3. Document Ingestion (PDF/CSV/XLSX)
Load your files from the structured folders `documents/pdfs`, `documents/csv`, and `documents/excel`.

In [ ]:
from langchain_community.document_loaders import PyPDFDirectoryLoader, CSVLoader, UnstructuredExcelLoader
import glob

DOCS_DIR = "documents"

def _load_and_tag(loader, file_path, source_type):
    """Run ``loader.load()`` and stamp every document with its origin file and type."""
    loaded = loader.load()
    file_name = os.path.basename(file_path)
    for doc in loaded:
        doc.metadata["source_file"] = file_name
        doc.metadata["source_type"] = source_type
    return loaded


def load_documents():
    """Load every PDF, CSV and Excel file found under ``DOCS_DIR``.

    Files are read from the ``pdfs``, ``csv`` and ``excel`` sub-folders. Each
    loaded document gets ``source_file`` (base file name) and ``source_type``
    (``"pdf"``, ``"csv"`` or ``"excel"``) added to its metadata. A file that
    fails to load is reported on stdout and skipped, so one bad file does not
    abort the whole ingestion.

    Returns:
        A list with the documents of all files that loaded successfully.
    """
    # PyPDFLoader reads exactly one file. Using the directory loader inside the
    # per-file loop would re-read the entire folder for every PDF, duplicating
    # every page and tagging pages with the wrong ``source_file``.
    from langchain_community.document_loaders import PyPDFLoader

    # (sub-folder, glob pattern, source_type tag, loader factory)
    loader_specs = (
        ("pdfs", "*.pdf", "pdf", lambda path: PyPDFLoader(path)),
        ("csv", "*.csv", "csv", lambda path: CSVLoader(file_path=path)),
        ("excel", "*.xls*", "excel", lambda path: UnstructuredExcelLoader(file_path=path)),
    )

    documents = []
    for subdir, pattern, source_type, make_loader in loader_specs:
        folder = os.path.join(DOCS_DIR, subdir)
        if not os.path.exists(folder):
            continue
        # Sorted so the ingestion order is the same on every run.
        for file in sorted(glob.glob(os.path.join(folder, pattern))):
            try:
                loaded = _load_and_tag(make_loader(file), file, source_type)
            except Exception as e:
                # Best-effort ingestion: report and move on to the next file.
                print(f"Error loading {file}: {e}")
                continue
            documents.extend(loaded)
            print(f"Loaded {os.path.basename(file)}")

    print(f"Loaded {len(documents)} documents in total")
    return documents

# Load the corpus once at cell level; the splitting cell below consumes `docs`.
docs = load_documents()

## 4. Semantic Text Splitting

In [ ]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np

# Function for semantic text splitting
def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if not denominator:
        return 0.0
    return float(np.dot(vec_a, vec_b) / denominator)


def _merge_chunk_group(group):
    """Join a run of chunks into one Document carrying the first chunk's metadata."""
    # Imported locally: the notebook never imports Document at cell level.
    from langchain_core.documents import Document

    metadata = group[0].metadata.copy()
    metadata["chunk_sources"] = [c.metadata.get("source_file", "unknown") for c in group]
    return Document(page_content=" ".join(c.page_content for c in group), metadata=metadata)


def split_documents_semantic(documents, chunk_size=1000, chunk_overlap=200,
                             similarity_threshold=0.75,
                             model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Split documents into chunks, then merge adjacent chunks that are similar.

    The documents are first cut with ``RecursiveCharacterTextSplitter``. Each
    non-empty chunk is then embedded; a chunk is appended to the current group
    when its cosine similarity to the group's mean embedding exceeds
    ``similarity_threshold`` and the group would stay under ``1.5 * chunk_size``
    characters. Every finished group becomes one document whose metadata is the
    first chunk's metadata plus a ``chunk_sources`` list.

    Args:
        documents: Documents to split.
        chunk_size: Target chunk length in characters for the first pass.
        chunk_overlap: Character overlap between first-pass chunks.
        similarity_threshold: Minimum cosine similarity required to merge.
        model_name: SentenceTransformer model used for the grouping embeddings.

    Returns:
        The merged chunks, or the plain first-pass chunks if the embedding
        model cannot be loaded.
    """
    # First get basic chunks. Separators are literal strings (not regexes), so
    # the sentence separator must be ". " — an escaped "\. " would never match.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    chunks = text_splitter.split_documents(documents)

    try:
        model = SentenceTransformer(model_name)
        print(f"Loaded semantic model: {model_name}")
    except Exception as e:
        print(f"Error loading semantic model: {e}")
        # Fall back to regular chunking
        return chunks

    max_group_length = chunk_size * 1.5
    grouped_chunks = []
    current_group = []
    group_embeddings = []   # embeddings of the chunks in current_group
    group_length = 0        # total characters in current_group
    last_embedding = None   # mean embedding of current_group

    for chunk in chunks:
        text = chunk.page_content
        # Skip empty chunks
        if not text.strip():
            continue

        current_embedding = model.encode(text)

        if current_group:
            similarity = _cosine_similarity(current_embedding, last_embedding)
            fits = group_length + len(text) < max_group_length
            if similarity > similarity_threshold and fits:
                current_group.append(chunk)
                group_embeddings.append(current_embedding)
                group_length += len(text)
                # Reuse the embeddings already computed instead of re-encoding
                # the whole group on every append.
                last_embedding = np.mean(group_embeddings, axis=0)
                continue
            # Not mergeable: close the current group before starting a new one.
            grouped_chunks.append(_merge_chunk_group(current_group))

        current_group = [chunk]
        group_embeddings = [current_embedding]
        group_length = len(text)
        last_embedding = current_embedding

    # Add the last group if it exists
    if current_group:
        grouped_chunks.append(_merge_chunk_group(current_group))

    print(f"Initial chunking: {len(chunks)} chunks")
    print(f"After semantic grouping: {len(grouped_chunks)} chunks")
    return grouped_chunks

# Apply the semantic splitter to the loaded corpus; `chunks` is what gets
# embedded and inserted into the vector store in the next section.
chunks = split_documents_semantic(docs)
print(f"Split into {len(chunks)} final chunks.")

## 5. Weaviate Vector Store Setup + Embeddings (Azure OpenAI)

In [ ]:
# We use embedded Weaviate for testing. If you want remote, set the proper config env.
import weaviate
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import Weaviate as LCWeaviate

from weaviate.embedded import EmbeddedOptions

# Weaviate class (collection) that stores the document chunks.
WEAVIATE_CLASS = "BankDocChunk"
# JSON file used by the retriever to persist chat history between sessions.
BANK_DB_STATE_PATH = "bank_rag_state.json"

# Azure OpenAI embedding model; used for both indexing and query embedding.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    openai_api_version="2023-05-15"
)

# Create embedded Weaviate client. The first positional argument of
# weaviate.Client is a URL, so "embedded" must not be passed there; the
# embedded instance is requested through `embedded_options`.
client = weaviate.Client(embedded_options=EmbeddedOptions())

# Clean up any previous test schema/classes
if client.schema.exists(WEAVIATE_CLASS):
    client.schema.delete_class(WEAVIATE_CLASS)

# Add a class (collection) for chunks with metadata fields.
# "vectorizer": "none" means Weaviate does not embed text itself; vectors are
# supplied by the client (the Azure OpenAI embeddings above).
client.schema.create_class({
    "class": WEAVIATE_CLASS,
    "vectorizer": "none",
    "properties": [
        {"name": "text", "dataType": ["text"]},
        {"name": "source_file", "dataType": ["string"]},
        {"name": "source_type", "dataType": ["string"]},
        {"name": "page", "dataType": ["int"]},
    ],
})

print(f"Weaviate schema: {[x['class'] for x in client.schema.get()['classes']]}")

# Use LangChain's Weaviate wrapper for RAG.
# - `index_name` is the wrapper's parameter for the Weaviate class name.
# - `by_text=False` makes searches embed the query with `embeddings` and run a
#   vector search; text (nearText) search needs a server-side vectorizer,
#   which this class does not have.
vectorstore = LCWeaviate.from_documents(
    documents=chunks,
    embedding=embeddings,
    client=client,
    by_text=False,
    text_key="text",
    index_name=WEAVIATE_CLASS,
)

# Ask Weaviate how many objects the class now holds.
aggregate = client.query.aggregate(WEAVIATE_CLASS).with_meta_count().do()
inserted_count = aggregate['data']['Aggregate'][WEAVIATE_CLASS][0]['meta']['count']
print(f"Inserted {inserted_count} vector chunks.")

## 6. Free Custom Re-ranker Using BM25 Instead of Cohere

In [ ]:
from rank_bm25 import BM25Okapi
from typing import List, Dict, Any
import json
import pickle
from langchain.retrievers.document_compressors import DocumentCompressorPipeline

class BM25Reranker:
    """Rerank candidate documents against a query with BM25 lexical scoring.

    Tokenisation is lower-casing plus whitespace splitting. ``rerank`` scores
    only the candidates it is given, so it works without calling ``fit``
    first; ``fit`` is kept for callers that want a corpus-level model in
    ``self.bm25``.
    """

    def __init__(self, k: int = 4):
        """Initialize with the number of documents to return."""
        self.k = k
        self.tokenized_corpus = None
        self.bm25 = None
        self.document_map = {}

    @staticmethod
    def _tokenize(text):
        """Lower-case and whitespace-split ``text``."""
        return text.lower().split()

    def fit(self, documents):
        """Build a BM25 index over ``documents`` and return ``self``.

        An empty corpus leaves ``self.bm25`` as ``None`` instead of raising,
        because BM25 divides by the corpus size.
        """
        documents = list(documents)
        self.tokenized_corpus = [self._tokenize(doc.page_content) for doc in documents]
        self.document_map = dict(enumerate(documents))
        self.bm25 = BM25Okapi(self.tokenized_corpus) if self.tokenized_corpus else None
        return self

    def rerank(self, query: str, documents):
        """Return the ``k`` documents that score highest for ``query``.

        Args:
            query: The user query.
            documents: Candidates exposing a ``page_content`` string.

        Returns:
            At most ``k`` documents, best first. Equal scores keep their
            input order. An empty candidate list yields ``[]``.
        """
        documents = list(documents)
        top_k = max(self.k, 0)
        if not documents or top_k == 0:
            return []

        tokenized_docs = [self._tokenize(doc.page_content) for doc in documents]
        # With no tokens at all the average document length is zero and BM25
        # cannot score; keep the incoming (vector-search) order instead.
        if not any(tokenized_docs):
            return documents[:top_k]

        # Score against a model built from these candidates only; a previously
        # fitted corpus may not contain them.
        scores = BM25Okapi(tokenized_docs).get_scores(self._tokenize(query))

        # sorted() is stable, so ties preserve the original candidate order.
        order = sorted(range(len(documents)), key=lambda i: scores[i], reverse=True)
        return [documents[i] for i in order[:top_k]]
    
class CustomRetrieverWithHistory:
    """Vector-store retriever with BM25 reranking and chat-history expansion.

    Chat history is a list of ``(question, answer)`` tuples. It is used to run
    a second, history-enriched similarity search and can be persisted to and
    restored from a JSON file.

    NOTE(review): this is a plain class, not a LangChain ``BaseRetriever``
    subclass; confirm that the chain it is handed to accepts it.
    """

    def __init__(self, vectorstore, chat_history=None, k=10, rerank_k=4):
        """Store the vector store and search sizes.

        Args:
            vectorstore: Object exposing ``similarity_search(query, k=...)``.
            chat_history: Optional initial list of ``(question, answer)`` pairs.
            k: Number of candidates fetched by the primary similarity search.
            rerank_k: Number of documents kept after BM25 reranking.
        """
        self.vectorstore = vectorstore
        self.chat_history = chat_history or []
        self.reranker = BM25Reranker(k=rerank_k)
        self.k = k

    def get_relevant_documents(self, query):
        """Return reranked documents for ``query``, widened by recent history."""
        # First, get documents from vector store
        docs = self.vectorstore.similarity_search(query, k=self.k)

        if self.chat_history:
            # The last three exchanges provide conversational context.
            recent_history = self.chat_history[-3:]
            history_context = "\n".join([f"Q: {q}\nA: {a}" for q, a in recent_history])

            # Combine with current query for broader retrieval
            enhanced_query = f"{query} {history_context}"
            additional_docs = self.vectorstore.similarity_search(enhanced_query, k=5)

            # Merge both result sets; the first document seen for a given
            # text wins, so duplicates are dropped.
            unique_docs = {}
            for doc in docs + additional_docs:
                unique_docs.setdefault(doc.page_content, doc)
            docs = list(unique_docs.values())

        # Rerank against the original query, not the history-enriched one.
        return self.reranker.rerank(query, docs)

    def update_history(self, query, answer):
        """Update chat history with new exchange"""
        self.chat_history.append((query, answer))

    def save_history(self, file_path=BANK_DB_STATE_PATH):
        """Save chat history and a timestamp to ``file_path`` as JSON."""
        # An import statement is not an expression and cannot sit inside
        # str(...), so import first and then build the timestamp.
        import datetime

        state = {
            "chat_history": self.chat_history,
            "timestamp": str(datetime.datetime.now())
        }
        with open(file_path, 'w', encoding="utf-8") as f:
            json.dump(state, f)

    def load_history(self, file_path=BANK_DB_STATE_PATH):
        """Load chat history from ``file_path``; a missing file is not an error."""
        try:
            with open(file_path, 'r', encoding="utf-8") as f:
                state = json.load(f)
            # JSON has no tuple type, so saved pairs come back as lists.
            # Rebuild (question, answer) tuples to match update_history; a
            # malformed entry raises here and is reported below without
            # touching the current history.
            restored = [(q, a) for q, a in state.get("chat_history", [])]
            self.chat_history = restored
            print(f"Loaded {len(self.chat_history)} previous conversations")
        except FileNotFoundError:
            print("No previous history found.")
        except Exception as e:
            print(f"Error loading history: {e}")

# Initialize our custom retriever: fetch 10 candidates from the vector store
# and keep the 4 best after BM25 reranking.
retriever = CustomRetrieverWithHistory(vectorstore=vectorstore, k=10, rerank_k=4)

# Load previous chat history if available (reads BANK_DB_STATE_PATH by default)
retriever.load_history()

## 7. Setup LLM, Prompts and Chain (French, RAG)

In [ ]:
from langchain_openai import AzureChatOpenAI
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.prompts import PromptTemplate

# Setup LLM: Azure OpenAI chat deployment shared by the RAG chain and the
# helper chains defined later. temperature=0 minimises sampling randomness.
llm = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    temperature=0,
)

# French answer-generation prompt. It is filled with three variables:
# {context} (retrieved chunks), {chat_history} and {question}. It asks for the
# exact tables, columns, relations and types, a list of source files at the
# end, and an explicit acknowledgement when the context lacks the answer.
RAG_TEMPLATE = """Vous êtes un assistant expert en documentation de base de données bancaire.
Votre objectif est de fournir des informations précises et spécifiques sur l'emplacement des données dans le système de base de données de la banque.

Informations contextuelles issues de la documentation de la base de données ci-dessous:
---------------------
{context}
---------------------

Compte tenu des informations contextuelles et de la question, fournissez une réponse détaillée qui indique clairement:
1. La ou les table(s) exacte(s) où se trouvent les données demandées
2. Les noms spécifiques des colonnes
3. Les relations avec d'autres tables
4. Les types de données et les contraintes si pertinent

Important: À la fin de votre réponse, indiquez clairement les sources d'où proviennent ces informations en listant les noms des fichiers PDF sources.

Soyez précis et technique. Si les informations ne peuvent pas être trouvées dans le contexte, reconnaissez-le clairement.
N'inventez pas d'informations sur les noms de tables ou les structures.

Historique de la conversation: {chat_history}
Question: {question}
"""
# Wrap the template so the chain knows which variables it must supply.
PROMPT = PromptTemplate(
    template=RAG_TEMPLATE,
    input_variables=["context", "chat_history", "question"]
)

# Conversational RAG chain. return_source_documents=True makes each result
# carry 'source_documents', which process_query reads.
# NOTE(review): `retriever` is a plain Python class here; confirm the installed
# LangChain version accepts it (the chain may require a BaseRetriever subclass).
rag_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": PROMPT},
)


## 8. Query Reformulation, Explanation, Commentary Functions (French)

In [ ]:
def generate_alternative_queries(query: str) -> list:
    """Ask the LLM for alternative phrasings of ``query``.

    The model is prompted (in French) for three reformulations, one per line.
    Leading list markers such as ``1.``, ``2)``, ``-`` or ``*`` are stripped,
    because the model may number its output and the markers would otherwise
    become part of the retrieval query. Duplicates are removed so the same
    question is not run through the chain twice.

    Args:
        query: The user's original question.

    Returns:
        The distinct alternative questions in the order produced, followed by
        the original ``query`` as the last element.
    """
    import re

    prompt = PromptTemplate(
        template="""Génère 3 façons alternatives de poser la question suivante à propos d'une base de données bancaire.\nRends les alternatives spécifiques et concentrées sur la structure de la base de données et l'emplacement des données.\nQuestion originale: {question}\n\nRetourne uniquement les questions, une par ligne.""",
        input_variables=["question"]
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    result = chain.run(question=query)

    # A marker must be followed by whitespace so "3.5 millions" is left alone.
    list_marker = re.compile(r"^\s*(?:\d+[.)]|[-*•])\s+")
    cleaned = (list_marker.sub("", line).strip() for line in result.split('\n'))

    # dict.fromkeys keeps first-seen order while dropping repeats; the original
    # query is excluded here and appended once at the end.
    alts = list(dict.fromkeys(q for q in cleaned if q and q != query))
    return alts + [query]

def generate_answer_explanation(query, answer, sources):
    """Have the LLM explain, in French, how ``answer`` follows from the sources.

    Only the first two source documents are shown to the model, each cut to
    its first 300 characters. Returns the explanation with surrounding
    whitespace removed.
    """
    excerpts = []
    for source_doc in sources[:2]:
        excerpts.append(source_doc.page_content[:300])
    joined_excerpts = "\n".join(excerpts)

    template_text = """En tant qu'expert en base de données bancaire, explique comment tu as formulé la réponse suivante à la question posée.\nFais référence aux données spécifiques et à la structure de la base de données bancaire.\n\nQuestion: {query}\nRéponse: {answer}\n\nSources principales:\n{sources}\n\nExplique ton raisonnement en français en te concentrant sur:\n1. Comment les sources ont influencé ta réponse\n2. Les éléments clés de la structure de la base de données mentionnés\n3. Les relations entre les tables identifiées\n4. Toute implication ou considération technique importante\n\nExplication (en 3-5 phrases):"""
    explanation_prompt = PromptTemplate(
        template=template_text,
        input_variables=["query", "answer", "sources"],
    )
    explainer = LLMChain(llm=llm, prompt=explanation_prompt)
    raw_explanation = explainer.run(query=query, answer=answer, sources=joined_excerpts)
    return raw_explanation.strip()

def generate_technical_commentary(query, answer):
    """Have the LLM write a short French technical note on a question/answer pair.

    Returns the model output with surrounding whitespace removed.
    """
    template_text = """En tant qu'expert en bases de données bancaires, fournis un commentaire technique bref\nsur les aspects de la structure de la base de données mentionnés dans cette question et réponse.\n\nQuestion: {query}\nRéponse: {answer}\n\nFournis un commentaire technique (2-3 phrases) qui pourrait aider un développeur ou un analyste de données\nà mieux comprendre les implications techniques de cette information:"""
    commentary_prompt = PromptTemplate(
        template=template_text,
        input_variables=["query", "answer"],
    )
    commentator = LLMChain(llm=llm, prompt=commentary_prompt)
    raw_commentary = commentator.run(query=query, answer=answer)
    return raw_commentary.strip()

## 9. Evaluation Functions & User Feedback

In [ ]:
def evaluate_rag_response(query, answer, retrieved_docs, feedback=None):
    """Score a RAG response with simple lexical heuristics.

    Returns a dict with:
      - ``document_relevance``: share of retrieved docs containing more than
        20% of the query's tokens, scaled to 0-10.
      - ``source_diversity``: distinct ``source_file`` values, capped at
        three, scaled to 0-10.
      - ``answer_query_alignment``: share of query tokens found in the answer,
        scaled to 0-10.
      - ``user_feedback``: the ``feedback`` value, only when it is not None.
      - ``hallucination_risk``: ``"Low"`` if the answer contains one of the
        citation markers, else ``"Medium"``.
      - ``overall_score``: weighted mean (0.4 / 0.3 / 0.3) of the three
        numeric scores.
    """
    query_terms = set(query.lower().split())
    lowered_answer = answer.lower()

    def _query_coverage(tokens):
        """Fraction of the query's tokens present in ``tokens`` (0 for an empty query)."""
        if not query_terms:
            return 0
        return len(query_terms & tokens) / len(query_terms)

    scores = {}

    # 1. Document relevance: count docs whose coverage beats the 0.2 threshold.
    hits = sum(
        1
        for doc in retrieved_docs
        if _query_coverage(set(doc.page_content.lower().split())) > 0.2
    )
    relevance_ratio = hits / len(retrieved_docs) if retrieved_docs else 0
    scores["document_relevance"] = round(relevance_ratio * 10, 2)

    # 2. Source diversity: three or more distinct files earn the full score.
    distinct_sources = {
        doc.metadata["source_file"]
        for doc in retrieved_docs
        if "source_file" in doc.metadata
    }
    diversity_ratio = min(len(distinct_sources) / 3, 1.0)
    scores["source_diversity"] = round(diversity_ratio * 10, 2)

    # 3. Answer/query alignment: token-overlap stand-in for an LLM judge.
    alignment_ratio = _query_coverage(set(lowered_answer.split()))
    scores["answer_query_alignment"] = round(alignment_ratio * 10, 2)

    # 4. Pass the user's rating through when one was given.
    if feedback is not None:
        scores["user_feedback"] = feedback

    # 5. Hallucination risk: does the answer appear to cite its sources?
    citation_markers = ("source", "pdf", "provien", "extract")
    cites_sources = any(marker in lowered_answer for marker in citation_markers)
    if cites_sources:
        scores["hallucination_risk"] = "Low"
    else:
        scores["hallucination_risk"] = "Medium"

    # Overall score: weighted average of the three numeric metrics.
    weights = {"document_relevance": 0.4, "source_diversity": 0.3, "answer_query_alignment": 0.3}
    weighted_total = sum(scores[name] * weight for name, weight in weights.items())
    scores["overall_score"] = round(weighted_total, 2)

    return scores

class UserFeedbackTracker:
    """Persist user ratings of answers in a JSON file and report their average.

    The file holds a JSON list of entries with the keys ``timestamp``,
    ``query``, ``answer`` (first 100 characters plus ``"..."``), ``rating``
    and ``comment``.
    """

    def __init__(self, feedback_file="user_feedback.json"):
        """Remember the file path and load any feedback already stored there."""
        self.feedback_file = feedback_file
        self.feedback_history = self._load_feedback()

    def _load_feedback(self):
        """Load feedback history from file.

        Returns an empty list when the file is missing, unreadable, or does
        not contain a JSON list.
        """
        try:
            with open(self.feedback_file, 'r', encoding="utf-8") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            return []
        except Exception as e:
            print(f"Error loading feedback: {e}")
            return []
        # save_feedback appends to this object, so anything but a list
        # (e.g. a hand-edited file holding a dict) must be rejected here.
        if not isinstance(loaded, list):
            print(f"Error loading feedback: expected a list, got {type(loaded).__name__}")
            return []
        return loaded

    def save_feedback(self, query, answer, rating, comment=None):
        """Append one feedback entry, rewrite the file, and return ``rating``.

        Args:
            query: The question that was asked.
            answer: The answer that was rated; only a shortened copy is stored.
            rating: The user's score (printed as ``rating/5``).
            comment: Optional free-text remark.
        """
        # An import statement is not an expression and cannot sit inside
        # str(...), so import first and then build the timestamp.
        import datetime

        feedback_entry = {
            "timestamp": str(datetime.datetime.now()),
            "query": query,
            "answer": answer[:100] + "...",  # Store shortened answer
            "rating": rating,
            "comment": comment
        }

        self.feedback_history.append(feedback_entry)

        with open(self.feedback_file, 'w', encoding="utf-8") as f:
            json.dump(self.feedback_history, f)

        print(f"Feedback saved. Current rating: {rating}/5")
        return rating

    def get_average_rating(self):
        """Return the mean of all stored ratings, or ``None`` if there are none."""
        ratings = [entry["rating"] for entry in self.feedback_history if "rating" in entry]
        return sum(ratings) / len(ratings) if ratings else None

# Initialize feedback tracker (default file: user_feedback.json); this module-level
# instance is the one submit_feedback writes to.
feedback_tracker = UserFeedbackTracker()

## 10. Process a Query Through the Full Chain


In [ ]:
def extract_source_files(documents):
    """Return the distinct source names of ``documents`` in first-seen order.

    For each document, ``metadata["source_file"]`` is used when present,
    otherwise ``metadata["source"]``; documents with neither key are skipped.
    The order is deterministic because the result is shown to the user in the
    "Sources:" line, and a set would reorder it from run to run.
    """
    sources = []
    for doc in documents:
        if "source_file" in doc.metadata:
            sources.append(doc.metadata["source_file"])
        elif "source" in doc.metadata:
            sources.append(doc.metadata["source"])
    # dict.fromkeys drops repeats while keeping insertion order.
    return list(dict.fromkeys(sources))

def process_query(query: str):
    """Answer ``query`` with the RAG chain and attach explanation and metrics.

    The question and its LLM-generated paraphrases are each run through
    ``rag_chain``. The result with the most source documents wins; on a tie
    the result for the user's own wording is preferred. The chosen answer is
    then explained, commented and evaluated, the source file names are
    appended if missing, and the exchange is saved to the chat history.

    Returns:
        A dict with the keys ``answer``, ``sources``, ``source_files``,
        ``explanation``, ``commentary`` and ``evaluation``.
    """
    # Generate alternative phrasings of the query
    alt_queries = generate_alternative_queries(query)

    # History restored from JSON may hold lists instead of tuples; normalise
    # every turn to a (question, answer) tuple before passing it to the chain.
    history = [tuple(turn) for turn in retriever.chat_history]

    # Process each alternative query, remembering which query produced what.
    all_results = []
    for q in alt_queries:
        res = rag_chain({"question": q, "chat_history": history})
        all_results.append((q, res))

    # Pick the result with the most source docs. The second key element breaks
    # ties in favour of the original query; without it max() would return the
    # first paraphrase whenever every result has the same number of documents.
    _, best_result = max(
        all_results,
        key=lambda item: (len(item[1]['source_documents']), item[0] == query),
    )
    answer, sources = best_result['answer'], best_result['source_documents']

    # Generate additional context (computed on the answer before the
    # "Sources:" footer is appended below).
    explanation = generate_answer_explanation(query, answer, sources)
    commentary = generate_technical_commentary(query, answer)
    evaluation = evaluate_rag_response(query, answer, sources)

    # Add source files information if not already in the answer
    source_files = extract_source_files(sources)
    if source_files and "Sources:" not in answer:
        answer += f"\n\nSources: {', '.join(source_files)}"

    # Update chat history
    retriever.update_history(query, answer)
    retriever.save_history()

    return {
        'answer': answer,
        'sources': sources,
        'source_files': source_files,
        'explanation': explanation,
        'commentary': commentary,
        'evaluation': evaluation
    }

def submit_feedback(query, answer, rating, comment=None):
    """Record a user's rating of an answer through the shared feedback tracker.

    Returns whatever the tracker's ``save_feedback`` returns.
    """
    saved_rating = feedback_tracker.save_feedback(query, answer, rating, comment=comment)
    return saved_rating

# Usage example: run one French question through the full pipeline.
user_query = "Dans quelle table puis-je trouver les informations sur les transactions clients ?"
result = process_query(user_query)

# Show each part of the result dict returned by process_query.
print('Réponse:', result['answer'])
print('\nExplication:', result['explanation'])
print('\nCommentaire technique:', result['commentary'])
print('\nÉvaluation:', result['evaluation'])
print('\nSources des documents:', result['source_files'])

# Example of submitting feedback (rating 5, with an optional comment)
# submit_feedback(user_query, result['answer'], 5, "Excellente réponse!")