<a href="https://colab.research.google.com/github/pk2971/computational-gender-analysis/blob/main/notebooks/Parliamentary_Debates_QA_with_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install -U langchain-community
%pip install -U sentence-transformers
%pip install chromadb
%pip install gradio

In [None]:
import zipfile
import re
from typing import Union, List
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers.cross_encoder import CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from langchain.vectorstores import Chroma
import gradio as gr

In [None]:
def extract_text_from_xml(xml_content: str) -> str:
    """Return the text content of an XML document as a single flat string.

    The markup is parsed with BeautifulSoup's ``lxml-xml`` parser; all tags
    are discarded and only the text nodes are kept.

    Args:
        xml_content: The XML document as a string.

    Returns:
        The document's text nodes, each stripped of surrounding whitespace
        and joined with a single space.
    """
    parsed = BeautifulSoup(xml_content, "lxml-xml")
    flat_text = parsed.get_text(separator=" ", strip=True)
    return flat_text

def load_and_split_xml_from_zip(
    zip_path: str,
    years: Union[int, List[int]],
    chunk_size: int = 1000,
    chunk_overlap: int = 200
):
    """Load the debate XML files for the given year(s) from a zip and chunk them.

    Archive members are selected when their name contains
    ``debates<year>-NN-NNa.xml`` for one of the requested years. Each selected
    member is decoded as UTF-8 (undecodable bytes are dropped), reduced to
    plain text with ``extract_text_from_xml`` and then split into chunks.

    Args:
        zip_path: Path to the zip archive holding the XML files.
        years: A single year or a list of years to load.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The documents produced by ``RecursiveCharacterTextSplitter.create_documents``
        for the extracted texts (empty input yields whatever the splitter
        returns for an empty list).
    """
    year_list = [years] if isinstance(years, int) else years
    year_pattern = '|'.join(map(str, year_list))
    # Compile once; the same pattern is tested against every archive member.
    filename_re = re.compile(rf'debates({year_pattern})-\d{{2}}-\d{{2}}a\.xml')

    with zipfile.ZipFile(zip_path, "r") as archive:
        selected = [name for name in archive.namelist() if filename_re.search(name)]
        all_texts = [
            extract_text_from_xml(
                archive.read(name).decode("utf-8", errors="ignore")
            )
            for name in selected
        ]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.create_documents(all_texts)


In [None]:
# Change year of documents to load here.
# Location of the debates archive.
# NOTE(review): this is a Google Drive path under /content/drive; no Drive
# mount call is visible in this notebook, so presumably Drive must be mounted
# before running this cell — confirm.
zip_file_path = "/content/drive/MyDrive/debates.zip"
# A single year is used here; load_and_split_xml_from_zip also accepts a list of years.
years = 1928
# Extract the text of the matching XML files and split it into chunks
# (default chunk_size / chunk_overlap of the loader are used).
docs = load_and_split_xml_from_zip(zip_file_path, years)


In [None]:
# Hugging Face model id of the embedding model used to vectorise the chunks.
embedding_model = "BAAI/bge-small-en-v1.5"
# LangChain embedding wrapper around that model; passed to the vector store below.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)


In [None]:
# Embed all chunks in `docs` and index them in a Chroma vector store.
# No persist directory is passed here, so the index is rebuilt each time this cell runs.
vectorstore = Chroma.from_documents(docs, embeddings)

# Cross-encoder used by the chatbot functions to rescore (query, chunk) pairs.
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Generator LLM.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# NOTE(review): placeholder — replace with a real Hugging Face access token
# before running, and avoid committing the real token to the notebook.
access_token = "HF token"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
# NOTE(review): no dtype/quantization argument is passed, so the library's
# default precision is used; `torch` is imported but not used in this
# notebook — confirm whether reduced precision was intended.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    device_map="auto"
)


In [None]:
# Returns the answer together with the retrieved context for cross referencing.
def gradio_rag_bot_context(query):
    """Answer a question about the debates and show the context that was used.

    Debug variant of the chatbot: retrieves candidate chunks, reranks them,
    generates an answer and returns both the selected context and the answer
    so the two can be cross-checked.

    Relies on the module-level ``vectorstore``, ``reranker``, ``tokenizer``
    and ``model`` objects.

    Args:
        query: The user's question as plain text.

    Returns:
        A string with a ``CONTEXT:`` section (the top reranked chunks, one
        per line break) followed by an ``ANSWER:`` section holding only the
        newly generated text.
    """
    # Retrieve a broad candidate set, then keep the 5 chunks the cross-encoder
    # scores highest for this query.
    retrieved_docs = vectorstore.similarity_search(query, k=15)
    texts = [doc.page_content for doc in retrieved_docs]
    pairs = [[query, t] for t in texts]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(texts, scores), key=lambda x: x[1], reverse=True)
    top_contexts = [t for t, _ in ranked[:5]]
    context = "\n".join(top_contexts)

    prompt = (
        "You are a historian assistant. Based on the context below, answer the question.\n"
        "Context:\n"
        f"{context}\n\n"
        f"Question: {query}\n"
        "Answer (as bullet points):"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=256)

    # The generated sequence starts with the prompt tokens. Decode only what
    # follows them so the prompt is not presented as part of the answer.
    prompt_length = inputs["input_ids"].shape[-1]
    answer = tokenizer.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Show the context explicitly so it can be checked against the answer.
    return f"CONTEXT:\n{context}\n\nANSWER:\n{answer}"

# Gradio UI for the debug variant (gradio_rag_bot_context).
# NOTE: `iface` is reassigned by a later cell, so this interface is only the
# one launched if launch() is called before that cell runs.
iface = gr.Interface(
    fn=gradio_rag_bot_context,
    inputs=gr.Textbox(lines=2, label="Ask a question about the debates"),
    outputs=gr.Textbox(lines=20, label="Debug Output (Context + Answer)"),
    title="Debates RAG Chatbot (Debug Mode)",
    description="See the context the model is using. If the answer is off, the context is likely not relevant."
)


In [None]:
def gradio_rag_bot(query):
    """Answer a question about the debates via retrieve -> rerank -> generate.

    Relies on the module-level ``vectorstore``, ``reranker``, ``tokenizer``
    and ``model`` objects.

    Args:
        query: The user's question as plain text.

    Returns:
        Only the newly generated answer text, stripped of surrounding
        whitespace; the prompt (instructions, context, question) is not
        included.
    """
    # Retrieve a broad candidate set, then keep the 5 chunks the cross-encoder
    # scores highest for this query.
    retrieved_docs = vectorstore.similarity_search(query, k=15)
    texts = [doc.page_content for doc in retrieved_docs]
    pairs = [[query, t] for t in texts]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(texts, scores), key=lambda x: x[1], reverse=True)
    top_contexts = [t for t, _ in ranked[:5]]
    context = "\n".join(top_contexts)

    prompt = (
        "You are a historian assistant. Based on the context below, answer the question.\n"
        "Context:\n"
        f"{context}\n\n"
        f"Question: {query}\n"
        "Answer (as bullet points):"
    )

    # Generate answer
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=512)

    # The generated sequence starts with the prompt tokens. Drop them by
    # position instead of splitting the decoded text on the
    # "Answer (as bullet points):" marker: a text split cuts the answer short
    # if the model repeats the marker, and leaks the whole prompt if the
    # marker does not survive the tokenizer round trip.
    prompt_length = inputs["input_ids"].shape[-1]
    answer = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    return answer.strip()


In [None]:
# Gradio UI for the answer-only chatbot (gradio_rag_bot); this rebinds `iface`,
# replacing any interface assigned to that name by an earlier cell.
# NOTE(review): the description advertises 1919–2024, but the index only
# contains the year(s) selected when the archive was loaded (1928 in this
# notebook as written) — confirm the wording matches the loaded data.
iface = gr.Interface(
    fn=gradio_rag_bot,
    inputs=gr.Textbox(lines=2, label="Ask a question about the debates"),
    outputs=gr.Textbox(lines=20, label="Answer"),
    title="Debates RAG Chatbot",
    description="Ask about historical UK Parliament debates (1919–2024)."
)


In [None]:
# Start the Gradio app for whichever interface `iface` currently refers to.
iface.launch(share=True)  # 'share=True' requests a public share link (useful in Colab); anyone with the link can query the bot
