<a href="https://colab.research.google.com/github/pk2971/computational-gender-analysis/blob/main/notebooks/Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile
import xml.etree.ElementTree as ET
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

In [None]:
!pip install faiss-cpu --no-cache


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m161.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
# Hard-coded location of the zipped debates archive.
# NOTE(review): the /content/drive/MyDrive prefix looks like a Google Drive mount
# in Colab; no mount call is visible in this notebook - confirm the drive is
# mounted before this path is used.
zip_path = '/content/drive/MyDrive/debates.zip'
# Clean and parse XML text
def extract_text_from_speech(xml_bytes):
    """Return the paragraph text found inside ``<speech>`` elements of an XML document.

    Args:
        xml_bytes: The XML document as ``bytes`` or ``str``.

    Returns:
        The text of every ``<p>`` located under a ``<speech>`` element, joined
        with single spaces. An empty string is returned when the document
        cannot be parsed.
    """
    try:
        root = ET.fromstring(xml_bytes)
    except ET.ParseError:
        # Best effort: a malformed file contributes no text instead of aborting.
        return ""
    # NOTE(review): './/speech' only matches <speech> elements *below* the root;
    # if a file's root element is itself <speech> nothing is matched - confirm
    # against the archive layout.
    paragraphs = root.findall('.//speech//p')
    # itertext() also yields the text inside and after nested inline elements
    # (e.g. <i>, <phrase>); reading only `p.text` would stop at the first child.
    return ' '.join(''.join(p.itertext()) for p in paragraphs)

In [None]:
# Step 1: Chunk and embed
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings in document order. Empty or whitespace-only
        text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a zero step makes ``range`` fail with an unrelated
            message and a negative step silently produces no chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, every later window would
        # be a pure suffix of it; stop so no redundant tail chunk is emitted.
        if start + chunk_size >= len(words):
            break
    return chunks


In [None]:
def build_faiss_index(chunks, embedder):
    """Embed ``chunks`` and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty sequence of text chunks to embed.
        embedder: Object exposing ``encode(texts, convert_to_numpy=True)``
            that returns one vector per text (a ``SentenceTransformer`` in
            this notebook).

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the 2-D float32 array of vectors that was added to it.

    Raises:
        ValueError: If ``chunks`` is empty; there is nothing to index and the
            embedding dimension could not be determined.
    """
    if len(chunks) == 0:
        raise ValueError("cannot build a FAISS index from an empty list of chunks")
    embeddings = embedder.encode(chunks, convert_to_numpy=True)
    # FAISS works on C-contiguous float32 matrices; coerce defensively in case
    # the embedder returns another dtype or memory layout.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings

In [None]:
# Step 2: Retrieval
def retrieve_relevant_chunks(question, chunks, embedder, index, top_k=5):
    """Return the chunks whose embeddings are nearest to the question.

    Args:
        question: The query text.
        chunks: The chunk list the index was built from; positions returned
            by the index are used to look chunks up in this list.
        embedder: Object exposing ``encode(texts, convert_to_numpy=True)``.
        index: Object exposing ``search(queries, k)`` that returns
            ``(distances, positions)`` (a FAISS index in this notebook).
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, in the order reported by ``index.search``.
    """
    q_emb = embedder.encode([question], convert_to_numpy=True)
    # Distances are not needed; only the neighbour positions are used.
    _, neighbour_ids = index.search(q_emb, top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are
    # available. Indexing a list with -1 would silently return the *last*
    # chunk, so padded slots are dropped instead.
    return [chunks[i] for i in neighbour_ids[0] if i >= 0]

In [None]:
# Step 3: Summarization/QA
# Loaded summarization pipelines keyed by model name, so the model is loaded
# once per kernel session instead of on every call.
_SUMMARIZER_CACHE = {}


def _get_summarizer(model_name):
    """Return the summarization pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = pipeline("summarization", model=model_name)
    return _SUMMARIZER_CACHE[model_name]


def summarize_with_model(context, question, model_name="google/flan-t5-small"):
    """Summarize ``context`` with the question prepended as a prompt.

    Args:
        context: The retrieved text to summarize.
        question: The question; it is placed on its own line before the context.
        model_name: Model identifier passed to the ``summarization`` pipeline.

    Returns:
        The ``summary_text`` of the first result produced by the pipeline.
    """
    summarizer = _get_summarizer(model_name)
    input_text = question + "\n" + context
    # NOTE(review): callers in this notebook pass several 1000-word chunks as
    # context and no truncation is requested here; this may exceed the model's
    # input limit - confirm whether truncation should be enabled.
    summary = summarizer(input_text, max_length=150, min_length=30, do_sample=False)
    return summary[0]['summary_text']

In [None]:
# Main function
def rag_summarize(zip_path, year_input, question, extra_stopwords=None):
    """Answer ``question`` from the debates of one year or a range of years.

    Steps: collect the documents for the requested years, split the joined
    text into overlapping word chunks, embed and index the chunks, retrieve
    the chunks closest to the question, then summarize them.

    Args:
        zip_path: Passed through unchanged to ``collect_documents_by_year``.
        year_input: Either a single ``int`` year or a
            ``(start_year, end_year)`` pair.
        question: The question used for retrieval and as the summary prompt.
        extra_stopwords: Accepted for backward compatibility; this function
            does not use it. Defaults to ``None`` rather than a shared
            mutable list.

    Returns:
        The summary text returned by ``summarize_with_model``.

    Raises:
        ValueError: If no text is found for the requested years.
    """
    # Normalise the year argument to an inclusive (start, end) pair.
    if isinstance(year_input, int):
        start_year, end_year = year_input, year_input
    else:
        start_year, end_year = year_input

    # NOTE(review): collect_documents_by_year is not defined in the visible
    # cells; it is assumed to return a mapping whose values are document
    # strings - confirm it is defined before this function is called.
    year_docs = collect_documents_by_year(zip_path, start_year, end_year)
    text = " ".join(year_docs.values())

    # Chunk and embed
    chunks = chunk_text(text)
    if not chunks:
        # Fail with a clear message instead of an opaque error from the
        # embedding/indexing step further down.
        raise ValueError(
            f"no debate text found for years {start_year}-{end_year} in {zip_path!r}"
        )
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    index, _ = build_faiss_index(chunks, embedder)

    # Retrieve. Never ask for more neighbours than there are chunks: FAISS
    # pads missing results with -1, which would index the last chunk.
    top_k = min(5, len(chunks))
    relevant_chunks = retrieve_relevant_chunks(question, chunks, embedder, index, top_k=top_k)
    context = " ".join(relevant_chunks)

    # Summarize/QA
    return summarize_with_model(context, question)

In [None]:

# Example usage:
# answer = rag_summarize('/content/drive/MyDrive/debates.zip', (1919, 1920), "What were the main concerns about war in parliament?")
# print(answer)