# ATA Analyzer: Semantic RAG for Legislative Transcripts

This notebook extracts, structures, and indexes legislative transcripts for semantic querying using embeddings and FAISS.

## 1. Setup and Imports

In [1]:
import os
import re

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## 2. ATA Processor

Extracts speeches and metadata from the markdown transcript.

In [2]:
class ATAProcessor:
    @staticmethod
    def parse_md(md_path):
        if not os.path.exists(md_path):
            raise FileNotFoundError(f"File {md_path} not found.")

        with open(md_path, encoding="utf-8") as f:
            lines = f.readlines()

        data = []
        current_section = "Unknown"

        speaker_pattern = re.compile(r"^(O SR\.|A SRA\.)\s+(.+?)\s*-\s*(.*)", re.DOTALL)
        section_pattern = re.compile(r"^[A-ZÀ-Ú\s]+$")
        page_marker_pattern = re.compile(
            r"^(\d+/\d+|\x0c?Sessão de.*|Notas Taquigráficas|CÂMARA DOS DEPUTADOS|DEPARTAMENTO DE.*|.*SESSÃO LEGISLATIVA.*|.*SESSÃO.*|^\d+$)",
            re.IGNORECASE,
        )

        current_speaker_info = None
        current_speech = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            if page_marker_pattern.match(line):
                continue

            if (
                section_pattern.match(line)
                and len(line) > 5
                and not speaker_pattern.match(line)
            ):
                if current_speaker_info:
                    data.append(
                        {
                            "section": current_section,
                            "speaker": current_speaker_info["name"],
                            "metadata": current_speaker_info["meta"],
                            "text": " ".join(current_speech).strip(),
                        }
                    )
                    current_speaker_info = None
                    current_speech = []
                current_section = line
                continue

            match = speaker_pattern.match(line)
            if match:
                if current_speaker_info:
                    data.append(
                        {
                            "section": current_section,
                            "speaker": current_speaker_info["name"],
                            "metadata": current_speaker_info["meta"],
                            "text": " ".join(current_speech).strip(),
                        }
                    )

                prefix, info, first_part = match.groups()
                name_meta = re.search(r"\((.+?)\)", info)
                if name_meta:
                    metadata = name_meta.group(1)
                    speaker_name = info.split("(")[0].strip()
                else:
                    speaker_name = info
                    metadata = ""

                current_speaker_info = {"name": speaker_name, "meta": metadata}
                current_speech = [first_part]
            else:
                if current_speaker_info:
                    current_speech.append(line)

        if current_speaker_info:
            data.append(
                {
                    "section": current_section,
                    "speaker": current_speaker_info["name"],
                    "metadata": current_speaker_info["meta"],
                    "text": " ".join(current_speech).strip(),
                }
            )

        return pd.DataFrame(data)

## 3. RAG Engine

Handles embedding generation and semantic indexing.

In [3]:
class RAGEngine:
    def __init__(self, model_name="paraphrase-multilingual-MiniLM-L12-v2"):
        print(f"Loading model: {model_name}...")
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.df = None

    def ingest(self, df):
        print(f"Ingesting {len(df)} speech segments...")
        self.df = df
        texts = df["text"].tolist()
        embeddings = self.model.encode(texts, show_progress_bar=True)

        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(np.array(embeddings).astype("float32"))
        print("Indexing complete.")

    def query(self, text, k=5):
        if self.index is None:
            raise ValueError("Index is empty. Call ingest() first.")

        query_vector = self.model.encode([text])
        distances, indices = self.index.search(
            np.array(query_vector).astype("float32"), k
        )

        results = self.df.iloc[indices[0]].copy()
        results["distance"] = distances[0]
        return results

## 4. Run Analysis

In [4]:
md_file = "resources/ata.md"
processor = ATAProcessor()
df_ata = processor.parse_md(md_file)

engine = RAGEngine()
engine.ingest(df_ata)

# Example query
query = "Quais as principais preocupações sobre o garimpo ilegal no Amazonas?"
results = engine.query(query, k=3)
results

Loading model: paraphrase-multilingual-MiniLM-L12-v2...


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/modules.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x721f587e9430>: Failed to resolve \'huggingface.co\' ([Errno -2] Name or service not known)"))'), '(Request ID: e37a8d29-1433-423c-8049-ebbe993191a1)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/modules.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x721f587e9b20>: Failed to resolve \'huggingface.co\' ([Errno -2] Name or service not known)"))'), '(Request ID: 05b635b6-8338-466b-b9c7-64c

Ingesting 727 speech segments...


Batches: 100%|██████████| 23/23 [00:01<00:00, 12.01it/s]


Indexing complete.


Unnamed: 0,section,speaker,metadata,text,distance
93,ORDEM DO DIA,PRESIDENTE (Hugo Motta. Bloco/REPUBLICANOS,,PB) - Passo a palavra ao Deputado Capitão Albe...,15.997702
19,BREVES COMUNICAÇÕES,RAFAEL FERA (Bloco/PODE,,"RO. Sem revisão do orador.) - Sr. Presidente, ...",19.349174
223,ORDEM DO DIA,CABO GILBERTO SILVA (Bloco/PL,,PB. Pela ordem. Sem revisão do orador.) - Popu...,20.403336
