In [1]:
import re
import nltk
import numpy as np
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup
from typing import List, Tuple
from nltk.tokenize import sent_tokenize
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict


Import Datasets

In [2]:
ds1 = load_dataset("ccdv/govreport-summarization")
ds2 = load_dataset("FiscalNote/billsum")

# GovReport splits (report, summary)
gov_train = ds1["train"].rename_columns({"report": "text"})
gov_val   = ds1["validation"].rename_columns({"report": "text"})
gov_test  = ds1["test"].rename_columns({"report": "text"})

# BillSum splits (text, summary, title)
bill_train = ds2["train"]
bill_test  = ds2["test"]
bill_ca_test = ds2["ca_test"]  # treat as out of domain test for model generalisation


README.md: 0.00B [00:00, ?B/s]

document/train-00000-of-00002.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

document/train-00001-of-00002.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

document/validation-00000-of-00001.parqu(…):   0%|          | 0.00/26.1M [00:00<?, ?B/s]

document/test-00000-of-00001.parquet:   0%|          | 0.00/24.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17517 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/973 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/973 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

data/ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Data Cleaning

In [3]:
# remove boilerplate for text

header_irrelevant = [
    r"^Page\s+\d+(\s+of\s+\d+)?\s*$", # pages e.g. Page 3
    r"^\d+\s*$", # numbers e.g. 13
    r"^–\s*\d+\s*–$", # e.g. - 12 -
    r"^\s*U\.S\. Government Accountability Office.*$", # common report headers
    r"^\s*Congressional Research Service.*$",
    r"^\s*Congressional Budget Office.*$",
    r"^\s*GAO-\d{2}-\d+\s*$", # report ids
    r"^\s*For Official Use Only\s*$",
    r"^\s*This report was prepared by.*$",
]

toc_irrelevant = [
    r"^\s*Table of Contents\s*$",
    r"\.{2,}\s*\d+\s*$", # dotted lines with page numbers
]

# remove excessive whitespaces, line breaks
def remove_whitespaces(text):
    text = re.sub(r'\r\n', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text) # multiple spaces into single space
    text = re.sub(r'\n{3,}', '\n\n', text) # 3+ consecutive newlines into two to keep paragraph breaks
    return text.strip() # drop leading/trailing whitespaces

def remove_boilerplate(text):
    if not text or not isinstance(text, str):
        return text

    # standardize unicode/punctuation for tokenizer consistency
    text = unicodedata.normalize("NFKC", text)

    lines = text.splitlines()
    cleaned = []

    inside_toc = False
    for line in lines:
        line_stripped = line.strip()

        if any(re.match(i, line_stripped) for i in header_irrelevant):
            continue

        if any(re.match(i, line_stripped) for i in toc_irrelevant):
            inside_toc = True
            continue

        if inside_toc:
            # skip TOC lines till normal paragraph appears
            if re.search(r"\.{2,}\s*\d+\s*$", line_stripped):
                continue
            inside_toc = False

        cleaned.append(line)

    cleaned_text = "\n".join(cleaned)
    cleaned_text = remove_whitespaces(cleaned_text)
    # remove rule lines ------ / =====
    cleaned_text = re.sub(r"(?:^|\n)[\-=]{4,}(?:\n|$)", "\n", cleaned_text)
    # remove HTML tags
    tags = re.compile(r"<[^>]+>|&[a-zA-Z]+;")
    if isinstance(cleaned_text, str) and tags.search(cleaned_text):
        cleaned_text = BeautifulSoup(cleaned_text, "html.parser").get_text(separator=" ")

    return cleaned_text

gov_train = gov_train.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})
gov_val = gov_val.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})
gov_test = gov_test.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})

bill_train = bill_train.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})
bill_test = bill_test.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})
bill_ca_test = bill_ca_test.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})


Map:   0%|          | 0/17517 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/18949 [00:00<?, ? examples/s]

Map:   0%|          | 0/3269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [4]:
# remove minimal whitespaces for summary
def clean_summary(text):
    if not isinstance(text, str):
        return text
    text = re.sub(r'\s+', ' ', text).strip()
    return text

gov_train = gov_train.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})
gov_val   = gov_val.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})
gov_test  = gov_test.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})

bill_train = bill_train.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})
bill_test  = bill_test.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})
bill_ca_test   = bill_ca_test.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})


Map:   0%|          | 0/17517 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/18949 [00:00<?, ? examples/s]

Map:   0%|          | 0/3269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

Data Preprocessing

In [5]:
import nltk
from nltk.tokenize import sent_tokenize
import re

# Download punkt if not already available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

def safe_sent_split(text):
    """
    Safely split text into sentences with heading merging
    """
    if not isinstance(text, str) or not text.strip():
        return []

    try:
        sents = sent_tokenize(text)

        # Merge headings with next sentence
        output = []
        heading = None

        for s in sents:
            s = s.strip()
            if not s:
                continue

            # Check if this looks like a heading
            if (heading is None and
                len(s) <= 25 and
                (s.endswith(":") or re.match(r"^[A-Z][A-Za-z0-9 \-]{0,20}:?$", s))):
                heading = s
                continue

            # If we have a heading, merge it with current sentence
            if heading is not None:
                output.append(f"{heading} {s}".strip())
                heading = None
            else:
                output.append(s)

        # Add any remaining heading
        if heading is not None:
            output.append(heading)

        return output

    except Exception as e:
        print(f"Error in sentence splitting: {e}")
        # Fallback: simple split by periods
        return [s.strip() for s in text.split('.') if s.strip()]

# Alternative simpler approach without heading merging
def simple_sent_split(text):
    """Simple sentence splitting without heading logic"""
    if not isinstance(text, str) or not text.strip():
        return []

    try:
        return [s.strip() for s in sent_tokenize(text) if s.strip()]
    except:
        return [s.strip() for s in text.split('.') if s.strip()]

# Apply to datasets using the simpler approach
gov_train = gov_train.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)
gov_val = gov_val.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)
gov_test = gov_test.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)

bill_train = bill_train.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)
bill_test = bill_test.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)
bill_ca_test = bill_ca_test.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)

Map:   0%|          | 0/17517 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/18949 [00:00<?, ? examples/s]

Map:   0%|          | 0/3269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [6]:
# dataset splits for modeling
split = bill_test.train_test_split(test_size=0.5, seed=42)
bill_val = split["train"]
bill_test = split["test"]

# Combined
comb_train = concatenate_datasets([gov_train, bill_train.select_columns(['text', 'summary', 'clean_text', 'clean_summary', 'sentences'])])
comb_val = concatenate_datasets([gov_val, bill_val.select_columns(['text', 'summary', 'clean_text', 'clean_summary', 'sentences'])])
comb_test = concatenate_datasets([gov_test, bill_test.select_columns(['text', 'summary', 'clean_text', 'clean_summary', 'sentences'])])
# add source column
comb_train = comb_train.add_column("source",["govreport"] * len(gov_train) + ["billsum"] * len(bill_train))
comb_val = comb_val.add_column("source",["govreport"] * len(gov_val) + ["billsum"] * len(bill_val))
comb_test = comb_test.add_column("source",["govreport"] * len(gov_test) + ["billsum"] * len(bill_test))
# shuffle to avoid bias
comb_train = comb_train.shuffle(seed=42)
comb_val = comb_val.shuffle(seed=42)
comb_test = comb_test.shuffle(seed=42)

Flattening the indices:   0%|          | 0/2607 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/2608 [00:00<?, ? examples/s]

Modeling

Extractive Baseline: TF-IDF Cosine Similarity

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset

def extractive_summary(sentences: list, top_n: int = 10) -> str:
    """Return extractive summary using sentence similarity"""
    if not sentences:
        return ""
    if len(sentences) <= top_n:
        return " ".join(sentences)

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)

    # Cosine similarity matrix
    sim_matrix = cosine_similarity(X, X)

    # Sentence scores: sum of similarities
    scores = sim_matrix.sum(axis=1)

    # Pick top_n sentences
    top_indices = np.argsort(scores)[-top_n:][::-1]

    # Return sentences in original order
    summary = [sentences[i] for i in sorted(top_indices)]
    return " ".join(summary)

# Add extractive summaries to dataset
def add_extractive_predictions(dataset: Dataset, top_n: int = 10) -> Dataset:
    summaries = [extractive_summary(ex["sentences"], top_n=top_n) for ex in dataset]
    return dataset.add_column("extractive_summary", summaries)

# Example: test on small subset
#comb_val_small = comb_val.select(range(100))  # first 100 for quick run
#comb_val_small = add_extractive_predictions(comb_val_small, top_n=16)
#comb_val_small = add_extractive_predictions(comb_val, top_n=16)
comb_val_small = add_extractive_predictions(comb_test, top_n=16)

# ROUGE Evaluation
import evaluate

rouge = evaluate.load("rouge")

preds = comb_val_small["extractive_summary"]
refs  = comb_val_small["clean_summary"]

results = rouge.compute(predictions=preds, references=refs)
print("Initial Extractive ROUGE Scores:", results)


Flattening the indices:   0%|          | 0/2608 [00:00<?, ? examples/s]

Initial Extractive ROUGE Scores: {'rouge1': np.float64(0.37631008849641945), 'rouge2': np.float64(0.1872128193565502), 'rougeL': np.float64(0.21576787127955505), 'rougeLsum': np.float64(0.2329480630700775)}


Abstractive Summarization Model: BART

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch

model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

demo_train = comb_train.select(range(500))
demo_val = comb_val.select(range(100))
# Max lengths for legal docs (you can increase if needed)
max_input_length = 512
max_target_length = 64

def preprocess(batch):
    # Tokenize input legal text
    inputs = tokenizer(batch["clean_text"], truncation=True, padding="max_length", max_length=max_input_length)
    # Tokenize summaries
    targets = tokenizer(batch["clean_summary"], truncation=True, padding="max_length", max_length=max_target_length)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply preprocessing
train_dataset = demo_train.map(preprocess, batched=True)
val_dataset = demo_val.map(preprocess, batched=True)



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [16]:
# Data collator handles padding dynamically
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments for Colab GPU
training_args = TrainingArguments(
    output_dir="./legal_summarizer",
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    logging_steps=50,
    eval_steps=100,
    save_steps=100,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True,                # Use mixed precision for speed
    push_to_hub=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start fine-tuning
trainer.train()


  trainer = Trainer(


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
100,2.5339,2.158924
200,2.3063,1.873816
300,1.3669,1.911637
400,1.4334,1.9001
500,1.356,1.832738
600,0.8699,2.000755
700,0.749,2.014638




TrainOutput(global_step=750, training_loss=1.540694585164388, metrics={'train_runtime': 20885.176, 'train_samples_per_second': 0.072, 'train_steps_per_second': 0.036, 'total_flos': 1160934064128000.0, 'train_loss': 1.540694585164388, 'epoch': 3.0})

In [19]:
from huggingface_hub import login

# Paste your HF token here
login()

model.push_to_hub("legal-summarizer-distilbart")
tokenizer.push_to_hub("legal-summarizer-distilbart")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ddxczlc/model.safetensors:   0%|          |  131kB / 1.22GB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AtharvaKirk/legal-summarizer-distilbart/commit/7d4c3ebcbf6ac0c238583fd9be32023e4cfe5670', commit_message='Upload tokenizer', commit_description='', oid='7d4c3ebcbf6ac0c238583fd9be32023e4cfe5670', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AtharvaKirk/legal-summarizer-distilbart', endpoint='https://huggingface.co', repo_type='model', repo_id='AtharvaKirk/legal-summarizer-distilbart'), pr_revision=None, pr_num=None)

In [None]:
# load fine-tuned model for evaluation
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_name = "AtharvaKirk/legal-summarizer-distilbart"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

2025-11-28 04:52:25.333330: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764305545.531793      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764305545.580495      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

def token_chunks_split(text, max_tokens=480):
    if not isinstance(text, str) or not text.strip():
        return []
    sentences = sent_tokenize(text)
    chunks = []
    current_text = ""
    current_len = 0

    for sent in sentences:
        tokens = tokenizer(sent, add_special_tokens=False, return_attention_mask=False, return_tensors=None,)
        sent_len = len(tokens["input_ids"])
        if current_text and current_len + sent_len > max_tokens:
            chunks.append(current_text.strip())
            current_text = sent
            current_len = sent_len
        else:
            if current_text:
                current_text += " " + sent
            else:
                current_text = sent
            current_len += sent_len
    if current_text:
        chunks.append(current_text.strip())
    return chunks


In [None]:
def summarize_chunk(text,max_input_length=512,min_target_length=40,max_target_length=80,num_beams=4,):
    if not isinstance(text, str) or not text.strip():
        return ""
    inputs = tokenizer(text,return_tensors="pt",truncation=True,max_length=max_input_length,).to(device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            num_beams=num_beams,
            length_penalty=0.9,
            min_length=min_target_length,
            max_length=max_target_length,
            no_repeat_ngram_size=4,
            early_stopping=True,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [None]:
def get_summary(text,chunk_max_tokens=480,chunk_min_len=150,chunk_max_len=250,final_min_len=180,final_max_len=260,num_beams=6,max_chunks=20):
    chunks = token_chunks_split(text, max_tokens=chunk_max_tokens)
    if max_chunks is not None and len(chunks) > max_chunks:
        chunks = chunks[:max_chunks]
    if not chunks:
        return ""
    if len(chunks) == 1:
        return summarize_chunk(chunks[0],min_target_length=final_min_len,max_target_length=final_max_len,num_beams=num_beams,)

    chunk_summaries = []
    for ch in chunks:
        s = summarize_chunk(ch,min_target_length=chunk_min_len,max_target_length=chunk_max_len,num_beams=num_beams,)
        chunk_summaries.append(s)

    #meta_input = "\n".join(chunk_summaries)
    meta_input = "\n".join(chunk_summaries[-10:]) # more weighted to last chunks

    final_summary = summarize_chunk(meta_input,min_target_length=final_min_len,max_target_length=final_max_len,num_beams=num_beams,)
    return final_summary


In [None]:
# test example
idx = 0
example = comb_test[idx]
summary = get_summary(example["clean_text"],chunk_max_tokens=480,chunk_min_len=150,chunk_max_len=250,final_min_len=200,final_max_len=350,num_beams=6,max_chunks=20)
print(summary)


HIERARCHICAL SUMMARY:
 The Trump Administration has consistently pursued the deployment of fencing, walls, and other barriers along the U.S.-Mexico border as a high priority. On April 4, 2018, the President, citing "a drastic surge of activity" on the southern border, directed the Secretary of Defense, the Attorney General, and the Secretary of Homeland Security to coordinate action on securing the border "to stop the flow of deadly drugs and other contraband, gang members and other criminals, and illegal aliens" The President also directed DOD to mobilize the National Guard to support DHS at the border and to develop a plan for tapping additional military resources using executive authorities. Later that year, as part of budget negotiations, the Administration authorized $3.6 billion in defense funds authorized under (emergency authority of Title 10 U.S. Customs and Border Protection). GAO was asked to provide additional assistance to DHS with respect to the construction of additional

In [None]:
from tqdm.auto import tqdm

def data_summaries(dataset, max_samples=100):
    preds, refs = [], []
    n = min(max_samples, len(dataset))
    for i in tqdm(range(n)):
        ex = dataset[i]
        text = ex["clean_text"]
        ref  = ex["clean_summary"]
        pred = get_summary(text,chunk_max_tokens=480,chunk_min_len=150,chunk_max_len=250,final_min_len=200,final_max_len=350,num_beams=6,max_chunks=20)
        preds.append(pred)
        refs.append(ref)
    return preds, refs
preds_h, refs_h = data_summaries(comb_val, max_samples=100)


  0%|          | 0/100 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1682 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
import evaluate

rouge = evaluate.load("rouge")
results = rouge.compute(predictions=preds_h, references=refs_h)
results


Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': 0.40748489062516313,
 'rouge2': 0.20506921379162757,
 'rougeL': 0.2778156587164773,
 'rougeLsum': 0.278403223219837}

In [None]:
# demo interface
import gradio as gr

def index_summary(idx):
    idx = int(idx)
    ex = comb_test[idx]
    return get_summary(ex["clean_text"]), ex["clean_text"]

iface = gr.Interface(
    fn=index_summary,
    inputs=gr.Number(label="Index in data"),
    outputs=[gr.Textbox(label="Summary"), gr.Textbox(label="Original Document"),],
    title="Law Policy Summarizer"
)
iface.launch()


* Running on local URL:  http://127.0.0.1:7868
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://d9e5236ce8b78e7de6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




QA Retrieval

In [20]:
!pip install sentence-transformers faiss-cpu gradio


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0


In [17]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load pretrained embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all cleaned summaries from training set
summary_texts = comb_train["clean_summary"]
summary_embeddings = embed_model.encode(summary_texts, convert_to_numpy=True, show_progress_bar=True)

# Create FAISS index
embedding_dim = summary_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(summary_embeddings)

print(f"FAISS index ready with {index.ntotal} vectors.")


ModuleNotFoundError: No module named 'faiss'

In [None]:
import torch

def answer_question(query, top_k=3):
    # Embed the query
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    # Retrieve top-k similar summaries
    D, I = index.search(q_emb, k=top_k)
    retrieved_texts = " ".join([summary_texts[i] for i in I[0]])

    # Tokenize retrieved context
    inputs = tokenizer(retrieved_texts, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    # Generate answer
    summary_ids = model.generate(**inputs, max_length=64)
    answer = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return answer


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load pretrained embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all cleaned summaries from training set
summary_texts = comb_train["clean_summary"]
summary_embeddings = embed_model.encode(summary_texts, convert_to_numpy=True, show_progress_bar=True)

# Create FAISS index
embedding_dim = summary_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(summary_embeddings)

print(f"FAISS index ready with {index.ntotal} vectors.")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1140 [00:00<?, ?it/s]

In [16]:
import gradio as gr

iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=2, placeholder="Ask a question about the law..."),
    outputs=gr.Textbox(),
    title="Legal QA Chatbot",
    description="Ask questions based on the summarized legal documents"
)

iface.launch(share=True)


NameError: name 'answer_question' is not defined