In [1]:
from datasets import load_dataset
import pandas as pd

billsum = load_dataset("billsum")

billsum_train = billsum["train"]
billsum_test_us = billsum["test"]
billsum_test_ca = billsum["ca_test"]

gov = load_dataset("ccdv/govreport-summarization")

gov_train = gov["train"]
gov_test = gov["test"]
gov_validation = gov["validation"]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
billsum_train[0].keys()

dict_keys(['text', 'summary', 'title'])

In [3]:
gov_train[0].keys()

dict_keys(['report', 'summary'])

In [4]:
billsum_test_ca[0].keys()

dict_keys(['text', 'summary', 'title'])

In [5]:
def standardize_billsum(ds, split_name, jurisdiction):
    df = ds.to_pandas()
    df = df.rename(columns={
        "text": "source_text",
        "summary": "target_summary",
        "title": "title"
    })
    df = df.reset_index(drop=True)
    df['doc_id'] = 'billsum_' + split_name + '_' + df.index.astype(str)
    df['jurisdiction'] = jurisdiction
    df['split'] = split_name
    df['dataset'] = 'billsum'

    return df

def standardize_gov(ds, split_name):
    df = ds.to_pandas()
    df = df.rename(columns={
        "report": "source_text",
        "summary": "target_summary"
    })
    df = df.reset_index(drop=True)
    df['doc_id'] = 'govreport_' + split_name + '_' + df.index.astype(str)
    df['title'] = None
    df['jurisdiction'] = 'us_federal'
    df['split'] = split_name
    df['dataset'] = 'govreport'

    return df

bs_train_df = standardize_billsum(billsum_train, 'train', 'US')
bs_test_us_df = standardize_billsum(billsum_test_us, 'test_us', 'US')
bs_test_ca_df = standardize_billsum(billsum_test_ca, 'test_ca', 'CA')
gov_train_df = standardize_gov(gov_train, 'train')
gov_test_df = standardize_gov(gov_test, 'test')
gov_validation_df = standardize_gov(gov_validation, 'validation')

all_data_df = pd.concat([
    bs_train_df,
    bs_test_us_df,
    bs_test_ca_df,
    gov_train_df,
    gov_test_df,
    gov_validation_df,
], ignore_index=True)

print(all_data_df.columns)

Index(['source_text', 'target_summary', 'title', 'doc_id', 'jurisdiction',
       'split', 'dataset'],
      dtype='object')


In [6]:
all_data_df[all_data_df['doc_id'].str.endswith('_0')]['doc_id']

0               billsum_train_0
18949         billsum_test_us_0
22218         billsum_test_ca_0
23455         govreport_train_0
40972          govreport_test_0
41945    govreport_validation_0
Name: doc_id, dtype: object

In [7]:
len(all_data_df), all_data_df["doc_id"].nunique()

(42918, 42918)

In [8]:
all_data_df['text_len'] = all_data_df['source_text'].str.len()
all_data_df['summary_len'] = all_data_df['target_summary'].str.len()

def compute_length_stats(group):
    return pd.Series({
        'n_docs': len(group),
        'text_len_mean': group['text_len'].mean(),
        'text_len_p90': group['text_len'].quantile(0.9),
        'text_len_max': group['text_len'].max(),
        'sum_len_mean': group['summary_len'].mean(),
        'sum_len_p90': group['summary_len'].quantile(0.9),
        'sum_len_max': group['summary_len'].max(),
    })

stats_by_dataset = all_data_df.groupby(['dataset', 'split']).apply(compute_length_stats).reset_index()

  stats_by_dataset = all_data_df.groupby(['dataset', 'split']).apply(compute_length_stats).reset_index()


In [9]:
stats_by_dataset

Unnamed: 0,dataset,split,n_docs,text_len_mean,text_len_p90,text_len_max,sum_len_mean,sum_len_p90,sum_len_max
0,billsum,test_ca,1237.0,9729.518189,15925.4,19998.0,2168.582862,3681.4,21249.0
1,billsum,test_us,3269.0,10268.095442,16853.4,19998.0,1184.728357,2165.2,4986.0
2,billsum,train,18949.0,10272.542931,16775.0,19998.0,1185.639981,2151.0,4995.0
3,govreport,test,973.0,49016.432682,85629.2,193646.0,3851.485098,4788.6,9722.0
4,govreport,train,17517.0,51072.65582,91715.8,1323870.0,3202.116687,4651.0,13652.0
5,govreport,validation,973.0,53380.00925,93341.2,348468.0,3832.309353,4818.2,9231.0


In [10]:
from transformers import AutoTokenizer

t5_tokenizer = AutoTokenizer.from_pretrained("t5-base")

led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

#### No need to run the following code again

In [11]:
def chunk_text(text, tokenizer, max_tokens, overlap):
    enc = tokenizer(text, truncation=False, padding = False, return_attention_mask=False,)
    input_ids = enc['input_ids']

    chunks = []
    start = 0
    while start < len(input_ids):
        end = start + max_tokens
        chunk_ids = input_ids[start:end]
        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True,)

        chunks.append({
            'chunk_text': chunk_text,
            'start_token': start,
            'end_token': min(end, len(input_ids)),
        })

        if end >= len(input_ids):
            break

        start = end - overlap
    
    return chunks

def make_chunk_df(df, tokenizer, max_tokens, overlap):
    records = []
    for _, row in df.iterrows():
        doc_chunks = chunk_text(row['source_text'], tokenizer, max_tokens, overlap)

        for cid, ch in enumerate(doc_chunks):
            records.append({
                'doc_id': row['doc_id'],
                'dataset': row['dataset'],
                'split': row['split'],
                "jurisdiction": row["jurisdiction"],
                "title": row["title"],
                "chunk_id": cid,
                "chunk_text": ch["chunk_text"],
                "start_token": ch["start_token"],
                "end_token": ch["end_token"],
                "target_summary": row["target_summary"],
            })
    return pd.DataFrame(records)

In [34]:
billsum_docs = all_data_df[all_data_df["dataset"] == "billsum"].copy()
govreport_docs = all_data_df[all_data_df["dataset"] == "govreport"].copy()

billsum_chunks = make_chunk_df(
    billsum_docs,
    tokenizer=t5_tokenizer,
    max_tokens=512,
    overlap=64,
)

govreport_chunks = make_chunk_df(
    govreport_docs,
    tokenizer=led_tokenizer,
    max_tokens=1024,
    overlap=128,
)


print("BillSum chunks:", len(billsum_chunks))
print("GovReport chunks:", len(govreport_chunks))
billsum_chunks.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (19499 > 16384). Running this sequence through the model will result in indexing errors


BillSum chunks: 112176
GovReport chunks: 215783


Unnamed: 0,doc_id,dataset,split,jurisdiction,title,chunk_id,chunk_text,start_token,end_token,target_summary
0,billsum_train_0,billsum,train,US,A bill to limit the civil liability of busines...,0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,0,512,Shields a business entity from civil liability...
1,billsum_train_0,billsum,train,US,A bill to limit the civil liability of busines...,1,such facility by a nonprofit organization if--...,448,960,Shields a business entity from civil liability...
2,billsum_train_0,billsum,train,US,A bill to limit the civil liability of busines...,2,applicability.--This Act shall not apply to an...,896,983,Shields a business entity from civil liability...
3,billsum_train_1,billsum,train,US,Human Rights Information Act,0,SECTION 1. SHORT TITLE. This Act may be cited ...,0,512,Human Rights Information Act - Requires certai...
4,billsum_train_1,billsum,train,US,Human Rights Information Act,1,in foreign countries. These efforts are thwart...,448,960,Human Rights Information Act - Requires certai...


In [35]:
billsum_chunks.to_parquet("billsum_chunks.parquet")
govreport_chunks.to_parquet("govreport_chunks.parquet")

#### Start here again

In [11]:
billsum_chunks = pd.read_parquet("billsum_chunks.parquet")
govreport_chunks = pd.read_parquet("govreport_chunks.parquet")

In [12]:
govreport_chunks.shape

(215783, 10)

#### base testing

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

billsum_chunks = billsum_chunks.reset_index(drop=True)

tfidf_vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),
)

chunk_tfidf = tfidf_vectorizer.fit_transform(billsum_chunks['chunk_text'])

def retrieve_billsum_chunks(query, top_k=5):
    query_tfidf = tfidf_vectorizer.transform([query])
    similarities = cosine_similarity(query_tfidf, chunk_tfidf)[0]
    top_indices = similarities.argsort()[::-1][:top_k]

    results = billsum_chunks.iloc[top_indices].copy()
    results['score'] = similarities[top_indices]
    return results


retrieve_billsum_chunks("liability of business entities providing facilities to nonprofits", top_k=3)[
    ['doc_id', 'chunk_id', 'score', 'chunk_text']
]

Unnamed: 0,doc_id,chunk_id,score,chunk_text
45808,billsum_train_9579,1,0.332038,"welfare, or health purposes. (10) Person.--The..."
29181,billsum_train_6083,1,0.292353,", the Northern Mariana Islands, any other terr..."
47661,billsum_train_9973,4,0.203548,iii) Exception for corporations electing reduc...


#### No need to run later cells - base testing extended

In [42]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

rag_tokenizer = AutoTokenizer.from_pretrained("t5-base")
rag_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

def answer_query_billsum(query, top_k=5, max_input_tokens=512, max_output_tokens=128):
    retrieved = retrieve_billsum_chunks(query, top_k=top_k)
    context = '\n\n'.join(retrieved['chunk_text'].tolist())

    prompt = prompt = (
        "You are a legal assistant. Answer the question **only** using the context.\n"
        "If the answer is a set of conditions, list **all** conditions clearly.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )

    inputs = rag_tokenizer(
        prompt, 
        return_tensors='pt',
        truncation=True,
        max_length=max_input_tokens,
    )

    outputs = rag_model.generate(
        **inputs,
        max_length=max_output_tokens,
        num_beams=4,
    )

    answer = rag_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer, retrieved


ans, ctx = answer_query_billsum(
    "What are the regulations regarding liability of business entities providing facilities to nonprofits?",
    top_k=7,
    max_input_tokens=512,
    max_output_tokens=200,
)

print("Answer:", ans)

Answer: (A) In general.--Subject to subsection (b), a business entity shall not be subject to civil liability relating to any injury or death that results from the use of equipment donated by the business entity to a nonprofit organization.


In [44]:
ans, _ = answer_query_billsum(
    'When is a business entity shielded from liability when providing facilities to nonprofit organizations?',
    top_k=2,
    max_input_tokens=512,
    max_output_tokens=200,
)
print("Answer:", ans)

Answer: , the Northern Mariana Islands, any other State, territory, or possession of the United States, or any political subdivision of any such State, territory, or possession


In [49]:
from transformers import AutoModelForQuestionAnswering, pipeline

qa_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

qa_pipeline = pipeline(
    "question-answering",
    model=qa_model,
    tokenizer=qa_tokenizer,
)

def answer_qa_billsum(query, top_k=5, window_size=300):
    retrieved = retrieve_billsum_chunks(query, top_k=top_k)
    

    best = None
    best_score = float('-inf')
    best_row = None
    
    for _, row in retrieved.iterrows():
        context = row['chunk_text']
        result = qa_pipeline(question=query, context=context)

        if result['score'] > best_score:
            best = result
            best_score = result['score']
            best_row = row
    if best is None:
        return None

    chunk_text = best_row['chunk_text']
    start_char = max(0, best['start'] - window_size)
    end_char = min(len(chunk_text), best['end'] + window_size)
    window = chunk_text[start_char:end_char]
    return {
        'question': query,
        'answer': best['answer'],
        'score': best['score'],
        'start': best['start'],
        'end': best['end'],
        'source_doc_id': best_row['doc_id'],
        'source_chunk_id': best_row['chunk_id'],
        'clause_context': window,
        'full_chunk_text': chunk_text,
    }



def rewrite_clause(clause, question=None, max_input_tokens=512, max_output_tokens=256):
    if question is not None:
        prompt = (
            "You are a legal assistant. Based on the clause, answer the question "
            "in clear, plain English."
            "Preserve all conditions; do not add or remove any.\n\n"
            f"Question:\n{question}\n\n"
            f"Clause:\n{clause}\n\n"
            "Rewritten Clause:"
        )
    else:
        prompt = (
            "You are a legal assistant. Rewrite the following legal clause in clear, plain English. "
            "Preserve all legal conditions and do not add or remove any requirements.\n\n"
            f"{clause}\n\n"
            "Plain English version:\n"
        )

    inputs = t5_tokenizer(
        prompt, 
        return_tensors='pt',
        truncation=True,
        max_length=max_input_tokens,
    )

    outputs = rag_model.generate(
        **inputs,
        max_length=max_output_tokens,
        num_beams=4,
    )

    rewritten = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return rewritten

def explain_clause(query, top_k=3, window_size=300):
    loc = answer_qa_billsum(query, top_k=top_k, window_size=window_size)
    if loc is None:
        print("No answer found.")
        return
    clause = loc['clause_context']
    rewritten = rewrite_clause(clause, question=query)

    print("Original Clause:\n", clause)
    print("\nRewritten Clause:\n", rewritten)
    print("\nSource Document ID:", loc['source_doc_id'])
    print("Source Chunk ID:", loc['source_chunk_id'])
    print("Answer Span:", loc['answer'])

explain_clause("What are the regulations regarding liability of business entities providing facilities to nonprofits?")

Device set to use cuda:0


Original Clause:
 .--Subject to subsection (c), a business entity shall not be subject to civil liability relating to any injury or death that results from the use of equipment donated by a business entity to a nonprofit organization. (B) Application.--This paragraph shall apply with respect to civil liability under Federal and State law. (2) Liability of business entities providing use of facilities to nonprofit organizations.-- (A) In general.--Subject to subsection (c), a business entity shall not be subject to civil liability relating to any injury or death occurring at a facility of the business entity in connection with a use

Rewritten Clause:
 False

Source Document ID: billsum_train_6083
Source Chunk ID: 1
Answer Span: Federal and State law


#### Run from here again

In [14]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("AtharvaKirk/legal-summarizer-distilbart")
model = AutoModelForSeq2SeqLM.from_pretrained("AtharvaKirk/legal-summarizer-distilbart")
# model.to("cuda")



In [15]:
model.to('cuda')
model.eval()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        

#### Testing with team's summarizer - doing base testing with tfidf first

In [16]:
bill_eval_df = bs_test_us_df.copy()

import nltk
import numpy as np
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize

def build_sentence_corpus(texts):
    sent_lists = []
    sent_flat = []

    for t in texts:
        sents = sent_tokenize(t)
        sent_lists.append(sents)
        sent_flat.extend(sents)
    return sent_lists, sent_flat

bill_sent_lists, bill_sent_flat = build_sentence_corpus(bill_eval_df['source_text'].tolist())

tfidf_sent = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),
    stop_words='english',
)

tfidf_sent_matrix = tfidf_sent.fit_transform(bill_sent_flat)

sent_idx = []
for doc_id, sents in enumerate(bill_sent_lists):
    for sent_id, _ in enumerate(sents):
        sent_idx.append((doc_id, sent_id))

sent_idx = np.array(sent_idx)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vaibh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
def get_base_summary(doc_idx, top_k=5):

    mask = (sent_idx[:,0] == doc_idx)
    sent_idx_global = np.where(mask)[0]

    if len(sent_idx_global) == 0:
        return ""
    
    doc_sent_matrix = tfidf_sent_matrix[sent_idx_global]

    doc_vec = doc_sent_matrix.mean(axis=0)
    doc_vec = np.asarray(doc_vec).reshape(1, -1)

    sims = cosine_similarity(doc_vec, doc_sent_matrix)[0]

    top_idx_local = sims.argsort()[::-1][:top_k]
    top_idx_local_sorted = sorted(top_idx_local)

    sents = bill_sent_lists[doc_idx]
    chosen_sents = [sents[i] for i in top_idx_local_sorted]

    return ' '.join(chosen_sents)

In [18]:
import evaluate
rouge = evaluate.load("rouge")

def evaluate_base_rouge(df, n_samples=200):
    preds = []
    refs = []

    for i in range(min(n_samples, len(df))):
        pred = get_base_summary(i, top_k=5)
        ref = df.iloc[i]['target_summary']
        preds.append(pred)
        refs.append(ref)
    
    scores = rouge.compute(predictions=preds, references=refs)

    return scores
    # return {
    #     'rouge1': scores['rouge1'].mid.fmeasure,
    #     'rouge2': scores['rouge2'].mid.fmeasure,
    #     'rougeL': scores['rougeL'].mid.fmeasure,
    #     'rougeLsum': scores['rougeLsum'].mid.fmeasure,
    # }

tfidf_rouge = evaluate_base_rouge(bill_eval_df, n_samples=200)
print("TF-IDF Baseline ROUGE Scores:", tfidf_rouge)

TF-IDF Baseline ROUGE Scores: {'rouge1': np.float64(0.27571495143857105), 'rouge2': np.float64(0.12847010696246908), 'rougeL': np.float64(0.18017979873291967), 'rougeLsum': np.float64(0.2280725616872361)}


In [20]:
import torch
import evaluate
rouge = evaluate.load("rouge")
def get_abstractive_summary(text, max_input_tokens=1024, max_output_tokens=256):
    inputs = tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=max_input_tokens,
    ).to('cuda')
    # with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=max_output_tokens,
        num_beams=4,
        early_stopping=True,
        length_penalty=1.0,
    )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

def evaluate_abstractive_rouge(df, n_samples=200):
    preds = []
    refs = []

    for i in range(min(n_samples, len(df))):
        text = df.iloc[i]['source_text']
        ref = df.iloc[i]['target_summary']
        pred = get_abstractive_summary(text, max_input_tokens=1024, max_output_tokens=256)
        preds.append(pred)
        refs.append(ref)
    scores = rouge.compute(predictions=preds, references=refs)
    return scores
## not running this cell to save time
# abstractive_rouge = evaluate_abstractive_rouge(bill_eval_df, n_samples=200)
# print("Abstractive Model ROUGE Scores:", abstractive_rouge)

In [27]:
bs_train_df['source_text'].iloc[1]

"SECTION 1. SHORT TITLE.\n\n    This Act may be cited as the ``Human Rights Information Act''.\n\nSEC. 2. FINDINGS.\n\n    Congress finds the following:\n            (1) The people of the United States consider the national \n        and international protection and promotion of human rights and \n        the rule of law the most important values of any democracy. The \n        founding fathers defined human rights prominently in the Bill \n        of Rights, giving those rights a special priority and \n        protection in the Constitution.\n            (2) Federal agencies are in possession of documents \n        pertaining to gross human rights violations abroad which are \n        needed by foreign authorities to document, investigate, and \n        subsequently prosecute instances of continued and systematic \n        gross human rights violations, including those directed against \n        citizens of the United States.\n            (3) The United States will continue to receive

In [26]:
gov_train_df['source_text'].iloc[1]

'Most income derived from private sector business activity in the United States is subject to federal corporate income tax, the individual income tax, or both. The tax treatment that applies to a business depends on its legal form of organization. Firms that are organized under the tax code as “C” corporations (which include most large, publicly held corporations) have their profits taxed once at the entity level under the corporate income tax (on a form 1120) and then a second time under the individual income tax when profits are transferred to individual shareholders in the form of dividends or realized capital gains. Firms that are organized as “pass-through” entities, such as partnerships, limited liability companies, and “S” corporations are generally not taxed at the entity level; however, their net incomes are passed through each year and taxed in the hands of their partners or shareholders under the individual income tax (as part of those taxpayers’ form 1040 filing). Similarly

#### Not running the cells from here to save time

In [None]:
# doing section-aware chunking for billsum, cannot for govreport because of lack of section markers
import re
section_pattern = re.compile(
    r'^(SECTION\s+\d+\.|SEC\.\s*\d+\.|Sec\.\s*\d+\.)',
    re.MULTILINE
)

def split_sections(text):
    matches = list(section_pattern.finditer(text))
    sections = []
    if not matches:
        cleaned = text.strip()
        if cleaned:
            return [('FULL TEXT', cleaned)]
            
        else:
            return []
    
    first_start = matches[0].start() 
    preamble = text[:first_start].strip()

    if preamble:
        sections.append(('PREAMBLE', preamble))
    
    for i, match in enumerate(matches):
        start = match.start()
        next = matches[i+1].start() if i + 1 < len(matches) else len(text)

        newline_idx = text.find('\n', start)
        if newline_idx == -1 or newline_idx >= next:
            section_title = text[start:next].strip()
            section_body = ''
        else:
            section_title = text[start:newline_idx].strip()
            section_body = text[newline_idx:next].strip()

        sections.append((section_title, section_body))
    return sections

In [None]:
split_sections(bs_train_df['source_text'].iloc[1])

[('SECTION 1. SHORT TITLE.',
  "This Act may be cited as the ``Human Rights Information Act''."),
 ('SEC. 2. FINDINGS.',
  "Congress finds the following:\n            (1) The people of the United States consider the national \n        and international protection and promotion of human rights and \n        the rule of law the most important values of any democracy. The \n        founding fathers defined human rights prominently in the Bill \n        of Rights, giving those rights a special priority and \n        protection in the Constitution.\n            (2) Federal agencies are in possession of documents \n        pertaining to gross human rights violations abroad which are \n        needed by foreign authorities to document, investigate, and \n        subsequently prosecute instances of continued and systematic \n        gross human rights violations, including those directed against \n        citizens of the United States.\n            (3) The United States will continue to receiv

In [None]:
def make_billsum_chunk(df, tokenizer, max_tokens, overlap):
    records = []

    for _, row in df.iterrows():
        doc_id = row['doc_id']
        dataset = row['dataset']
        split = row['split']
        jurisdiction = row['jurisdiction']
        title = row['title']
        full_text = row['source_text']
        target_summary = row['target_summary']

        sections = split_sections(full_text)
        
        global_chunk_id = 0

        for section_idx, (section_title, section_body) in enumerate(sections):
            if section_body:
                combined = section_title + '\n' + section_body
            else:
                combined = section_title
            
            section_chunks = chunk_text(combined, tokenizer, max_tokens, overlap)

            for local_cid, ch in enumerate(section_chunks):
                records.append({
                    'doc_id': doc_id,
                    'dataset': dataset,
                    'split': split,
                    "jurisdiction": jurisdiction,
                    "title": title,
                    "section_id": section_idx,
                    "section_title": section_title,
                    'section_chunk_id': local_cid,
                    "chunk_id": global_chunk_id,
                    "chunk_text": ch["chunk_text"],
                    "start_token": ch["start_token"],
                    "end_token": ch["end_token"],
                    "target_summary": target_summary,
                })
                global_chunk_id += 1

    return pd.DataFrame(records)

billsum_docs = all_data_df[all_data_df["dataset"] == "billsum"].copy()
t5_tokenizer = AutoTokenizer.from_pretrained("t5-base")

billsum_section_chunks = make_billsum_chunk(billsum_docs, tokenizer=t5_tokenizer, max_tokens=512, overlap=128)
billsum_section_chunks.head()

Unnamed: 0,doc_id,dataset,split,jurisdiction,title,section_id,section_title,section_chunk_id,chunk_id,chunk_text,start_token,end_token,target_summary
0,billsum_train_0,billsum,train,US,A bill to limit the civil liability of busines...,0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,0,0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,0,512,Shields a business entity from civil liability...
1,billsum_train_0,billsum,train,US,A bill to limit the civil liability of busines...,0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,1,1,", territory, or possession. (b) Limitation on ...",384,896,Shields a business entity from civil liability...
2,billsum_train_0,billsum,train,US,A bill to limit the civil liability of busines...,0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,2,2,In general.--Subject to paragraph (2) and subs...,768,983,Shields a business entity from civil liability...
3,billsum_train_1,billsum,train,US,Human Rights Information Act,0,SECTION 1. SHORT TITLE.,0,0,SECTION 1. SHORT TITLE. This Act may be cited ...,0,30,Human Rights Information Act - Requires certai...
4,billsum_train_1,billsum,train,US,Human Rights Information Act,1,SEC. 2. FINDINGS.,0,1,SEC. 2. FINDINGS. Congress finds the following...,0,512,Human Rights Information Act - Requires certai...


In [37]:
billsum_section_chunks.to_parquet("billsum_section_chunks.parquet")

In [38]:
billsum_section_chunks.columns

Index(['doc_id', 'dataset', 'split', 'jurisdiction', 'title', 'section_id',
       'section_title', 'section_chunk_id', 'chunk_id', 'chunk_text',
       'start_token', 'end_token', 'target_summary'],
      dtype='object')

In [39]:
govreport_chunks.columns

Index(['doc_id', 'dataset', 'split', 'jurisdiction', 'title', 'chunk_id',
       'chunk_text', 'start_token', 'end_token', 'target_summary'],
      dtype='object')

#### Start from here

In [21]:
billsum_section_chunks = pd.read_parquet("billsum_section_chunks.parquet")
govreport_chunks = pd.read_parquet("govreport_chunks.parquet")

In [22]:
govreport_chunks['section_id'] = -1
govreport_chunks['section_title'] = 'FULL TEXT'
govreport_chunks['section_chunk_id'] = -1

billsum_section_chunks.shape[1] == govreport_chunks.shape[1]

True

In [23]:
### no need to run this again
# all_chunks = pd.concat([billsum_section_chunks, govreport_chunks], ignore_index=True)
# all_chunks.to_parquet("all_section_chunks.parquet")

all_chunks = pd.read_parquet("all_section_chunks.parquet")
all_chunks.shape

(395014, 13)

#### Not running retrievers again, we have the embeddings

In [43]:
# base retriever

from sklearn.metrics.pairwise import linear_kernel

billsum_vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),
    lowercase=True,
)

billsum_tfidf = billsum_vectorizer.fit_transform(billsum_section_chunks['chunk_text'])

def retrieve_billsum_section_chunks(query, top_k=5):
    q_vec = billsum_vectorizer.transform([query])
    scores = linear_kernel(q_vec, billsum_tfidf).flatten()
    top_idx = scores.argsort()[::-1][:top_k]
    return billsum_section_chunks.iloc[top_idx].assign(score=scores[top_idx])

gov_vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),
    lowercase=True,
)

gov_tfidf = gov_vectorizer.fit_transform(govreport_chunks['chunk_text'])

def retrieve_gov_section_chunks(query, top_k=5):
    q_vec = gov_vectorizer.transform([query])
    scores = linear_kernel(q_vec, gov_tfidf).flatten()
    top_idx = scores.argsort()[::-1][:top_k]
    return govreport_chunks.iloc[top_idx].assign(score=scores[top_idx])

In [48]:
test_billsum = retrieve_billsum_section_chunks("liability of business entities providing facilities to nonprofits", top_k=3)[
    ['doc_id', 'chunk_id', 'section_id', 'section_chunk_id', 'score', 'chunk_text']
]

print(test_billsum.iloc[0]['chunk_text'])
print(test_billsum.iloc[1]['chunk_text'])

public benefit and operated primarily for charitable, civic, educational, religious, welfare, or health purposes. (9) State.--The term State'' means each of the several States, the District of Columbia, the Commonwealth of Puerto Rico, the Virgin Islands, Guam, American Samoa, the Northern Mariana Islands, any other territory or possession of the United States, or any political subdivision of any such State, territory, or possession. (b) Liability.-- (1) Liability of business entities that donate equipment to nonprofit organizations.-- (A) In general.--Subject to subsection (c), a business entity shall not be subject to civil liability relating to any injury or death that results from the use of equipment donated by a business entity to a nonprofit organization. (B) Application.--This paragraph shall apply with respect to civil liability under Federal and State law. (2) Liability of business entities providing use of facilities to nonprofit organizations.-- (A) In general.--Subject to 

In [49]:
test_govreport = retrieve_gov_section_chunks("income derived from private sector business activity", top_k=3)[
    ['doc_id', 'chunk_id', 'score', 'chunk_text']
]

print(test_govreport.iloc[0]['chunk_text'])
print(test_govreport.iloc[1]['chunk_text'])

 workgroup, one of six workgroups formed since its implementation status survey, to consider these issues. According to our interviews with private-sector representatives and private-sector information from national associations, additional structures that have been developed to accomplish many of the day-to-day board activities may not reflect or may dilute employer’s input into the system. Virtually every state and local board has assigned staff that is responsible for carrying out much of the detail associated with the board operations, such as setting up meetings, developing the agenda, and ensuring that boards stay current with compliance issues. Private-sector representatives were concerned, however, that the staff may lack knowledge of or interest in the needs of the private sector. According to private-sector representatives and other implementers, staff are often employed by the public-sector agency responsible for carrying out WIA’s Adult, Dislocated Worker and other mandator

In [51]:
# ooook so far so good. gov report did not retrieve very well on income derived but got the private sector business activity part

from sentence_transformers import SentenceTransformer, util
import torch

st_model = SentenceTransformer('all-mpnet-base-v2', device='cuda')





To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [52]:
billsum_texts = billsum_section_chunks['chunk_text'].tolist()
billsum_embeddings = st_model.encode(billsum_texts, batch_size=32, convert_to_tensor=True, show_progress_bar=True, device='cuda')

gov_texts = govreport_chunks['chunk_text'].tolist()
gov_embeddings = st_model.encode(gov_texts, batch_size=32, convert_to_tensor=True, show_progress_bar=True, device='cuda')

Batches: 100%|██████████| 5601/5601 [35:46<00:00,  2.61it/s]
Batches: 100%|██████████| 6744/6744 [59:35<00:00,  1.89it/s] 


In [None]:
torch.save(billsum_embeddings, "billsum_embeddings.pt")
torch.save(gov_embeddings, "govreport_embeddings.pt")

all_embs = torch.cat([billsum_embeddings, gov_embeddings], dim=0)
print(all_embs.shape)

torch.save(all_embs, "all_embeddings.pt")

#### Start from here

In [24]:
all_chunks = pd.read_parquet("all_section_chunks.parquet")
all_embs = torch.load("all_embeddings.pt")

print(all_embs.shape)
print(all_chunks.shape)

torch.Size([395014, 768])
(395014, 13)


In [27]:
from sentence_transformers import SentenceTransformer, util
import torch

st_model = SentenceTransformer('all-mpnet-base-v2', device='cuda')

def retrieve_billsum_st(query, top_k=5):
    query_embedding = st_model.encode(query, convert_to_tensor=True, device='cuda')
    cos_scores = util.pytorch_cos_sim(query_embedding, billsum_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    indices = top_results[1].cpu().numpy()
    scores = top_results[0].cpu().numpy()

    results = billsum_section_chunks.iloc[indices].copy()
    results['score'] = scores
    return results

def retrieve_gov_st(query, top_k=5):
    query_embedding = st_model.encode(query, convert_to_tensor=True, device='cuda')
    cos_scores = util.pytorch_cos_sim(query_embedding, gov_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    indices = top_results[1].cpu().numpy()
    scores = top_results[0].cpu().numpy()

    results = govreport_chunks.iloc[indices].copy()
    results['score'] = scores
    return results







In [28]:
gov_embeddings = torch.load("govreport_embeddings.pt")
billsum_embeddings = torch.load("billsum_embeddings.pt")

test_billsum = retrieve_billsum_st("liability of business entities providing facilities to nonprofits", top_k=3)[
    ['doc_id', 'chunk_id', 'section_id', 'section_chunk_id', 'score', 'chunk_text']
]

test_govreport = retrieve_gov_st("income derived from private sector business activity", top_k=3)[
    ['doc_id', 'chunk_id', 'score', 'chunk_text']
]

In [29]:
test_billsum

Unnamed: 0,doc_id,chunk_id,section_id,section_chunk_id,score,chunk_text
8795,billsum_train_1123,0,0,0,0.865064,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...
0,billsum_train_0,0,0,0,0.865064,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...
46860,billsum_train_6083,1,0,1,0.719623,public benefit and operated primarily for char...


In [30]:
test_govreport

Unnamed: 0,doc_id,chunk_id,score,chunk_text
150368,govreport_train_13623,1,0.608806,noncorporate businesses in this report. Nonco...
16842,govreport_train_1551,11,0.590817,overall pilot and pilot activities. We conduc...
179083,govreport_train_16153,3,0.577321,",000 accounted for 12.6%. For partnerships and..."


#### Just rewriting for better flow

In [31]:
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

emb_model_name = 'sentence-transformers/all-mpnet-base-v2'
emb_tokenizer = AutoTokenizer.from_pretrained(emb_model_name)
emb_model = AutoModel.from_pretrained(emb_model_name).to('cuda')

emb_model.eval()

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [32]:
all_embs = all_embs.to('cuda')

def mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    masked = last_hidden_state * mask
    summed = torch.sum(masked, dim=1)
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return summed / counts

@torch.no_grad()
def encode_query(query):
    enc = emb_tokenizer(
        query,
        padding=True,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    ).to('cuda')
    outputs = emb_model(**enc)
    q_emb = mean_pool(outputs.last_hidden_state, enc['attention_mask'])[0]

    return q_emb

@torch.no_grad()
def retrieve_all(query, top_k=5):
    q_emb = encode_query(query)
    # all_embs_cpu = all_embs.cpu()
    sims = F.cosine_similarity(q_emb.unsqueeze(0), all_embs, dim=1)

    top_k = min(top_k, len(sims))
    top_scores, top_idx = torch.topk(sims, k=top_k)

    top_idx = top_idx.cpu().numpy()
    top_scores = top_scores.cpu().numpy()

    return all_chunks.iloc[top_idx].assign(score=top_scores)

In [33]:
test_queries = [
    "liability of business entities providing facilities to nonprofits",
    "income derived from private sector business activity",
]

for q in test_queries:
    results = retrieve_all(q, top_k=3)
    print(f"Results for query: {q}")
    print(results[['dataset','doc_id', 'chunk_id', 'section_id', 'section_chunk_id', 'score', 'chunk_text']])

Results for query: liability of business entities providing facilities to nonprofits
       dataset              doc_id  chunk_id  section_id  section_chunk_id  \
8795   billsum  billsum_train_1123         0           0                 0   
0      billsum     billsum_train_0         0           0                 0   
46860  billsum  billsum_train_6083         1           0                 1   

          score                                         chunk_text  
8795   0.865064  SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...  
0      0.865064  SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...  
46860  0.719623  public benefit and operated primarily for char...  
Results for query: income derived from private sector business activity
          dataset                 doc_id  chunk_id  section_id  \
329599  govreport  govreport_train_13623         1          -1   
196073  govreport   govreport_train_1551        11          -1   
358314  govreport  govreport_train_16153         3      

In [1]:
# ok, all embeddings works too. but it is heavy!

from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

qa_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad").to('cuda')
qa_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")

qa_pipeline = pipeline(
    "question-answering",
    model=qa_model,
    tokenizer=qa_tokenizer,
    device=0
)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


In [109]:
billsum_mask = dict(zip(billsum_docs['doc_id'], billsum_docs['source_text']))

In [114]:

def get_neighbor_chunk(doc_id, chunk_id, offset):
    neighbor_id = chunk_id + offset
    neighbor = all_chunks[
        (all_chunks['doc_id'] == doc_id) &
        (all_chunks['chunk_id'] == neighbor_id)
    ]
    if len(neighbor) == 0:
        return None
    return neighbor.iloc[0]

def answer_qa_window(query, top_k=5, window_size=300):
    retrieved = retrieve_all(query, top_k=top_k)
    
    if retrieved.empty:
        return None
    
    best_result = None
    best_score = float('-inf')
    best_row = None

    for _, row in retrieved.iterrows():
        context = row['chunk_text']
        result = qa_pipeline(question=query, context=context)

        if result['score'] > best_score:
            best_result = result
            best_score = result['score']
            best_row = row
    
    if best_result is None or best_row is None:
        return None
    
    dataset = best_row.get('dataset')

    if (dataset == 'billsum') and pd.notna(best_row.get('section_id')) and best_row.get('section_title') != 'FULL TEXT':
        # skip window expansion for sectioned billsum chunks for now
        full_section = all_chunks[
            (all_chunks['doc_id'] == best_row['doc_id']) &
            (all_chunks['section_id'] == best_row['section_id'])
        ].sort_values(['section_chunk_id', 'chunk_id'])
        min_start = full_section['start_token'].min()
        max_end = full_section['end_token'].max()
        full_text = billsum_mask.get(best_row['doc_id'], "")
        print(full_text)
        min_start = max(0, min_start)
        max_end = min(len(full_text), max_end)

        section_ids = full_text[min_start:max_end]
        # full_section_text = t5_tokenizer.decode(section_ids, skip_special_tokens=True)
        clause_window = full_text[min_start:max_end]
        return {
            'question': query,
            'answer_span': best_result['answer'],
            'score': best_result['score'],
            'start': best_result['start'],
            'end': best_result['end'],
            'clause_window': clause_window,
            'full_chunk_text': best_row['chunk_text'],
            'dataset': best_row.get('dataset'),
            'doc_id': best_row.get('doc_id'),
            'chunk_id': best_row.get('chunk_id'),
            'section_id': best_row.get('section_id', None),
            'section_chunk_id': best_row.get('section_chunk_id', None),
        }
    chunk_text = best_row['chunk_text']
    start = best_result['start']
    end = best_result['end']

    start_window = max(0, start - window_size)
    end_window = min(len(chunk_text), end + window_size)

    left_text = chunk_text[start_window:end_window]
    total_len = len(left_text)

    if start_window == 0:
        prev_row = get_neighbor_chunk(best_row['doc_id'], best_row['chunk_id'], -1)
        if prev_row is not None:
            prev_text = prev_row['chunk_text']
            need = window_size - (start)
            if need > 0:
                prefix = prev_text[max(0, len(prev_text)-need):]
                left_text = prefix + left_text

    if end_window == len(chunk_text):
        next_row = get_neighbor_chunk(best_row['doc_id'], best_row['chunk_id'], 1)
        if next_row is not None:
            next_text = next_row['chunk_text']
            need = window_size - (len(chunk_text) - start_window - (end - start))
            if need > 0:
                suffix = next_text[:max(0, need)]
                left_text = left_text + suffix

            

    clause_window = left_text

    return {
        'question': query,
        'answer_span': best_result['answer'],
        'score': best_result['score'],
        'start': start,
        'end': end,
        'clause_window': clause_window,
        'full_chunk_text': chunk_text,
        'dataset': best_row.get('dataset'),
        'doc_id': best_row.get('doc_id'),
        'chunk_id': best_row.get('chunk_id'),
        'section_id': best_row.get('section_id', None),
        'section_chunk_id': best_row.get('section_chunk_id', None),
    }

In [115]:
res = answer_qa_window(
    "What are the regulations regarding liability of business entities providing facilities to nonprofits?",
    top_k=5,
    window_size=600
)

print('Answer Span:', res['answer_span'])
print('\nWindow\n', res['clause_window'])
print('\nFrom:', res['dataset'], res['doc_id'], 'chunk', res['chunk_id'])

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [81]:
res = answer_qa_window(
    'What is the information related to income derived from private sector business activity?',
    top_k=5,
    window_size=600
)

print('Answer Span:', res['answer_span'])
print('\nWindow\n', res['clause_window'])
print('\nFrom:', res['dataset'], res['doc_id'], 'chunk', res['chunk_id'])

Answer Span: production activities deduction compared to business income reported in individual tax return

Window
 and 33.83%, with an overall rate of 34.14%. That is, the production activities deduction appears to reduce statutory tax rates by slightly under 1 percentage point. The estimated statutory tax rate for unincorporated business income, using the public use file data, is 27%. Using the aggregate for individual returns and distributing it in the same proportions, the estimated tax rates for the three categories, including the production activities deduction, are 26.74%, 26.87%, and 26.56%, for an overall rate of 26.83%, a reduction of less than 0.2 percentage points. This estimate is based on the production activities deduction compared to business income reported in individual tax return. The marginal rates for interest, dividends, and capital gains, respectively, are 22%, 14.6% and 15.4%. A large fraction of interest and dividends paid do not appear on individual income tax

In [None]:
prompt = 'Assume the role of a legal assistant to rewrite the following legal clause in clear, plain English. Preserve all legal conditions and do not add or remove any requirements.\n\n'
prompt += res['question'] + '\n\n'
prompt += res['clause_window'] + '\n\nPlain English version:\n'

answer = get_abstractive_summary(
    prompt,
    max_input_tokens=1024,
    max_output_tokens=256,
)

print("Rewritten Clause:\n", answer)

Rewritten Clause:
 The estimated statutory tax rate for unincorporated business income, using the public use file data, is 27%. Using the aggregate for individual returns and distributing it in the same proportions, the estimated tax rates for interest, dividends, and capital gains, respectively, are 22%, 14.6% and 15.4%.


In [101]:
question = 'What are the regulations regarding liability of business entities providing facilities to nonprofits?'

def answer_question_final(query, top_k=5, window_size=600, max_input_tokens=1024, max_output_tokens=256):
    res = answer_qa_window(
        query,
        top_k=top_k,
        window_size=window_size
    )

    print(res)
    prompt = 'Assume the role of a legal assistant to rewrite the following legal clause in clear, plain English. Preserve all legal conditions and do not add or remove any requirements.\n\n'
    prompt += res['question'] + '\n\n'
    prompt += res['clause_window'] + '\n\nPlain English version:\n'
    print(prompt)

    answer = get_abstractive_summary(
        prompt,
        max_input_tokens=max_input_tokens,
        max_output_tokens=max_output_tokens,
    )
    print(answer)
    return answer

final_answer = answer_question_final(
    question,
    top_k=5,
    window_size=600,
    max_input_tokens=1024,
    max_output_tokens=256,
)



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [90]:
question = 'What talks about international protection and promotion of human rights?'

final_answer = answer_question_final(
    question,
    top_k=5,
    window_size=600,
    max_input_tokens=4096,
    max_output_tokens=256,
)

{'question': 'What talks about international protection and promotion of human rights?', 'answer_span': 'U.N. World Summit', 'score': 0.28090273402631283, 'start': 1614, 'end': 1631, 'clause_window': 'as protected by the doctrine of non-intervention and state obligations to protect human rights and fundamental freedoms. Increasingly, protection of populations affected by conflict within a country is seen as partly the responsibility of the international community. For example, some observers have more recently argued that the DPRK government is a threat to its own people and that North Korea has violated its responsibility to protect its own citizens from crimes against humanity. They suggest that action by the international community and the U.N. Security Council is warranted. At the 2005 U.N. World Summit, the "Responsibility to Protect" was introduced, putting forward the idea that each state has a responsibility to protect its people from genocide, war crimes, ethnic cleansing and 

In [91]:
test = answer_qa_window(
    question,
    top_k=5,
    window_size=600
)

print('Answer Span:', test['answer_span'])
print('\nWindow\n', test['clause_window'])
print('\nFrom:', test['dataset'], test['doc_id'], 'chunk', test['chunk_id'])

Answer Span: U.N. World Summit

Window
 as protected by the doctrine of non-intervention and state obligations to protect human rights and fundamental freedoms. Increasingly, protection of populations affected by conflict within a country is seen as partly the responsibility of the international community. For example, some observers have more recently argued that the DPRK government is a threat to its own people and that North Korea has violated its responsibility to protect its own citizens from crimes against humanity. They suggest that action by the international community and the U.N. Security Council is warranted. At the 2005 U.N. World Summit, the "Responsibility to Protect" was introduced, putting forward the idea that each state has a responsibility to protect its people from genocide, war crimes, ethnic cleansing and crimes against humanity and that human rights violations committed in one state are the concern of all states. It is an agreement in principle that speaks to the