# LEGISLATION COMPARISON TOOL
## COMPARING STATE BILLS TO EACH OTHER — AND TO FEDERAL, LOBBYING, AND INDUSTRY DOCUMENTS

In [2]:
# pip install pymupdf tqdm


In [3]:
# pip install sentence-transformers

In [10]:
##import tools/libraries
import fitz
import re
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import glob
import os


In [53]:
##folder locations used by the conversion and embedding steps
##PDF_DIR: input folder holding all documents, all categories of documents
PDF_DIR = Path("PDFS/")
##TXT_DIR: output folder where the cleaned .txt versions are written
TXT_DIR = Path("textfiles/")
##create the output folder up front; exist_ok avoids an error when re-running the cell
TXT_DIR.mkdir(exist_ok=True)


## Create functions to: clean all documents, convert from pdf to txt

In [24]:
##create functions to clean

def clean_text(text):
    """
    Normalize raw document text so boilerplate does not skew comparisons.

    Steps, in order:
      1. lowercase everything
      2. drop page markers such as "page 3"
      3. drop bracketed citations such as "[1]"
      4. drop section references such as "12004(b)(3)"
      5. drop remaining single-character parentheticals such as "(a)" or "(3)"
      6. collapse every run of whitespace to one space and trim the ends

    Parameters:
    - text: raw text (str) extracted from a document

    Returns:
    - the cleaned, lowercased, single-spaced string
    """
    # lowercase
    text = text.lower()

    # remove page numbers like "Page 3"; \b keeps the pattern from firing
    # inside longer words such as "homepage 3"
    text = re.sub(r'\bpage\s*\d+', ' ', text)

    # remove bracketed citations like [1]
    text = re.sub(r'\[\d+\]', ' ', text)

    # remove section numbers like 12004(b)(3). This must run BEFORE the
    # generic parenthetical removal below: otherwise "(b)" and "(3)" are
    # stripped first, this pattern never matches, and "12004" is left behind.
    text = re.sub(r'\d+\([a-z]\)(\(\d+\))?', ' ', text)

    # remove remaining standalone markers like (a), (b), (3)
    text = re.sub(r'\(\w\)', ' ', text)

    # collapse whitespace last so the blanks left by the removals above
    # do not survive as double spaces
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def pdf_to_clean_txt(pdf_path, txt_path):
    """
    Extract the text of one PDF, clean it, and write it to a .txt file.

    Parameters:
    - pdf_path: path of the PDF to read (opened with fitz.open)
    - txt_path: path of the UTF-8 text file to create or overwrite

    Returns:
    - None; the result is the file written at txt_path
    """
    # The context manager closes the PDF handle even if extraction fails;
    # the previous version never closed the document.
    with fitz.open(pdf_path) as doc:
        # Join pages with a newline so the last word of one page cannot fuse
        # with the first word of the next; clean_text collapses whitespace.
        full_text = "\n".join(page.get_text() for page in doc)

    cleaned = clean_text(full_text)

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(cleaned)



In [26]:
# Run conversion on every PDF (all docs in one folder).
# The glob is materialized into a sorted list so tqdm knows the total
# (a bare generator only shows "Nit") and files are processed in a
# reproducible order.
pdf_files = sorted(PDF_DIR.glob("*.pdf"))
for pdf_file in tqdm(pdf_files, desc="Converting PDFs"):
    txt_file = TXT_DIR / (pdf_file.stem + ".txt")
    pdf_to_clean_txt(pdf_file, txt_file)

61it [00:06,  9.62it/s]


## Create Embeddings
#### NOTE: this step can change based on number/type/category of legal documents being compared
- For separate states, create one embedding per document; each is saved as its own `.npy` file.
- For lobbying or industry documents that don't need to be told apart, keep them together as a single `.npy` file.
- The `"bwang0911/jev2-legal"` model is trained on U.S. legal documents.

In [57]:

##sentence-embedding model shared by every document in the notebook
model = SentenceTransformer("bwang0911/jev2-legal")


def embed_document(txt_path):
    """
    Read one cleaned text file and encode it as a single embedding.

    Parameters:
    - txt_path: path object for a UTF-8 .txt file (its read_text method is used)

    Returns:
    - the numpy array produced by model.encode for the file's entire text
    """
    # NOTE(review): the whole document is passed to encode() in one call.
    # Sentence-transformers models usually truncate input beyond their
    # max_seq_length, so long documents may be only partially embedded — confirm.
    return model.encode(txt_path.read_text(encoding="utf-8"), convert_to_numpy=True)

##output folders for the embeddings. These names were used below without ever
##being defined, which raises NameError on a fresh kernel as soon as one file is
##found. The folder names match the paths the later cells read from
##("state_embeddings/" and "embeddings/").
STATE_EMB_DIR = Path("state_embeddings")
EMB_DIR = Path("embeddings")
STATE_EMB_DIR.mkdir(exist_ok=True)
EMB_DIR.mkdir(exist_ok=True)

##embed state documents separately into state_embeddings folder
state_txt_files = sorted(TXT_DIR.glob("state_*.txt"))
for txt_file in tqdm(state_txt_files, desc="Embedding state documents"):
    emb = embed_document(txt_file)
    np.save(STATE_EMB_DIR / (txt_file.stem + ".npy"), emb)

##embed federal, industry, and lobbying documents into main embeddings folder
other_txt_files = sorted(
    f for f in TXT_DIR.glob("*.txt") if not f.stem.startswith("state_")
)
for txt_file in tqdm(other_txt_files, desc="Embedding other documents"):
    emb = embed_document(txt_file)
    np.save(EMB_DIR / (txt_file.stem + ".npy"), emb)

print(f"Created {len(state_txt_files)} state embeddings and {len(other_txt_files)} other embeddings")

##an empty run almost always means the conversion step has not been run or the
##files are missing their category prefixes, so say so instead of passing silently
if not state_txt_files and not other_txt_files:
    print(f"WARNING: no .txt files found in {TXT_DIR}; run the PDF conversion step and check the file-name prefixes")

Embedding state documents: 0it [00:00, ?it/s]
Embedding other documents: 0it [00:00, ?it/s]

Created 0 state embeddings and 0 other embeddings





## NAMING: 
#### Make sure every file name starts with its category prefix: "state_" for state documents, "fed_" for federal provision/bill language, "lobbying_" for lobbying-group documents, and "industry_" for industry documents.

In [37]:
# Get all state embedding files, sorted for consistent ordering
state_files = sorted(glob.glob("state_embeddings/state_*.npy"))

# Create a mapping dictionary: {file stem: embedding},
# e.g. "state_ALABAMA_narrative" -> array
state_embeddings_dict = {Path(file).stem: np.load(file) for file in state_files}

# Stack every lobbying embedding into one matrix (one row per document).
# Sorting fixes the row order: glob returns files in arbitrary order, and the
# rows of lobbying.npy carry no names, so row i must always be the same file.
lobbying_files = sorted(glob.glob("embeddings/lobbying_*.npy"))
if not lobbying_files:
    # np.vstack([]) would fail with an opaque "need at least one array" error
    raise FileNotFoundError(
        'No files match "embeddings/lobbying_*.npy"; run the embedding step '
        'and check that lobbying documents use the "lobbying_" prefix'
    )
lobbying_embeddings = np.vstack([np.load(f) for f in lobbying_files])
np.save("embeddings/lobbying.npy", lobbying_embeddings)

print(f"Loaded {len(state_embeddings_dict)} state documents")


Loaded 51 state documents


## Load the rest of the embeddings
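NOTE: the two `if` statements below are needed whenever a category (here, federal and industry) has only one document: a single saved embedding is 1-D and must be reshaped into a 2-D, one-row matrix before it can be compared.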

In [16]:
##load and compare embeddings


##read the federal and industry embeddings plus the stacked lobbying matrix
fed_embedding = np.load("embeddings/fed_RHTP.npy")
industry_embedding = np.load("embeddings/industry_biointel.npy")
lobby_embeddings = np.load("embeddings/lobbying.npy")

##a single-document file loads as a 1-D vector; promote it to a one-row 2-D matrix
if fed_embedding.ndim == 1:
    fed_embedding = fed_embedding[np.newaxis, :]

if industry_embedding.ndim == 1:
    industry_embedding = industry_embedding[np.newaxis, :]

##each row of the lobbying matrix is one document
print(f"Loaded federal, industry, and {lobby_embeddings.shape[0]} lobbying documents")


Loaded federal, industry, and 3 lobbying documents


## Create a multi-function that creates a readable output:
1. Take one state's documents and compare them to other states' documents and to the federal, industry, and lobbying documents
2. Format the output to show percentages for federal, industry, and lobbying documents
3. Show the top 5 most similar states and 5 least similar states with their similarity percentages

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
##compare a specific state doc to everything else
def compare_state_doc(state_doc_name):
    """
    Score one state document against every other loaded embedding.

    Parameters:
    - state_doc_name: a key of state_embeddings_dict,
      e.g., "state_ALABAMA_narrative" or "state_TEXAS_budget"

    Returns:
    - None (after printing a message) when the name is not in state_embeddings_dict
    - otherwise a dict with these keys:
        "document": the name that was passed in
        "to_federal": cosine similarity to the first row of fed_embedding (float)
        "to_industry": cosine similarity to the first row of industry_embedding (float)
        "to_lobbying": array of cosine similarities, one per row of lobby_embeddings
        "to_other_states": {other document name: cosine similarity (float)}
    """
    if state_doc_name not in state_embeddings_dict:
        print(f"Document '{state_doc_name}' not found")
        return None

    # cosine_similarity works on 2-D inputs, so view the vector as one row
    query_row = state_embeddings_dict[state_doc_name].reshape(1, -1)

    def similarity_to(matrix):
        """Cosine similarity between the query row and the first row of matrix."""
        return float(cosine_similarity(query_row, matrix)[0][0])

    # Every state document except the one being queried
    peer_scores = {
        peer_name: similarity_to(peer_emb.reshape(1, -1))
        for peer_name, peer_emb in state_embeddings_dict.items()
        if peer_name != state_doc_name
    }

    return {
        "document": state_doc_name,
        "to_federal": similarity_to(fed_embedding),
        "to_industry": similarity_to(industry_embedding),
        "to_lobbying": cosine_similarity(query_row, lobby_embeddings)[0],
        "to_other_states": peer_scores,
    }

def list_state_docs(state_name=None, doc_names=None):
    """
    List document names, optionally filtered by state name.

    Names are expected to follow the "state_<STATE>_<doctype>" convention.
    The filter matches the whole <STATE> part, not a substring, so "KANSAS"
    no longer returns ARKANSAS documents and "VIRGINIA" no longer returns
    WEST VIRGINIA documents. Matching ignores case and treats spaces and
    underscores as the same separator.

    Parameters:
    - state_name: state to filter by, e.g. "ALABAMA" or "west virginia";
      None or "" returns every name
    - doc_names: iterable of names to search; defaults to the keys of
      state_embeddings_dict

    Returns:
    - list of matching document names, in their original order
    """
    if doc_names is None:
        doc_names = state_embeddings_dict.keys()
    if not state_name:
        return list(doc_names)

    def normalize(label):
        """Uppercase and join the words of a label with single underscores."""
        return "_".join(label.upper().replace("_", " ").split())

    wanted = "STATE_" + normalize(state_name)

    matches = []
    for name in doc_names:
        key = normalize(name)
        # exact "state_<STATE>", or "state_<STATE>_" followed by the doc type
        if key == wanted or key.startswith(wanted + "_"):
            matches.append(name)
    return matches

def compare_all_docs_for_state(state_name):
    """
    Run compare_state_doc on each document that list_state_docs returns for a state.

    Returns a dict mapping each document name to its compare_state_doc result.
    """
    return {
        doc_name: compare_state_doc(doc_name)
        for doc_name in list_state_docs(state_name)
    }

## RUN IT

In [25]:
# Demo: list one state's documents, then compare one of them to everything else
print("\nAll Alabama documents:")
print(list_state_docs("ALABAMA"))

print("\nComparing Alabama narrative document:")
result = compare_state_doc("state_ALABAMA_narrative")
if result:
    print(f"  Federal similarity: {result['to_federal']*100:.1f}%")
    print(f"  Industry similarity: {result['to_industry']*100:.1f}%")

    # Rank the other state documents from most to least similar
    sorted_states = list(result['to_other_states'].items())
    sorted_states.sort(key=lambda pair: pair[1], reverse=True)

    print("\n  Top 5 most similar state documents:")
    for doc, sim in sorted_states[:5]:
        print(f"    {doc}: {sim*100:.1f}%")


All Alabama documents:
['state_ALABAMA_narrative']

Comparing Alabama narrative document:
  Federal similarity: 70.5%
  Industry similarity: 59.5%

  Top 5 most similar state documents:
    state_ALASKA_initiatives: 82.3%
    state_COLORADO_summary: 82.3%
    state_COLORADO_budget: 81.4%
    state_OREGON_narrative: 81.3%
    state_ALASKA_summary: 81.1%


## CREATE SECOND FUNCTION THAT COMPARES STATE TO STATE

In [27]:
##compare all documents from one state to another state's documents
def compare_state_to_state(state1_name, state2_name):
    """
    Print how each document of one state scores against each document of another.

    For every state1 document, the state2 documents are listed from most to
    least similar, with cosine similarity shown as a percentage. The results
    are printed only; the function returns None.

    Parameters:
    - state1_name: state whose documents head each section of the output
    - state2_name: state whose documents are ranked under each heading
    """
    docs_a = list_state_docs(state1_name)
    docs_b = list_state_docs(state2_name)

    print(f"\nComparing {state1_name} to {state2_name}:")
    print("-" * 60)

    for doc_a in docs_a:
        # cosine_similarity works on 2-D inputs, so view each vector as one row
        row_a = state_embeddings_dict[doc_a].reshape(1, -1)
        print(f"\n{doc_a}:")

        # Score against every state2 document, highest similarity first
        ranked = sorted(
            (
                (
                    doc_b,
                    float(cosine_similarity(row_a, state_embeddings_dict[doc_b].reshape(1, -1))[0][0]),
                )
                for doc_b in docs_b
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for doc_b, score in ranked:
            print(f"  → {doc_b}: {score*100:.1f}%")

## RUN SECOND

In [30]:
# Example: for each California document, print the Texas documents ranked by similarity
compare_state_to_state("CALIFORNIA", "TEXAS")


Comparing CALIFORNIA to TEXAS:
------------------------------------------------------------

state_CALIFORNIA_narrative:
  → state_TEXAS_narrative: 80.1%
  → state_TEXAS_supplement: 74.7%

state_CALIFORNIA_support:
  → state_TEXAS_narrative: 73.5%
  → state_TEXAS_supplement: 70.9%


In [43]:
## can go further using clustering to visually represent all documents and where they lie in relation to one another.