Convert PDFs to text

In [12]:
# %pip install PyPDF2
# %pip install openpyxl==3.0.10
# %pip install PyMuPDF pandas
# %pip install pdf2image pytesseract pillow
# %pip install nltk
# %pip install spacy
# %pip install transformers==4.36.2
# %pip install --upgrade sentence-transformers

# %pip install chromadb sentence-transformers



In [None]:
#Core
import os
import json
import pandas as pd
import re #regular expression


#OCR
from pdf2image import convert_from_path
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
import fitz  # PyMuPDF

# Tokenizers
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import spacy
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 3500000

# Embeddings
import chromadb 
from chromadb.config import Settings 
from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lwert\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def ocr_on_pdfs(pdf_dir):
    #identify the dataset
    pdf_data = []

    # Loop through all PDF files in the directory
    for filename in os.listdir(pdf_dir):
        if filename.endswith('.pdf'):
            file_path = os.path.join(pdf_dir, filename)
            print("working on:", file_path)

            try:

                #convert file to image
                images = convert_from_path(file_path, poppler_path=r"C:\poppler-24.08.0\Library\bin")
                # Extract the text from all pages
                text = []
                for img in images:
                    text.append(pytesseract.image_to_string(img))

                    # Get PDF metadata
                    # Open the PDF file
                doc = fitz.open(file_path)
                metadata = doc.metadata
                page_count = doc.page_count


                # Create a dictionary to convert to dataframe
                pdf_data.append({
                    'filename': filename,
                    'text': text,
                    'page_count': page_count,
                    'creator': metadata.get('creator'),
                    'producer': metadata.get('producer'),
                    'subject': metadata.get('subject'),
                    'keywords': metadata.get('keywords'),
                    'title': metadata.get('title'),
                    'creation_date': metadata.get('creationDate'),
                })

                doc.close()

            except Exception as e:
                print(f"OCR error on {filename}: {e}")

    return pdf_data



In [None]:
#pull pdf data and run through ocr
pdf_dir = "pdf_data"
# pdfs = ocr_on_pdfs(pdf_dir)

# Save to JSON
# with open('legal_pdf_data.json', 'w', encoding='utf-8') as f:
#     json.dump(pdfs, f, ensure_ascii=False, indent=2)

# Convert returned dictionary to DataFrame and save
# df = pd.DataFrame(pdfs)
# df.to_csv('legal_pdfs_dataset.csv', index=False)
#### df.to_json('legal_pdf_data.json', orient='records', force_ascii=False, indent=2)



working on: pdf_data\ARG-1472-D-2023.pdf
working on: pdf_data\CA-2885.pdf
working on: pdf_data\CA-AB2013.pdf
working on: pdf_data\CA-AJR6.pdf
working on: pdf_data\CA-SB1047.pdf
working on: pdf_data\CA-SB942.pdf
working on: pdf_data\CO-CAIA.pdf
working on: pdf_data\DE-H333.pdf
working on: pdf_data\EU-2024-1689.pdf
OCR error on EU-2024-1689.pdf: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

working on: pdf_data\FL-S1680.pdf
working on: pdf_data\IL-H4705.pdf
working on: pdf_data\IL-H4836.pdf
working on: pdf_data\IL-H4844.pdf
working on: pdf_data\IN-AI_task_Force.pdf
working on: pdf_data\MD-818.pdf
working on: pdf_data\MEX.pdf
working on: pdf_data\NH_H1688.pdf
working on: pdf_data\NJ-S3357.pdf
working on: pdf_data\NY-A8129.pdf
working on: pdf_data\OJ_L_202401689_EN_TXT.pdf
working on: pdf_data\OR-H4153.pdf
working on: pdf_data\PA-H49.pdf
working on: pdf_data\PA-HR170.pdf
workin

### Tokenize the text

Look at: https://github.com/lwdozal/Chat-with-your-Research-Articles-LLM-Retrieval-Augmented-Generation/blob/main/Intermediate%20RAG%20(prefilled).ipynb

Tokenize sentences into paragraphs ~5 sentences into each chunk 
Tokenize based on specific legal clauses

In [None]:
# crete function to split text based on specific legal clauses
def split_legal_clauses(text, legal_clauses):
    # set-up splitter to Find the clauses in the text
    clause_splitter = re.compile(r'(?=({}))'.format('|'.join(legal_clauses)), re.IGNORECASE)

    # Split based on legal clause words
    clauses = clause_splitter.split(text)
    
    # Clean up chunks and stitch words to their content
    refined_clauses = []
    i = 0
    while i < len(clauses):
        if clause_splitter.match(clauses[i]):
            # Pair clause header with its content
            if i + 1 < len(clauses):
                refined_clauses.append((clauses[i] + ' ' + clauses[i+1]).strip())
                i += 2
            else:
                refined_clauses.append(clauses[i].strip())
                i += 1
        else:
            if clauses[i].strip():
                refined_clauses.append(clauses[i].strip())
            i += 1

    return refined_clauses

In [None]:
# set up function to split text into sentences, paragraphs, and clauses
def create_split_text(text, legal_clauses, para_size=5, overlap=2):
    """
    Convert flat text into overlapping paragraphs of sentences.
    - para_size: number of sentences per paragraph
    - overlap: number of sentences to overlap between paragraphs
    """
    #Tokenize into sentences
    sentences = sent_tokenize(text)
    # split text based on clauses
    clauses = split_legal_clauses(text, legal_clauses)

    # split text to create overlapping paragraphs
    paragraphs = []
    i = 0
    while i < len(sentences):
        para = sentences[i:i+para_size]
        if para:
            paragraphs.append(' '.join(para))
        i += para_size - overlap  # Slide forward by (para_size - overlap)

    return paragraphs, sentences, clauses

In [None]:
#read in Json of pdf texts
with open('legal_pdf_data.json', 'r', encoding='utf-8') as f:
    pdf_data = json.load(f)

In [47]:
# pdf_data

# Types of legal clause headers
LEGAL_CLAUSES = [
    r'WHEREAS\b',
    r'NOW, THEREFORE\b',
    r'BE IT RESOLVED\b',
    r'IN WITNESS WHEREOF\b',
    r'THIS AGREEMENT\b',
    r'FOR THE AVOIDANCE OF DOUBT\b',
    r'SUBJECT TO\b',
    r'NOTWITHSTANDING\b'
]

#chunk data into paragraphs and sentences for best Summarization and RAG
for doc in pdf_data:
    print("Working on:", doc["filename"])
    print("Number of words:", len(parsed.text))

    text = doc['text']
    text = " ".join(text)
    joined_text = text.replace("\n"," ")
    joined_text = joined_text.replace("\n\n"," ")
    parsed = nlp(text)
    # print(parsed.text)
    
    # split the text into sentences, identified clauses, and sentences
    paragraphs, sentences, clauses = create_split_text(parsed.text, LEGAL_CLAUSES, para_size=8, overlap=3)

    doc['paragraphs'] = paragraphs
    doc['sentences'] = sentences
    doc['claues'] = clauses

with open('legal_pdf_tokenized.json', 'w', encoding='utf-8') as f:
    json.dump(pdf_data, f, ensure_ascii=False, indent=2)

Working on: ARG-1472-D-2023.pdf
Number of words: 5432
Working on: CA-2885.pdf
Number of words: 8162
Working on: CA-AB2013.pdf
Number of words: 26023
Working on: CA-AJR6.pdf
Number of words: 7471
Working on: CA-SB1047.pdf
Number of words: 5339
Working on: CA-SB942.pdf
Number of words: 54031
Working on: CO-CAIA.pdf
Number of words: 11133
Working on: DE-H333.pdf
Number of words: 42107
Working on: FL-S1680.pdf
Number of words: 6812
Working on: IL-H4705.pdf
Number of words: 8586
Working on: IL-H4836.pdf
Number of words: 4333
Working on: IL-H4844.pdf
Number of words: 6078
Working on: IN-AI_task_Force.pdf
Number of words: 3030138
Working on: MD-818.pdf
Number of words: 18680
Working on: MEX.pdf
Number of words: 35879
Working on: NH_H1688.pdf
Number of words: 33008
Working on: NJ-S3357.pdf
Number of words: 7431
Working on: NY-A8129.pdf
Number of words: 7388
Working on: OJ_L_202401689_EN_TXT.pdf
Number of words: 16263
Working on: OR-H4153.pdf
Number of words: 597877
Working on: PA-H49.pdf
Numbe

### Create Text Embeddings

To start creating embeddings for RAG (This might be an easier start for us, and might still hold some buzzword hype)


In [8]:
#setup chromadb for chunk embeddings storage

chroma_client = chromadb.Client(Settings(
    persist_directory="chroma_store",  # Save to disk
    anonymized_telemetry=False
))

collection = chroma_client.get_or_create_collection("legal_ai_docs")

In [None]:
#read in Json of pdf text tokenization
with open('legal_pdf_tokenized.json', 'r', encoding='utf-8') as f:
    pdf_tokens = json.load(f)

#check the json was imported correctly
pdf_tokens

In [None]:
# download the sentence transformer model from huggingface and make sure it works
# model: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

model = SentenceTransformer("all-MiniLM-L6-v2")
print(model.encode(["This works!"]))

Embed the sentences based on preprocessed transformer learning


In [13]:

rag_chunks = []
ids, texts, metadatas = [], [], []

#start with paragraphs from each pdf
for pdf in pdf_tokens:
    filename = pdf['filename']
    paragraphs = pdf['paragraphs']
    print("working on", filename)

    #get each paragraph 
    for i, chunk in enumerate(paragraphs):
        chunk_id = f"{filename}_chunk{i}"
        ids.append(chunk_id)
        texts.append(chunk)
        metadatas.append({
            "source": filename,
            "chunk_index": i,
            "filepath": f"docs/{filename}",   # Optional: full local or cloud path

        })

        rag_chunks.append({
            "id": chunk_id,
            "text": chunk,
            "metadata": metadatas[-1]
        })

#save embeddings and metadata to the chromadb embeddings database
embeddings = model.encode(texts).tolist()

collection.add(
    documents=texts,
    metadatas=metadatas,
    ids=ids,
    embeddings=embeddings
)

working on ARG-1472-D-2023.pdf
working on CA-2885.pdf
working on CA-AB2013.pdf
working on CA-AJR6.pdf
working on CA-SB1047.pdf
working on CA-SB942.pdf
working on CO-CAIA.pdf
working on DE-H333.pdf
working on FL-S1680.pdf
working on IL-H4705.pdf
working on IL-H4836.pdf
working on IL-H4844.pdf
working on IN-AI_task_Force.pdf
working on MD-818.pdf
working on MEX.pdf
working on NH_H1688.pdf
working on NJ-S3357.pdf
working on NY-A8129.pdf
working on OJ_L_202401689_EN_TXT.pdf
working on OR-H4153.pdf
working on PA-H49.pdf
working on PA-HR170.pdf
working on RI-S117.pdf
working on TN-H2325.pdf
working on UK.pdf
working on UT-S149.pdf
working on UT-SB0149.pdf
working on VA-H747.pdf
working on VA-S487.pdf
working on VT-H710.pdf
working on WA-1168.pdf
working on WA-1170.pdf
working on WA-5838-S2.PL.pdf
working on WI-A664.pdf
working on WV-H5690.pdf


Insert of existing embedding ID: ARG-1472-D-2023.pdf_chunk0
Insert of existing embedding ID: ARG-1472-D-2023.pdf_chunk1
Insert of existing embedding ID: ARG-1472-D-2023.pdf_chunk2
Insert of existing embedding ID: ARG-1472-D-2023.pdf_chunk3
Insert of existing embedding ID: ARG-1472-D-2023.pdf_chunk4
Insert of existing embedding ID: ARG-1472-D-2023.pdf_chunk5
Insert of existing embedding ID: ARG-1472-D-2023.pdf_chunk6
Insert of existing embedding ID: CA-2885.pdf_chunk0
Insert of existing embedding ID: CA-2885.pdf_chunk1
Insert of existing embedding ID: CA-2885.pdf_chunk2
Insert of existing embedding ID: CA-2885.pdf_chunk3
Insert of existing embedding ID: CA-2885.pdf_chunk4
Insert of existing embedding ID: CA-2885.pdf_chunk5
Insert of existing embedding ID: CA-2885.pdf_chunk6
Insert of existing embedding ID: CA-2885.pdf_chunk7
Insert of existing embedding ID: CA-2885.pdf_chunk8
Insert of existing embedding ID: CA-2885.pdf_chunk9
Insert of existing embedding ID: CA-2885.pdf_chunk10
Insert 

We can query chromadb about our data

In [14]:
query = "What happens in the case of data reusability?"
query_embedding = model.encode([query]).tolist()[0]

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

# Gather the top chunks + metadata
retrieved_chunks = []
for doc_text, meta in zip(results["documents"][0], results["metadatas"][0]):
    citation_tag = f"[{meta['source']} chunk {meta['chunk_index']}]"
    retrieved_chunks.append(f"{citation_tag}\n{doc_text}")


In [22]:
def extract_citations(output):
    pattern = r'\[(.+?) chunk (\d+)\]'
    matches = re.findall(pattern, output)
    return [{"source": src, "chunk_index": int(idx)} for src, idx in matches]

In [16]:
print(retrieved_chunks)

['[CA-AB2013.pdf chunk 6]\n(9) Whether there was any cleaning, processing, or other modification to the datasets by the developer,\nincluding the intended purpose of those efforts in relation to the artificial intelligence system or service. (10) The time period during which the data in the datasets were collected, including a notice if the data\ncollection is ongoing. (11) The dates the datasets were first used during the development of the artificial intelligence system or\nservice. (12) Whether the generative artificial intelligence system or service used or continuously uses synthetic data\ngeneration in its development. A developer may include a description of the functional need or desired\npurpose of the synthetic data in relation to the intended purpose of the system or service. (b) A developer shall not be required to post documentation regarding the data used to train a generative artificial\nintelligence system or service for any of the following:\n\n(1) A generative artific

Now we can use RAG to query our documents and cite the results

In [19]:
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field


In [18]:
# Setup required parameters to use AI-verde's OpenAI-compatible API
model_name = "js2/Llama-3.3-70B-Instruct-FP8-Dynamic"
# llm_host = os.environ.get('OPENAI_API_BASE', "https://llm-api.cyverse.org/v1")
llm_host = os.environ.get('OPENAI_API_BASE', "https://llm-api.cyverse.ai")
api_key = os.environ.get('OPENAI_API_KEY', 'sk-cQwvetyweWlKpJ3NANfEng')

In [None]:
# directly using langchain ChatOpenAI
llm = ChatOpenAI(
    model=model_name,
    temperature=0.2, 
    max_tokens=1000,
    api_key=api_key,
    base_url=llm_host,
)

In [21]:
llm.invoke("Hello, how are you?") # validate we can talk with the LLM


AIMessage(content="Hello! I'm just a language model, so I don't have feelings or emotions like humans do, but I'm functioning properly and ready to help with any questions or tasks you may have. How about you? How's your day going so far?", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 52, 'prompt_tokens': 41, 'total_tokens': 93, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'Llama-3.3-70B-Instruct-FP8-Dynamic', 'system_fingerprint': None, 'id': 'chatcmpl-1cfe7781a83a47fca8b96f98abc23e4f', 'finish_reason': 'stop', 'logprobs': None}, id='run-f8d30082-7084-4955-942b-c741237f2132-0', usage_metadata={'input_tokens': 41, 'output_tokens': 52, 'total_tokens': 93, 'input_token_details': {}, 'output_token_details': {}})

In [None]:
#set up questioning for LLM

context = "\n\n".join(retrieved_chunks)

prompt = f"""
You are a legal assistant. Use the following context to answer the user's question. 
Always cite your sources in brackets like [contract1.pdf chunk 3].

Context:
{context}

User question:
{query}

Answer:
"""

In [None]:
response = llm.invoke([prompt])
print(response.content)

In [None]:
response_text = str(response)

In [33]:
citations = extract_citations(response_text)

for c in citations:
    print(f"- Source: {c['source']}, Chunk: {c['chunk_index']}")
    # Optional: link to PDF viewer
    path = f"/pdf_viewer/{c['source']}#chunk={c['chunk_index']}"
    print(f"  ↪ Open: {path}")

- Source: CA-AB2013.pdf, Chunk: 6
  ↪ Open: /pdf_viewer/CA-AB2013.pdf#chunk=6
- Source: WA-1168.pdf, Chunk: 3
  ↪ Open: /pdf_viewer/WA-1168.pdf#chunk=3
- Source: OJ_L_202401689_EN_TXT.pdf, Chunk: 198
  ↪ Open: /pdf_viewer/OJ_L_202401689_EN_TXT.pdf#chunk=198
- Source: OJ_L_202401689_EN_TXT.pdf, Chunk: 198
  ↪ Open: /pdf_viewer/OJ_L_202401689_EN_TXT.pdf#chunk=198


ChromaDB -	Fast, persistent, vector store with easy metadata filters

Embeddings - Semantic similarity for retrieval

Chunking w/ overlap - Keeps context and flow between chunks

Source Highlighting	- Adds transparency and legal traceability

RAG Prompting - Combines knowledge + LLM fluency