In [22]:
import json
import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


In [None]:

# --- Configuration ---
INPUT_FILE = '../Data/CUADv1.json'
OUTPUT_CHUNKS_FILE = '../Data/processed_chunks.csv'
OUTPUT_EMBEDDINGS_FILE = '../Data/embeddings.npy'
NUM_CONTRACTS_TO_PROCESS = 100  # Subset for the 5-day sprint
CHUNK_SIZE = 500  # Words
OVERLAP = 100     # Words

# --- Helper Functions ---

def clean_text(text):
    # normalize weird spaces but keep paragraph breaks
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # collapse 3+ newlines → 2, and 2+ spaces → 1
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]{2,}', ' ', text)
    return text.strip()

def chunk_text(text, chunk_size, overlap):
    """Splits text into overlapping chunks based on word count."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

: 

In [19]:
# --- Main Processing ---

print(f"Loading data from {INPUT_FILE}...")
with open(INPUT_FILE, 'r') as f:
    data = json.load(f)

# List to hold all processed chunk data
all_chunks_data = []

print(f"Processing {NUM_CONTRACTS_TO_PROCESS} contracts...")
for i, article in enumerate(tqdm(data['data'], total=NUM_CONTRACTS_TO_PROCESS)):
    if i >= NUM_CONTRACTS_TO_PROCESS:
        break
        
    contract_title = article['title']
    
    # --- THIS IS THE FIX ---
    # Iterate over ALL paragraphs in the article
    for para_index, paragraph in enumerate(article['paragraphs']):
        context = paragraph['context']
        
        # 1. Clean the text
        cleaned_context = clean_text(context)
        
        # 2. Chunk the text
        chunks = chunk_text(cleaned_context, CHUNK_SIZE, OVERLAP)
        
        # 3. Store chunks and metadata
        for chunk_index, chunk in enumerate(chunks):
            # Create a unique ID for every chunk
            chunk_id = f"{contract_title}_{para_index}_{chunk_index}"
            all_chunks_data.append({
                'contract_title': contract_title,
                'paragraph_index': para_index,
                'chunk_id': chunk_id,
                'chunk_text': chunk
            })
    # --- END OF FIX ---

print(f"\nCreated {len(all_chunks_data)} chunks from {NUM_CONTRACTS_TO_PROCESS} contracts.")

Loading data from ../Data/CUADv1.json...
Processing 100 contracts...


100%|██████████| 100/100 [00:00<00:00, 870.56it/s]


Created 2009 chunks from 100 contracts.





In [20]:
# --- Embedding Generation ---

print("Loading SBERT model (all-MiniLM-L6-v2)...")
# This will download the model the first time you run it
model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract just the text to be embedded
chunk_texts_to_embed = [d['chunk_text'] for d in all_chunks_data]

print(f"Generating embeddings for {len(chunk_texts_to_embed)} chunks...")
# This is the most time-consuming step
# .encode() shows a progress bar by default
all_embeddings = model.encode(chunk_texts_to_embed, show_progress_bar=True)



Loading SBERT model (all-MiniLM-L6-v2)...
Generating embeddings for 2009 chunks...
Generating embeddings for 2009 chunks...


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

In [21]:
# --- Saving Output ---

# 1. Save the processed chunks and metadata
df_chunks = pd.DataFrame(all_chunks_data)
df_chunks.to_csv(OUTPUT_CHUNKS_FILE, index=False)
print(f"\nSuccessfully saved processed chunks to {OUTPUT_CHUNKS_FILE}")

# 2. Save the embeddings as a NumPy file
np.save(OUTPUT_EMBEDDINGS_FILE, all_embeddings)
print(f"Successfully saved embeddings to {OUTPUT_EMBEDDINGS_FILE}")
print("\nData processing complete. You are ready to try FAISS again.")


Successfully saved processed chunks to ../Data/processed_chunks.csv
Successfully saved embeddings to ../Data/embeddings.npy

Data processing complete. You are ready to try FAISS again.
