In [2]:
import os
import re
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# ======= CONFIG =======
# Folder that is scanned for ``.txt`` judgment summaries further down in this cell.
SUMMARY_FOLDER = "summary"
# Destination file for the structured (per-section) documents, written as JSON.
JSON_OUTPUT = "judgments.json"
# Destination file for the FAISS index built from the document embeddings.
FAISS_INDEX = "judgments.index"
# Model identifier handed to SentenceTransformer below.
# NOTE(review): looks like a Hugging Face Hub id that is downloaded on first
# use (the captured cell output shows model files being fetched) — confirm.
EMBEDDING_MODEL = "bhavyagiri/InLegal-Sbert"
# ======================

# Load embedding model (instantiated once at cell level and reused for encoding below)
model = SentenceTransformer(EMBEDDING_MODEL)

# Keyword patterns that identify WHICH section a heading introduces.
# They are applied only to the short "title" part of a heading-shaped line
# (see _match_heading), never to whole body sentences. Word boundaries stop
# partial-word hits such as "reissued" matching "issue".
section_patterns = {
    "case_facts": re.compile(r"\bcase\s*facts\b", re.IGNORECASE),
    "issues": re.compile(r"\bissues?\b", re.IGNORECASE),
    "arguments": re.compile(r"\b(?:arguments?|reasoning)\b", re.IGNORECASE),
    "decision": re.compile(r"\b(?:decisions?|holdings?|judge?ments?)\b", re.IGNORECASE),
}

# A heading title longer than this many words is assumed to be a body sentence.
_MAX_HEADING_WORDS = 6
# Markdown-style decoration that may wrap a heading or its inline text.
_HEADING_DECORATION = " \t*_#"
# One alphabetic word (digits and underscores excluded, so "2." numbering is not counted).
_WORD = re.compile(r"[^\W\d_]+")
# A line without a colon that ends like a sentence is treated as body text.
_SENTENCE_ENDINGS = (".", "?", "!", ",", ";")


def _match_heading(line):
    """Classify one stripped, non-empty line as a section heading.

    A line counts as a heading when its title part (the text before the first
    colon, or the whole line if there is no colon) is at most
    ``_MAX_HEADING_WORDS`` words long, contains a section keyword, and -- when
    there is no colon -- does not end with sentence punctuation.

    Returns:
        ``(section_key, inline_text)`` for a heading, where ``inline_text`` is
        whatever follows the colon on the same line (possibly ``""``);
        ``None`` if the line is ordinary body text.
    """
    title, colon, remainder = line.partition(":")

    if len(_WORD.findall(title)) > _MAX_HEADING_WORDS:
        return None
    if not colon and title.rstrip(_HEADING_DECORATION).endswith(_SENTENCE_ENDINGS):
        return None

    # If several keywords occur in the title, the one that appears first wins
    # (e.g. "Decision on the issues" is a decision heading). Ties keep the
    # dictionary order of section_patterns.
    best_key, best_start = None, None
    for key, pattern in section_patterns.items():
        found = pattern.search(title)
        if found and (best_start is None or found.start() < best_start):
            best_key, best_start = key, found.start()

    if best_key is None:
        return None
    return best_key, remainder.strip(_HEADING_DECORATION)


def extract_sections_fuzzy(text):
    """Split a judgment summary into its four known sections.

    The text is scanned line by line. A heading line (see ``_match_heading``)
    switches the current section; every other non-blank line is appended to
    the current section. Text that follows a heading on the same line after a
    colon ("Decision: The appeal is dismissed.") is kept as section content.
    Lines that appear before the first heading are ignored.

    Unlike a plain keyword search over every line, body sentences that merely
    mention a keyword ("... challenged the decision of the trial court.") are
    not mistaken for headings, so they are neither dropped nor allowed to
    switch the section.

    Args:
        text: Full text of one summary.

    Returns:
        dict with one entry per key of ``section_patterns``. Each value is the
        section's lines joined by single spaces, with one trailing space after
        the last line, or ``""`` if the section was not found.
    """
    sections = {key: "" for key in section_patterns}
    current_key = None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        heading = _match_heading(line)
        if heading is not None:
            # Switch section; carry on with any inline text after the colon.
            current_key, line = heading
            if not line:
                continue

        if current_key is not None:
            sections[current_key] += line + " "

    return sections

# Process all summaries in folder.
# sorted() makes the document order reproducible between runs; os.listdir()
# returns entries in arbitrary order, and position i in the JSON list must
# correspond to row i of the FAISS index.
documents = []
for file_name in sorted(os.listdir(SUMMARY_FOLDER)):
    if not file_name.endswith(".txt"):
        continue

    file_path = os.path.join(SUMMARY_FOLDER, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    sections = extract_sections_fuzzy(text)
    doc_id = os.path.splitext(file_name)[0]  # file name without ".txt"

    doc = {
        "id": doc_id,
        "case_facts": sections["case_facts"].strip(),
        "issues": sections["issues"].strip(),
        "arguments": sections["arguments"].strip(),
        "decision": sections["decision"].strip()
    }
    documents.append(doc)

# Fail early with a clear message instead of writing an empty JSON file and
# then crashing on embeddings.shape[1] below.
if not documents:
    raise FileNotFoundError(
        f"No .txt summaries found in {SUMMARY_FOLDER!r}; nothing to index."
    )

# Save as JSON
with open(JSON_OUTPUT, "w", encoding="utf-8") as f:
    json.dump(documents, f, indent=4, ensure_ascii=False)

# Build FAISS index: one text per document, all four sections joined by spaces.
texts = [
    " ".join([doc["case_facts"], doc["issues"], doc["arguments"], doc["decision"]])
    for doc in documents
]

# Surface documents where no section heading was recognised: they are still
# indexed (to keep JSON positions and index rows aligned) but their embedding
# is that of an empty string.
empty_ids = [doc["id"] for doc, combined in zip(documents, texts) if not combined.strip()]
if empty_ids:
    print(
        f"Warning: {len(empty_ids)} document(s) had no recognised sections "
        f"and were embedded as empty text, e.g. {empty_ids[:5]}"
    )

# FAISS expects a contiguous float32 matrix; cast explicitly rather than
# relying on the model's output dtype.
embeddings = np.ascontiguousarray(
    model.encode(texts, convert_to_numpy=True), dtype="float32"
)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 search
index.add(embeddings)

faiss.write_index(index, FAISS_INDEX)

print(f"Processed {len(documents)} documents. JSON and FAISS index saved.")


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Processed 933 documents. JSON and FAISS index saved.


In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import pandas as pd
import re
import os

# --- Configuration (Update as needed) ---
EMBEDDING_MODEL = "bhavyagiri/InLegal-Sbert"
CSV_FILE = "ipc_formatted_clean.csv"
INDEX_FILE = "ipc.index"
METADATA_FILE = "ipc.json"

# --- 1. Load and Prepare Data ---

# The CSV is decoded as Windows-1252 (cp1252).
# NOTE(review): presumably chosen because the file is not valid UTF-8. If the
# file is ever re-exported as UTF-8, change this encoding, otherwise non-ASCII
# characters will be decoded incorrectly without any error — confirm.
ipc = pd.read_csv(CSV_FILE, encoding='cp1252')

# These three columns are required by format_section(). Check for them up
# front so a wrong or renamed header fails here with a readable message
# instead of as a KeyError raised from inside DataFrame.apply later on.
columns_to_clean = ['Section Code', 'Description', 'Punishment/Consequence']
missing_columns = [col for col in columns_to_clean if col not in ipc.columns]
if missing_columns:
    raise KeyError(
        f"{CSV_FILE} is missing required column(s) {missing_columns}; "
        f"found columns: {list(ipc.columns)}"
    )

# Empty cells are read as NaN (a float), which has no .strip(); replace them
# with '' and force every value to str so later string processing is safe.
for col in columns_to_clean:
    ipc[col] = ipc[col].fillna('').astype(str)
# --- 2. Format Data for Embeddings (The "final_text" creation logic) ---

def format_section(row):
    """Render one IPC row as the labelled text block that gets embedded.

    Args:
        row: Mapping (e.g. a DataFrame row) with string values under
            "Section Code", "Description" and "Punishment/Consequence".

    Returns:
        A string of three ``LABEL: value`` lines (SECTION, DESCRIPTION,
        PUNISHMENT), each value whitespace-stripped, with a newline before the
        first line and after the last one.
    """
    # (label shown in the text, column the value is read from), in output order
    layout = (
        ("SECTION", "Section Code"),
        ("DESCRIPTION", "Description"),
        ("PUNISHMENT", "Punishment/Consequence"),
    )
    rendered = [f"{label}: {row[column].strip()}" for label, column in layout]
    return "\n" + "\n".join(rendered) + "\n"

# Apply the function to the DataFrame to get one formatted text per row
formatted_series = ipc.apply(format_section, axis=1)

# Documents for Embedding (the core text) and Metadata (for lookup).
# Both are in DataFrame row order, so metadata[i] describes index row i.
documents = formatted_series.tolist()

# Only three columns were NaN-filled above; any other column may still hold
# NaN, which json.dump would write as the bare token NaN (not valid JSON).
# Map missing values to None so they are serialised as null.
metadata = [
    {key: (None if pd.isna(value) else value) for key, value in record.items()}
    for record in ipc.rename(
        columns={'Section Code': 'section_no', 'Punishment/Consequence': 'punishment_raw'}
    ).to_dict('records')
]

# --- 3. Create Embeddings and FAISS Index ---

# Load embedding model
embedder = SentenceTransformer(EMBEDDING_MODEL)

# Generate embeddings for all documents
embeddings = embedder.encode(documents)

# Convert to a float32 NumPy array, as required by FAISS
embeddings_np = np.array(embeddings).astype("float32")
dimension = embeddings_np.shape[1]

# Create a FAISS Index (IndexFlatL2 uses Euclidean distance for search)
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# --- 4. Save Index and Metadata ---

# Save the FAISS index
faiss.write_index(index, INDEX_FILE)

# Save the metadata (original data) mapped by its index position.
# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file,
# matching how the judgments JSON is written earlier in this notebook.
with open(METADATA_FILE, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print(f"Successfully created and saved {len(documents)} IPC sections:")
print(f"Index: {INDEX_FILE}")
print(f"Metadata: {METADATA_FILE}")

Successfully created and saved 511 IPC sections:
Index: ipc.index
Metadata: ipc.json
