In [20]:
import os
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import re

def parse_judgment_sections(content):
    """
    Parse judgment text into sections: Case facts, Issues, Arguments/Reasoning, Decision/Holding.
    
    Args:
        content: Full text content of judgment
        
    Returns:
        Dictionary with sections as keys
    """
    sections = {
        'case_facts': '',
        'issues': '',
        'arguments': '',
        'decision': ''
    }
    
    # Define section patterns (case-insensitive)
    patterns = {
        'case_facts': r'Case facts?:(.+?)(?=Issues?:|Arguments?|Decision|Holding|$)',
        'issues': r'Issues?:(.+?)(?=Arguments?|Reasoning|Decision|Holding|$)',
        'arguments': r'(?:Arguments?|Reasoning)[:/](.+?)(?=Decision|Holding|$)',
        'decision': r'(?:Decision|Holding)[:/](.+?)$'
    }
    
    for section_name, pattern in patterns.items():
        match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
        if match:
            sections[section_name] = match.group(1).strip()
    
    return sections

def read_judgment_files(folder_path):
    """
    Read all text files from a folder and parse them into sections.
    
    Args:
        folder_path: Path to folder containing text files
        
    Returns:
        List of dictionaries with filename and sections
    """
    judgments = []
    folder = Path(folder_path)
    
    if not folder.exists():
        raise FileNotFoundError(f"Folder not found: {folder_path}")
    
    # Read all .txt files
    for file_path in folder.glob("*.txt"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
                if content:
                    sections = parse_judgment_sections(content)
                    judgments.append({
                        'filename': file_path.name,
                        'sections': sections
                    })
                    print(f"Read: {file_path.name}")
                    print(f"  - Case facts: {len(sections['case_facts'])} chars")
                    print(f"  - Issues: {len(sections['issues'])} chars")
                    print(f"  - Arguments: {len(sections['arguments'])} chars")
                    print(f"  - Decision: {len(sections['decision'])} chars")
        except Exception as e:
            print(f"Error reading {file_path.name}: {e}")
    
    return judgments

def create_section_embeddings(judgments, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """
    Create embeddings for each section, but maintain as single block per judgment.
    
    Args:
        judgments: List of judgment dictionaries
        model_name: Name of the sentence transformer model
        
    Returns:
        tuple: (embeddings_array, metadata_list)
    """
    print(f"\nLoading model: {model_name}")
    model = SentenceTransformer(model_name)
    
    all_embeddings = []
    metadata_list = []
    
    section_names = ['case_facts', 'issues', 'arguments', 'decision']
    
    for judgment in judgments:
        # Create embeddings for each section
        section_embeddings = []
        
        for section_name in section_names:
            section_text = judgment['sections'][section_name]
            if section_text:
                embedding = model.encode(section_text, convert_to_numpy=True)
                section_embeddings.append(embedding)
            else:
                # If section is empty, use zero vector
                dimension = model.get_sentence_embedding_dimension()
                section_embeddings.append(np.zeros(dimension))
        
        # Stack section embeddings horizontally (concatenate)
        # This creates a single vector representing all 4 sections
        combined_embedding = np.concatenate(section_embeddings)
        
        all_embeddings.append(combined_embedding)
        
        # Store metadata - only sections, no duplication
        metadata_list.append({
            'filename': judgment['filename'],
            'sections': judgment['sections']
        })
    
    embeddings_array = np.array(all_embeddings)
    
    # Normalize embeddings for cosine similarity
    embeddings_array = embeddings_array / np.linalg.norm(embeddings_array, axis=1, keepdims=True)
    
    return embeddings_array, metadata_list, model

def create_faiss_index(embeddings):
    """
    Create FAISS index from embeddings.
    
    Args:
        embeddings: numpy array of embeddings
        
    Returns:
        FAISS index
    """
    dimension = embeddings.shape[1]
    print(f"\nCreating FAISS index (dimension: {dimension})...")
    
    # Using IndexFlatIP for cosine similarity (after normalization)
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings.astype('float32'))
    
    print(f"FAISS index created with {index.ntotal} vectors")
    print(f"Each vector represents 4 sections (dimension = {dimension/4} per section)")
    
    return index

def save_index_and_metadata(index, metadata, output_dir='faiss_index'):
    """
    Save FAISS index and metadata to disk (JSON format for metadata).
    
    Args:
        index: FAISS index
        metadata: List of metadata dictionaries
        output_dir: Directory to save files
    """
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    # Save FAISS index
    index_file = output_path / 'judgments.index'
    faiss.write_index(index, str(index_file))
    print(f"\nFAISS index saved to: {index_file}")
    
    # Save metadata as JSON
    metadata_file = output_path / 'metadata.json'
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    print(f"Metadata saved to: {metadata_file}")

def load_index_and_metadata(input_dir='faiss_index'):
    """
    Load FAISS index and metadata from disk.
    
    Args:
        input_dir: Directory containing saved files
        
    Returns:
        tuple: (index, metadata)
    """
    input_path = Path(input_dir)
    
    # Load FAISS index
    index_file = input_path / 'judgments.index'
    index = faiss.read_index(str(index_file))
    print(f"FAISS index loaded from: {index_file}")
    
    # Load metadata from JSON
    metadata_file = input_path / 'metadata.json'
    with open(metadata_file, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    print(f"Metadata loaded from: {metadata_file}")
    
    return index, metadata

def search_by_section(query, section_type, index, metadata, model, top_k=5):
    """
    Search for similar judgments by focusing on a specific section.
    
    Args:
        query: Query text
        section_type: 'case_facts', 'issues', 'arguments', or 'decision'
        index: FAISS index
        metadata: List of metadata dictionaries
        model: SentenceTransformer model
        top_k: Number of top results to return
        
    Returns:
        List of tuples: (filename, similarity_score, relevant_section_text)
    """
    section_names = ['case_facts', 'issues', 'arguments', 'decision']
    section_index = section_names.index(section_type)
    
    # Generate query embedding
    query_embedding = model.encode(query, convert_to_numpy=True)
    
    # Create full embedding vector with query in the right position
    dimension = model.get_sentence_embedding_dimension()
    full_embedding = np.zeros(dimension * 4)
    full_embedding[section_index * dimension:(section_index + 1) * dimension] = query_embedding
    
    # Normalize
    full_embedding = full_embedding / np.linalg.norm(full_embedding)
    full_embedding = full_embedding.reshape(1, -1)
    
    # Search
    distances, indices = index.search(full_embedding.astype('float32'), top_k)
    
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        if idx < len(metadata):
            meta = metadata[idx]
            section_text = meta['sections'][section_type]
            text_snippet = section_text[:200] + "..." if len(section_text) > 200 else section_text
            results.append((meta['filename'], float(distance), text_snippet))
    
    return results

def search_all_sections(query, index, metadata, model, top_k=5):
    """
    Search across all sections equally.
    
    Args:
        query: Query text
        index: FAISS index
        metadata: List of metadata dictionaries
        model: SentenceTransformer model
        top_k: Number of top results to return
        
    Returns:
        List of tuples: (filename, similarity_score, sections_dict)
    """
    # Generate query embedding for all sections
    query_embedding = model.encode(query, convert_to_numpy=True)
    
    # Repeat query embedding 4 times (for all sections)
    full_embedding = np.tile(query_embedding, 4)
    
    # Normalize
    full_embedding = full_embedding / np.linalg.norm(full_embedding)
    full_embedding = full_embedding.reshape(1, -1)
    
    # Search
    distances, indices = index.search(full_embedding.astype('float32'), top_k)
    
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        if idx < len(metadata):
            meta = metadata[idx]
            results.append((meta['filename'], float(distance), meta['sections']))
    
    return results

# Main execution
if __name__ == "__main__":
    # Configuration
    FOLDER_PATH = "summary"  # Change this to your folder path
    OUTPUT_DIR = "./"
    
    try:
        # Step 1: Read and parse judgment files
        print("=" * 60)
        print("STEP 1: Reading and parsing judgment files")
        print("=" * 60)
        judgments = read_judgment_files(FOLDER_PATH)
        print(f"\nTotal files read: {len(judgments)}")
        
        if not judgments:
            print("No judgment files found. Please check the folder path.")
            exit(1)
        
        # Step 2: Create section embeddings (4 sections per judgment as single block)
        print("\n" + "=" * 60)
        print("STEP 2: Creating section embeddings")
        print("=" * 60)
        embeddings, metadata_list, model = create_section_embeddings(judgments)
        print(f"\nGenerated {len(embeddings)} judgment vectors")
        print(f"Each vector contains 4 section embeddings concatenated")
        
        # Step 3: Create FAISS index
        print("\n" + "=" * 60)
        print("STEP 3: Creating FAISS index")
        print("=" * 60)
        index = create_faiss_index(embeddings)
        
        # Step 4: Save index and metadata (JSON format)
        print("\n" + "=" * 60)
        print("STEP 4: Saving index and metadata")
        print("=" * 60)
        save_index_and_metadata(index, metadata_list, OUTPUT_DIR)
        
        
        
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()

STEP 1: Reading and parsing judgment files
Read: 30.txt
  - Case facts: 615 chars
  - Issues: 266 chars
  - Arguments: 873 chars
  - Decision: 192 chars
Read: 38.txt
  - Case facts: 497 chars
  - Issues: 193 chars
  - Arguments: 534 chars
  - Decision: 126 chars
Read: 7.txt
  - Case facts: 501 chars
  - Issues: 307 chars
  - Arguments: 736 chars
  - Decision: 237 chars

Total files read: 3

STEP 2: Creating section embeddings

Loading model: sentence-transformers/all-MiniLM-L6-v2

Generated 3 judgment vectors
Each vector contains 4 section embeddings concatenated

STEP 3: Creating FAISS index

Creating FAISS index (dimension: 1536)...
FAISS index created with 3 vectors
Each vector represents 4 sections (dimension = 384.0 per section)

STEP 4: Saving index and metadata

FAISS index saved to: judgments.index
Metadata saved to: metadata.json


In [11]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import pandas as pd
import re
import os

# --- Configuration (Update as needed) ---
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
CSV_FILE = "ipc_formatted_clean.csv"
INDEX_FILE = "ipc.index"
METADATA_FILE = "ipc.json"

# --- 1. Load and Prepare Data ---

# Attempt to load the file using a permissive encoding to avoid Unicode errors
ipc = pd.read_csv(CSV_FILE, encoding='cp1252')

# Fix the 'AttributeError: 'float' object has no attribute 'strip'' by filling NaN values.
# This is crucial for robust string processing.
columns_to_clean = ['Section Code', 'Description', 'Punishment/Consequence']
for col in columns_to_clean:
    if col in ipc.columns:
        ipc[col] = ipc[col].fillna('').astype(str)

# --- 2. Format Data for Embeddings (The "final_text" creation logic) ---

def format_section(row):
    """Formats a single row into a coherent document block for the Sentence Transformer."""
    section_code = row["Section Code"].strip()
    description = row["Description"].strip()
    punishment = row["Punishment/Consequence"].strip()
    
    # Create the clean, dense text block for semantic search
    return f"""
SECTION: {section_code}
DESCRIPTION: {description}
PUNISHMENT: {punishment}
"""

# Apply the function to the DataFrame to get a list of formatted documents
formatted_series = ipc.apply(format_section, axis=1)

# Documents for Embedding (the core text) and Metadata (for lookup)
documents = formatted_series.tolist()
metadata = ipc.rename(columns={'Section Code': 'section_no', 'Punishment/Consequence': 'punishment_raw'}).to_dict('records')

# --- 3. Create Embeddings and FAISS Index ---

# Load embedding model
embedder = SentenceTransformer(EMBEDDING_MODEL)

# Generate embeddings for all documents
embeddings = embedder.encode(documents)

# Convert to a float32 NumPy array, as required by FAISS
embeddings_np = np.array(embeddings).astype("float32")
dimension = embeddings_np.shape[1]

# Create a FAISS Index (IndexFlatL2 uses Euclidean distance for search)
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# --- 4. Save Index and Metadata ---

# Save the FAISS index
faiss.write_index(index, INDEX_FILE)

# Save the metadata (original data) mapped by its index position
with open(METADATA_FILE, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

print(f"Successfully created and saved {len(documents)} IPC sections:")
print(f"Index: {INDEX_FILE}")
print(f"Metadata: {METADATA_FILE}")

Successfully created and saved 511 IPC sections:
Index: ipc.index
Metadata: ipc.json
