## 1. Preprocessing and Saving Metadata

In [1]:
import os
import json
import pandas as pd
import re

In [2]:
def preprocess_text(text):
    """Clean and normalize text."""
    if not text:
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [3]:
def standardize_date(date):
    """Standardize date to YYYY-MM-DD format."""
    try:
        if len(date) == 4:  # Year only
            return pd.to_datetime(f"{date}-01-01").strftime("%Y-%m-%d")
        elif len(date) == 7:  # Year and month
            return pd.to_datetime(f"{date}-01").strftime("%Y-%m-%d")
        else:  # Full date
            return pd.to_datetime(date).strftime("%Y-%m-%d")
    except Exception:
        return None 

In [4]:
def preprocess_and_save_metadata(json_dir, metadata_file):
    """Preprocess raw JSON files and save metadata for FAISS."""
    all_data = []
    for file_name in os.listdir(json_dir):
        if file_name.endswith(".json"):
            with open(os.path.join(json_dir, file_name), "r") as f:
                data = json.load(f)
                row = {
                    "file": file_name,
                    "name": preprocess_text(data.get("name", "")),
                    "abbreviation": preprocess_text(data.get("name_abbreviation", "")),
                    "decision_date": standardize_date(data.get("decision_date", "")),
                    "text": " ".join(opinion.get("text", "") for opinion in data.get("casebody", {}).get("opinions", [])),
                }
                all_data.append(row)
    
    # Save processed metadata
    with open(metadata_file, "w") as f:
        json.dump(all_data, f, indent=4)
    print(f"Metadata saved to {metadata_file}")


In [5]:
json_dir = "json/"  # Folder containing raw JSON files
metadata_file = "data/metadata_faiss.json"  # Metadata file for FAISS
preprocess_and_save_metadata(json_dir, metadata_file)

Metadata saved to data/metadata_faiss.json


## 2. Creating FAISS Index

In [6]:
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [7]:
def embed_text(text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Generate embeddings for text."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

In [8]:
def create_faiss_index(metadata_file, index_file):
    """Create a FAISS index from metadata."""
    with open(metadata_file, "r") as f:
        metadata = json.load(f)
    
    # Generate embeddings for each text
    embeddings = [embed_text(entry["text"]) for entry in metadata]
    embeddings = np.vstack(embeddings)  # Combine embeddings into a matrix

    # Create and save FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_file)
    print(f"FAISS index saved to {index_file}")

In [9]:
create_faiss_index("data/metadata_faiss.json", "data/legal_cases_index.faiss")

2024-12-10 01:20:02.148596: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


FAISS index saved to data/legal_cases_index.faiss


## 3. Interactive Query System

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from difflib import SequenceMatcher

In [11]:
def is_similar(a, b, threshold=0.8):
    """Check if two strings are similar using SequenceMatcher."""
    return SequenceMatcher(None, a, b).ratio() > threshold

def handle_partial_matches(query, metadata):
    """Retrieve and rank partial matches."""
    query_normalized = preprocess_query(query)
    results = []

    for entry in metadata:
        name = preprocess_query(entry.get("name", ""))
        abbreviation = preprocess_query(entry.get("abbreviation", ""))
        
        if query_normalized in name or query_normalized in abbreviation or is_similar(query_normalized, name) or is_similar(query_normalized, abbreviation):
            results.append({
                "name": entry.get("name", "Unknown"),
                "abbreviation": entry.get("abbreviation", "Unknown"),
                "decision_date": entry.get("decision_date", "Unknown"),
                "jurisdiction": entry.get("jurisdiction", "Unknown"),
                "text": entry.get("cleaned_text", "No text available")
            })

    return results

In [12]:
def preprocess_query(query):
    """Normalize and clean the query for better matching."""
    query = query.strip().lower()
    query = re.sub(r'[^\w\s-]', '', query)
    query = re.sub(r'\s+', ' ', query)
    
    # Remove filler words
    filler_words = [
        'what about', 'can you', 'please', 'show me', 'find', 
        'search for', 'give me', 'how about', 'tell me about'
    ]
    for filler in filler_words:
        query = re.sub(r'\b' + re.escape(filler) + r'\b', '', query)

    return query.strip()


In [13]:
def query_index(user_query, index, metadata, k=5):
    """Query FAISS index and return top results with partial matching."""
    query_embedding = embed_text(user_query)
    distances, indices = index.search(query_embedding, k)
    results = []

    for i, idx in enumerate(indices[0]):
        metadata_entry = metadata[idx]
        results.append({
            "rank": i + 1,
            "file": metadata_entry.get("file", "Unknown"),
            "name": metadata_entry.get("name", "Unknown"),
            "abbreviation": metadata_entry.get("abbreviation", "Unknown"),
            "text": metadata_entry.get("text", "No text available"),
            "distance": float(distances[0][i]),
        })

    # partial matching from metadata
    partial_matches = []
    for entry in metadata:
        name = entry.get("name", "").lower()
        abbreviation = entry.get("abbreviation", "").lower()
        if user_query in name or user_query in abbreviation or is_similar(user_query, name) or is_similar(user_query, abbreviation):
            partial_matches.append({
                "rank": "Partial",
                "file": entry.get("file", "Unknown"),
                "name": entry.get("name", "Unknown"),
                "abbreviation": entry.get("abbreviation", "Unknown"),
                "text": entry.get("text", "No text available"),
                "distance": "N/A",
            })

    return results + partial_matches

In [14]:
def generate_summary(query, results):
    """Summarize content from retrieved results."""
    if not results:
        return "No relevant results to summarize."
    
    context = " ".join([doc.get("text", "")[:1024] for doc in results])  # Limit length for summarization
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    inputs = tokenizer(f"Query: {query} Context: {context}", return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = model.generate(inputs.input_ids, max_length=200, min_length=50, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [18]:
def query_system(index, metadata):
    """Interactive query system for legal cases."""
    while True:
        print("\nSelect a query type:")
        print("1. Search by Name")
        print("2. Search by Abbreviation")
        print("3. Search by Decision Date")
        print("4. Custom Query")
        choice = input("Enter choice (1-4 or 'exit'): ").strip()

        if choice.lower() == "exit":
            print("Exiting the system.")
            break

        query = input("Enter your query: ").strip()
        if choice == "3" and not re.match(r"\d{4}-\d{2}-\d{2}", query):
            print("Invalid date format. Please use YYYY-MM-DD.")
            continue

        results = handle_partial_matches(query, metadata)

        if not results:
            print("No matches found. Try refining your query.")
            continue

        print("\nResults:")
        for i, result in enumerate(results, start=1):
            print(f"{i}. {result['name']} (Decision Date: {result['decision_date']}, Jurisdiction: {result['jurisdiction']})")

        summary_choice = input("\nSummarize results? (yes/no): ").strip().lower()
        if summary_choice == "yes":
            summary = generate_summary(query, results)
            print(f"\nSummary:\n{summary}")


In [19]:
index = faiss.read_index("data/legal_cases_index.faiss")
with open("data/metadata_faiss.json", "r") as f:
    metadata = json.load(f)

In [20]:
query_system(index, metadata)


Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date
4. Custom Query


Enter choice (1-4 or 'exit'):  1
Enter your query:  pratt



Results:
1. pratt et al v united alaska min co (Decision Date: 1900-10-01, Jurisdiction: Unknown)



Summarize results? (yes/no):  yes



Summary:
 query: pratt context: No text available. context: Pratt is the name of a famous American baseball player. pratt is also known as the son of legendary baseball player Babe Ruth. Context: Prat is the surname of Babe Ruth, who was born in 1901.

Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date
4. Custom Query


Enter choice (1-4 or 'exit'):  exit


Exiting the system.
