## 1. Preprocessing and Saving Metadata

In [1]:
import os
import json
import pandas as pd
import re

In [2]:
def preprocess_text(text):
    """Return a normalized copy of ``text``.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is dropped, and runs of whitespace are
    squeezed to a single space. Falsy input (``None``, ``""``) yields ``""``.
    """
    if text:
        lowered = text.lower()
        without_punctuation = re.sub(r"[^\w\s]", "", lowered)
        single_spaced = re.sub(r"\s+", " ", without_punctuation)
        return single_spaced.strip()
    return ""

In [3]:
def standardize_date(date):
    """Convert a date string to ``YYYY-MM-DD``.

    A 4-character value is treated as a year and a 7-character value as
    year-month; both are padded to the first day of the period. Anything
    else is handed to pandas as-is. Returns ``None`` when the value cannot
    be parsed (including non-sized input such as ``None``).
    """
    try:
        length = len(date)
        if length == 4:  # Year only
            candidate = f"{date}-01-01"
        elif length == 7:  # Year and month
            candidate = f"{date}-01"
        else:  # Full date
            candidate = date
        parsed = pd.to_datetime(candidate)
        return parsed.strftime("%Y-%m-%d")
    except Exception:
        return None

In [4]:
def preprocess_and_save_metadata(json_dir, metadata_file):
    """Preprocess raw JSON case files and save their metadata for FAISS.

    Every ``*.json`` file directly inside ``json_dir`` is read and reduced to
    one record with the keys ``file``, ``name``, ``abbreviation``,
    ``decision_date`` and ``text`` (all opinion texts joined by a space).
    The records are written to ``metadata_file`` as an indented JSON list.

    Args:
        json_dir: Directory containing the raw case files.
        metadata_file: Destination path of the combined metadata JSON; its
            parent directory is created when missing.
    """
    all_data = []
    # os.listdir() returns names in arbitrary order; sorting keeps the record
    # order (and therefore the row order of any index built from it) stable
    # between runs.
    for file_name in sorted(os.listdir(json_dir)):
        if not file_name.endswith(".json"):
            continue

        # Decode as UTF-8 explicitly instead of relying on the platform default.
        with open(os.path.join(json_dir, file_name), "r", encoding="utf-8") as f:
            data = json.load(f)

        # ``or`` fallbacks tolerate keys that are present but null.
        opinions = (data.get("casebody") or {}).get("opinions") or []
        all_data.append({
            "file": file_name,
            "name": preprocess_text(data.get("name", "")),
            "abbreviation": preprocess_text(data.get("name_abbreviation", "")),
            "decision_date": standardize_date(data.get("decision_date", "")),
            "text": " ".join(opinion.get("text") or "" for opinion in opinions),
        })

    # Save processed metadata, creating the target folder on a fresh checkout.
    output_dir = os.path.dirname(metadata_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=4)
    print(f"Metadata saved to {metadata_file}")


In [5]:
# Run the preprocessing step: read the raw case files under json_dir and
# write the combined metadata list to metadata_file.
json_dir = "json/"  # Folder containing raw JSON files
metadata_file = "data/metadata_faiss.json"  # Metadata file for FAISS
preprocess_and_save_metadata(json_dir, metadata_file)

Metadata saved to data/metadata_faiss.json


## 2. Creating FAISS Index

In [6]:
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [7]:
# Tokenizer/model pairs that have already been loaded, keyed by model name.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the ``(tokenizer, model)`` pair for ``model_name``, loading it once."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _EMBEDDING_MODELS[model_name]


def embed_text(text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Generate embeddings for text.

    Args:
        text: Input passed to the tokenizer (truncated to 512 tokens).
        model_name: Hugging Face checkpoint used for tokenizing and encoding.

    Returns:
        A 2-D NumPy array holding the mean of the token vectors of the
        model's last hidden state for each input sequence.
    """
    # Loading a checkpoint is expensive; reuse it across calls instead of
    # re-reading it for every document and every query.
    tokenizer, model = _get_embedding_model(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        # Average only over real tokens: weight each position by the
        # attention mask so padding does not dilute the mean when several
        # sequences of different length are embedded together.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = (hidden * mask).sum(dim=1) / token_counts
    return embeddings.cpu().numpy()

In [8]:
def create_faiss_index(metadata_file, index_file):
    """Create a FAISS index from metadata.

    Args:
        metadata_file: Path of a JSON list whose entries have a ``"text"`` key.
        index_file: Path the flat L2 index is written to.

    Raises:
        ValueError: If ``metadata_file`` contains no entries.
    """
    with open(metadata_file, "r", encoding="utf-8") as f:
        metadata = json.load(f)

    # Fail with a clear message instead of the opaque error np.vstack raises
    # for an empty list.
    if not metadata:
        raise ValueError(f"No entries found in {metadata_file}; nothing to index.")

    # Generate embeddings for each text and stack them into one matrix with
    # one row per metadata entry (row i corresponds to metadata[i]).
    embeddings = np.vstack([embed_text(entry["text"]) for entry in metadata])
    # FAISS operates on contiguous float32 matrices.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Create and save FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_file)
    print(f"FAISS index saved to {index_file}")

In [9]:
# Build the FAISS index from the metadata file produced by the preprocessing
# step and write it next to that file.
create_faiss_index("data/metadata_faiss.json", "data/legal_cases_index.faiss")

2024-12-13 18:01:42.821353: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


FAISS index saved to data/legal_cases_index.faiss


## 3. Interactive Query System

In [10]:
#!pip install rapidfuzz

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from difflib import SequenceMatcher

In [14]:
from rapidfuzz import fuzz

def is_similar(a, b, threshold=80):
    """Return True when the partial-ratio score of ``a`` and ``b`` exceeds ``threshold``."""
    score = fuzz.partial_ratio(a, b)
    return score > threshold

def handle_partial_matches(query, metadata, threshold=80):
    """Retrieve and rank partial matches.

    The query and each entry's ``name``, ``abbreviation`` and ``text`` are
    normalized with ``preprocess_query`` and compared with
    ``fuzz.partial_ratio``. An entry matches when its best field score is
    strictly greater than ``threshold``.

    Args:
        query: Raw user query.
        metadata: List of metadata dicts.
        threshold: Minimum (exclusive) partial-ratio score for a match.

    Returns:
        The matching entries, best score first. Entries with equal scores
        keep their order from ``metadata``.
    """
    query_normalized = preprocess_query(query)
    # A query that normalizes to nothing carries no signal to match on.
    if not query_normalized:
        return []

    # NOTE(review): preprocess_query returns only the date when its input
    # contains a YYYY-MM-DD pattern, so a field containing such a date is
    # compared by that date alone.
    scored = []
    for entry in metadata:
        best_score = max(
            fuzz.partial_ratio(query_normalized, preprocess_query(entry.get(field) or ""))
            for field in ("name", "abbreviation", "text")
        )
        if best_score > threshold:
            scored.append((best_score, entry))

    # list.sort is stable, so ties stay in metadata order even with reverse=True.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [entry for _, entry in scored]


In [15]:
def preprocess_query(query):
    """Normalize and clean the query for better matching.

    Steps: lower-case and trim, drop punctuation other than hyphens, remove
    conversational filler phrases, and collapse whitespace. If the cleaned
    query contains a ``YYYY-MM-DD`` date, only that date is returned.

    Args:
        query: Raw query text; falsy input (``None``, ``""``) yields ``""``.

    Returns:
        The extracted date string, or the cleaned query.
    """
    if not query:
        return ""

    query = query.strip().lower()
    query = re.sub(r'[^\w\s-]', '', query)
    # Collapse whitespace first so multi-word fillers match regardless of spacing.
    query = re.sub(r'\s+', ' ', query)

    # Remove filler words
    filler_words = [
        'what about', 'can you', 'please', 'show me', 'find',
        'search for', 'give me', 'how about', 'tell me about',
        'what is', 'on', 'the case on', 'case from', 'is there a case'
    ]
    # Try longer phrases first: otherwise 'on' is stripped before
    # 'the case on' gets a chance to match as a whole.
    alternatives = '|'.join(
        re.escape(filler) for filler in sorted(filler_words, key=len, reverse=True)
    )
    query = re.sub(r'\b(?:' + alternatives + r')\b', '', query)

    # Removing a filler can leave doubled or edge spaces; tidy up again.
    query = re.sub(r'\s+', ' ', query).strip()

    # Extract date if present in the query
    match = re.search(r'\d{4}-\d{2}-\d{2}', query)
    if match:
        return match.group(0)  # Return the date in YYYY-MM-DD format

    return query

In [16]:
def _match_record(entry, rank, distance):
    """Build one result row from a metadata entry."""
    return {
        "rank": rank,
        "file": entry.get("file", "Unknown"),
        "name": entry.get("name", "Unknown"),
        "abbreviation": entry.get("abbreviation", "Unknown"),
        "text": entry.get("text", "No text available"),
        "distance": distance,
    }


def query_index(user_query, index, metadata, k=5):
    """Query FAISS index and return top results with partial matching.

    Args:
        user_query: Free-text query.
        index: FAISS index whose row ``i`` corresponds to ``metadata[i]``.
        metadata: List of metadata dicts.
        k: Number of nearest neighbours requested from the index.

    Returns:
        A list of result dicts: first the nearest neighbours (``rank`` is the
        1-based position, ``distance`` a float), then every entry whose name
        or abbreviation contains or fuzzily matches the query (``rank`` is
        ``"Partial"``, ``distance`` is ``"N/A"``).
    """
    query_embedding = embed_text(user_query)
    distances, indices = index.search(query_embedding, k)

    results = []
    for position, idx in enumerate(indices[0]):
        # FAISS fills missing neighbours with label -1 when the index holds
        # fewer than k vectors; metadata[-1] would silently return the last
        # entry, so skip those slots.
        if idx < 0:
            continue
        results.append(_match_record(metadata[idx], position + 1, float(distances[0][position])))

    # partial matching from metadata
    # Names are compared lower-cased, so the query must be lower-cased too.
    needle = user_query.strip().lower()
    partial_matches = []
    if needle:  # an empty string is a substring of everything
        for entry in metadata:
            name = (entry.get("name") or "").lower()
            abbreviation = (entry.get("abbreviation") or "").lower()
            if (
                needle in name
                or needle in abbreviation
                or is_similar(needle, name)
                or is_similar(needle, abbreviation)
            ):
                partial_matches.append(_match_record(entry, "Partial", "N/A"))

    return results + partial_matches

In [17]:
# Summarization tokenizer/model pairs that have already been loaded.
_SUMMARIZERS = {}


def _get_summarizer(model_name):
    """Return the ``(tokenizer, model)`` pair for ``model_name``, loading it once."""
    if model_name not in _SUMMARIZERS:
        _SUMMARIZERS[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    return _SUMMARIZERS[model_name]


def generate_summary(query, results, model_name="facebook/bart-large-cnn"):
    """Summarize content from retrieved results.

    Args:
        query: The user's query; it is prepended to the model input.
        results: Result dicts; the first 512 characters of each non-empty
            ``"text"`` value are concatenated as context.
        model_name: Hugging Face seq2seq checkpoint used for summarization.

    Returns:
        The generated summary, or an explanatory message when there is
        nothing to summarize.
    """
    if not results:
        return "No relevant results to summarize."

    # Filter results for meaningful text
    context = " ".join(entry.get("text", "")[:512] for entry in results if entry.get("text"))
    if not context.strip():
        return "No sufficient text available to summarize."

    # Reuse the loaded checkpoint instead of re-reading it for every summary.
    tokenizer, model = _get_summarizer(model_name)
    inputs = tokenizer(f"Query: {query} Context: {context}", return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=200,
        min_length=50,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [18]:
def query_system(index, metadata):
    """Interactive query system for legal cases.

    Loops over: menu choice -> query -> up to five results -> optional
    summary of the selected results. Typing ``exit`` at any prompt ends the
    session.

    Args:
        index: Accepted for interface compatibility; the lookups below work
            on ``metadata`` only and do not use it.
        metadata: List of metadata dicts (``name``, ``decision_date``, ...).
    """
    def wants_exit(value):
        """Return True when the user typed 'exit' (any letter case)."""
        return value.lower() == "exit"

    while True:
        print("\nWelcome to the Legal Case Retrieval System!")
        print("\nType 'exit' at any point to quit.\n")
        print("\nSelect a query type:")
        print("1. Search by Name")
        print("2. Search by Abbreviation")
        print("3. Search by Decision Date")
        print("4. Custom Query")
        choice = input("Enter choice (1-4 or 'exit'): ").strip()

        if wants_exit(choice):
            print("Exiting the system.")
            break
        if choice not in {"1", "2", "3", "4"}:
            print("Invalid choice. Please enter a number from 1 to 4.")
            continue

        query = input("Enter your query: ").strip()
        # The banner promises 'exit' works at any point, not only at the menu.
        if wants_exit(query):
            print("Exiting the system.")
            break
        if not query:
            print("Empty query. Please enter a search term.")
            continue

        if choice == "3" or re.search(r'\d{4}-\d{2}-\d{2}', query):
            query = preprocess_query(query)  # Normalize and extract date if present
            # Prefix match: a full YYYY-MM-DD query still has to equal the
            # stored date, while "1888" or "1888-10" select a whole year or
            # month. Entries without a date never match.
            results = [
                entry for entry in metadata
                if query and (entry.get("decision_date") or "").startswith(query)
            ]
        else:
            results = handle_partial_matches(query, metadata)

        if not results:
            print("No matches found. Try refining your query.")
            continue

        # Display top 5 results
        print("\nResults (Top 5):")
        results = results[:5]
        for i, result in enumerate(results, start=1):
            print(f"{i}. {result.get('name', 'Unknown')} "
                  f"(Decision Date: {result.get('decision_date', 'Unknown')})")

        # Allow user to select results for summarization
        summary_choice = input("\nEnter the indices of results to summarize (comma-separated, e.g., 1,2): ").strip()
        if wants_exit(summary_choice):
            print("Exiting the system.")
            break
        if not summary_choice:
            print("No results selected for summarization.")
            continue

        try:
            # Non-numeric tokens and out-of-range numbers are ignored.
            selected_indices = [int(idx) - 1 for idx in summary_choice.split(",") if idx.strip().isdigit()]
            selected_results = [results[idx] for idx in selected_indices if 0 <= idx < len(results)]
        except (ValueError, IndexError):
            print("Invalid choice(s) entered. Please try again.")
            continue

        if not selected_results:
            print("No valid selections made for summarization.")
            continue

        # Generate and display summary
        summary = generate_summary(query, selected_results)
        print(f"\nSummary:\n{summary}")


In [19]:
# Load the saved FAISS index and the metadata list for the interactive session.
index = faiss.read_index("data/legal_cases_index.faiss")
with open("data/metadata_faiss.json", "r") as f:
    metadata = json.load(f)

In [None]:
# Start the interactive prompt loop; enter 'exit' at the menu to leave it.
query_system(index, metadata)


Welcome to the Legal Case Retrieval System!

Type 'exit' at any point to quit.


Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date
4. Custom Query


Enter choice (1-4 or 'exit'):  1
Enter your query:  1



Results (Top 5):
1. in re jesse scott oliver minor (Decision Date: 1887-10-31)
2. united states v the northwest trading co et al (Decision Date: 1888-08-20)
3. myers v swineford (Decision Date: 1888-08-24)
4. ex parte dubuque (Decision Date: 1888-09-20)
5. garside v norval (Decision Date: 1888-10-22)



Enter the indices of results to summarize (comma-separated, e.g., 1,2):  Dunbar


No valid selections made for summarization.

Welcome to the Legal Case Retrieval System!

Type 'exit' at any point to quit.


Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date
4. Custom Query


Enter choice (1-4 or 'exit'):  1
Enter your query:  Dunbar



Results (Top 5):
1. dunbar v de groff (Decision Date: 1888-10-26)



Enter the indices of results to summarize (comma-separated, e.g., 1,2):  1





Summary:
This is an application by one of the parties, by petition in open court, for an order for leave to take the deposition, out of the District of Alaska, of certain witnesses. Two other motions of a similar character were filed at the same time, and will be disposed of in the same way.

Welcome to the Legal Case Retrieval System!

Type 'exit' at any point to quit.


Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date
4. Custom Query
