## 1. Preprocessing and Saving Metadata

In [1]:
import os
import json
import pandas as pd
import re

In [2]:
def preprocess_text(text):
    """Clean and normalize text."""
    if not text:
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [3]:
def standardize_date(date):
    """Standardize date to YYYY-MM-DD format."""
    try:
        if len(date) == 4:  # Year only
            return pd.to_datetime(f"{date}-01-01").strftime("%Y-%m-%d")
        elif len(date) == 7:  # Year and month
            return pd.to_datetime(f"{date}-01").strftime("%Y-%m-%d")
        else:  # Full date
            return pd.to_datetime(date).strftime("%Y-%m-%d")
    except Exception:
        return None 

In [5]:
def preprocess_and_save_metadata(json_dir, metadata_file):
    """Preprocess raw JSON files and save metadata for FAISS."""
    all_data = []
    for file_name in os.listdir(json_dir):
        if file_name.endswith(".json"):
            with open(os.path.join(json_dir, file_name), "r") as f:
                data = json.load(f)
                row = {
                    "file": file_name,
                    "name": preprocess_text(data.get("name", "")),
                    "abbreviation": preprocess_text(data.get("name_abbreviation", "")),
                    "decision_date": standardize_date(data.get("decision_date", "")),
                    "text": " ".join(opinion.get("text", "") for opinion in data.get("casebody", {}).get("opinions", [])),
                }
                all_data.append(row)
    
    # Save processed metadata
    with open(metadata_file, "w") as f:
        json.dump(all_data, f, indent=4)
    print(f"Metadata saved to {metadata_file}")


In [6]:
json_dir = "json/"  # Folder containing raw JSON files
metadata_file = "metadata_faiss.json"  # Metadata file for FAISS
preprocess_and_save_metadata(json_dir, metadata_file)

Metadata saved to metadata_faiss.json


## 2. Creating FAISS Index

In [7]:
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [8]:
def embed_text(text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Generate embeddings for text."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

In [9]:
def create_faiss_index(metadata_file, index_file):
    """Create a FAISS index from metadata."""
    with open(metadata_file, "r") as f:
        metadata = json.load(f)
    
    # Generate embeddings for each text
    embeddings = [embed_text(entry["text"]) for entry in metadata]
    embeddings = np.vstack(embeddings)  # Combine embeddings into a matrix

    # Create and save FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_file)
    print(f"FAISS index saved to {index_file}")

In [10]:
create_faiss_index("metadata_faiss.json", "legal_cases_index.faiss")

2024-12-09 05:33:30.158393: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


FAISS index saved to legal_cases_index.faiss


## 3. Interactive Query System

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from difflib import SequenceMatcher

In [12]:
def is_similar(a, b, threshold=0.8):
    """Check if two strings are similar using SequenceMatcher."""
    return SequenceMatcher(None, a, b).ratio() > threshold

In [13]:
def preprocess_query(query):
    """Normalize and preprocess user query."""
    query = query.lower().strip()
    query = re.sub(r"[^\w\s]", "", query)
    query = re.sub(r"\s+", " ", query)
    return query

In [14]:
def query_index(user_query, index, metadata, k=5):
    """Query FAISS index and return top results with partial matching."""
    query_embedding = embed_text(user_query)
    distances, indices = index.search(query_embedding, k)
    results = []

    for i, idx in enumerate(indices[0]):
        metadata_entry = metadata[idx]
        results.append({
            "rank": i + 1,
            "file": metadata_entry.get("file", "Unknown"),
            "name": metadata_entry.get("name", "Unknown"),
            "abbreviation": metadata_entry.get("abbreviation", "Unknown"),
            "text": metadata_entry.get("text", "No text available"),
            "distance": float(distances[0][i]),
        })

    # partial matching from metadata
    partial_matches = []
    for entry in metadata:
        name = entry.get("name", "").lower()
        abbreviation = entry.get("abbreviation", "").lower()
        if user_query in name or user_query in abbreviation or is_similar(user_query, name) or is_similar(user_query, abbreviation):
            partial_matches.append({
                "rank": "Partial",
                "file": entry.get("file", "Unknown"),
                "name": entry.get("name", "Unknown"),
                "abbreviation": entry.get("abbreviation", "Unknown"),
                "text": entry.get("text", "No text available"),
                "distance": "N/A",
            })

    return results + partial_matches

In [15]:
def generate_summary(query, context, model_name="facebook/bart-large-cnn"):
    """Summarize context using BART."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    inputs = tokenizer(f"Query: {query} Context: {context}", return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = model.generate(inputs.input_ids, max_length=200, min_length=50, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [22]:
def query_system(index, metadata):
    """Interactive query system for retrieving legal cases."""
    print("Welcome to the Enhanced Legal Case Retrieval System!")
    print("Type 'exit' at any point to quit.\n")

    while True:
        # Loop until a valid choice is entered
        while True:
            print("Select a query type:")
            print("1. Search by Name")
            print("2. Search by Abbreviation")
            print("3. Search by Decision Date (YYYY-MM-DD): ")
            print("4. Search by Jurisdiction")
            print("5. Custom Query")
            choice = input("Enter choice (1-5): ").strip()

            if choice.lower() == "exit":
                print("Exiting the system. Goodbye!")
                return  # Exit the function entirely

            if choice in {"1", "2", "3", "4", "5"}:  # Valid choices
                break  # Exit the validation loop
            else:
                print("Invalid choice. Please enter a number between 1 and 5.")

        query = input("Enter your query: ").strip()
        if query.lower() == "exit":
            print("Exiting the system. Goodbye!")
            break

        query = preprocess_query(query)
        results = query_index(query, index, metadata)

        print("\nRetrieved Results:")
        for result in results:
            print(f"Rank: {result['rank']} - File: {result['file']} - Distance: {result['distance']}")
            print(f"Name: {result['name']}, Abbreviation: {result['abbreviation']}")
            print(f"Text Snippet: {result['text'][:200]}...\n")

        print("\nGenerating summary...")
        context = " ".join([result["text"] for result in results])
        summary = generate_summary(query, context)
        print(f"\nSummary:\n{summary}")

        print("\nWould you like to perform another query?")

In [23]:
index = faiss.read_index("legal_cases_index.faiss")
with open("metadata_faiss.json", "r") as f:
    metadata = json.load(f)

In [24]:
query_system(index, metadata)

Welcome to the Enhanced Legal Case Retrieval System!
Type 'exit' at any point to quit.

Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date (YYYY-MM-DD): 
4. Search by Jurisdiction
5. Custom Query


Enter choice (1-5):  3
Enter your query:  1807-10-16



Retrieved Results:
Rank: 1 - File: 0104-01.json - Distance: 50.14529037475586
Name: moody v the first bank of skagway, Abbreviation: moody v first bank of skagway
Text Snippet: BROWN, District Judge.
It seems,that the matter now before the court had its origin in the Commissioner’s Court, before C. A. Sehlbrede, United States Commissioner, on the nth day of March, 1899, in A...

Rank: 2 - File: 0439-01.json - Distance: 50.163719177246094
Name: the tyee consol min co v langstedt et al, Abbreviation: tyee consol min co v langstedt
Text Snippet: BROWN, District Judge.
In this case, and other cases involving practically the same question, there are about no defendants. The plaintiff is a foreign corporation that has complied with all the laws ...

Rank: 3 - File: 0361-01.json - Distance: 50.41318893432617
Name: brace v solner treasurer of nome, Abbreviation: brace v solner
Text Snippet: WICKERSHAM, District Judge.
In a former action this court had occasion to pass upon the power of the co

Enter choice (1-5):  5
Enter your query:  What about Colbert



Retrieved Results:
Rank: 1 - File: 0217-01.json - Distance: 43.98239517211914
Name: united states v alaska packers assn and babler, Abbreviation: united states v alaska packers assn
Text Snippet: BROWN, District Judge
(orally). In the case of the United States v. The Alaska Packers’ Association and J. Babler, an indictment has been returned by the grand jury, which, omitting the formal parts, ...

Rank: 2 - File: 0630-01.json - Distance: 44.141876220703125
Name: in re c e wynnjohnson, Abbreviation: in re wynnjohnson
Text Snippet: BROWN, District Judge.
At the Skagway October, 1901, term of the court, one C. E. Wynn-Johnson and the Moore’s Wharf Company were jointly indicted, under sections 460 and 461 of the crimes act of the ...

Rank: 3 - File: 0553-01.json - Distance: 44.245384216308594
Name: united states v binns, Abbreviation: united states v binns
Text Snippet: WICKERSHAM, District Judge.
The defendant was found guilty, and condemned to pay a fine and costs in the justice’s cour

Enter choice (1-5):  exit


Exiting the system. Goodbye!
