## 1. Preprocessing and Saving Metadata

In [1]:
import os
import json
import pandas as pd
import re

In [2]:
def preprocess_text(text):
    """Clean and normalize text."""
    if not text:
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [3]:
def standardize_date(date):
    """Standardize date to YYYY-MM-DD format."""
    try:
        if len(date) == 4:  # Year only
            return pd.to_datetime(f"{date}-01-01").strftime("%Y-%m-%d")
        elif len(date) == 7:  # Year and month
            return pd.to_datetime(f"{date}-01").strftime("%Y-%m-%d")
        else:  # Full date
            return pd.to_datetime(date).strftime("%Y-%m-%d")
    except Exception:
        return None 

In [4]:
def preprocess_and_save_metadata(json_dir, metadata_file):
    """Preprocess raw JSON files and save metadata for FAISS."""
    all_data = []
    for file_name in os.listdir(json_dir):
        if file_name.endswith(".json"):
            with open(os.path.join(json_dir, file_name), "r") as f:
                data = json.load(f)
                row = {
                    "file": file_name,
                    "name": preprocess_text(data.get("name", "")),
                    "abbreviation": preprocess_text(data.get("name_abbreviation", "")),
                    "decision_date": standardize_date(data.get("decision_date", "")),
                    "text": " ".join(opinion.get("text", "") for opinion in data.get("casebody", {}).get("opinions", [])),
                }
                all_data.append(row)
    
    # Save processed metadata
    with open(metadata_file, "w") as f:
        json.dump(all_data, f, indent=4)
    print(f"Metadata saved to {metadata_file}")


In [5]:
json_dir = "json/"  # Folder containing raw JSON files
metadata_file = "data/metadata_faiss.json"  # Metadata file for FAISS
preprocess_and_save_metadata(json_dir, metadata_file)

Metadata saved to data/metadata_faiss.json


## 2. Creating FAISS Index

In [6]:
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [7]:
def embed_text(text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Generate embeddings for text."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

In [8]:
def create_faiss_index(metadata_file, index_file):
    """Create a FAISS index from metadata."""
    with open(metadata_file, "r") as f:
        metadata = json.load(f)
    
    # Generate embeddings for each text
    embeddings = [embed_text(entry["text"]) for entry in metadata]
    embeddings = np.vstack(embeddings)  # Combine embeddings into a matrix

    # Create and save FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_file)
    print(f"FAISS index saved to {index_file}")

In [9]:
create_faiss_index("data/metadata_faiss.json", "data/legal_cases_index.faiss")

2024-12-09 11:55:38.865297: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


FAISS index saved to data/legal_cases_index.faiss


## 3. Interactive Query System

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from difflib import SequenceMatcher

In [11]:
def is_similar(a, b, threshold=0.8):
    """Check if two strings are similar using SequenceMatcher."""
    return SequenceMatcher(None, a, b).ratio() > threshold

In [12]:
def preprocess_query(query):
    """Normalize and preprocess user query."""
    query = query.lower().strip()
    query = re.sub(r"[^\w\s]", "", query)
    query = re.sub(r"\s+", " ", query)
    return query

In [13]:
def query_index(user_query, index, metadata, k=5):
    """Query FAISS index and return top results with partial matching."""
    query_embedding = embed_text(user_query)
    distances, indices = index.search(query_embedding, k)
    results = []

    for i, idx in enumerate(indices[0]):
        metadata_entry = metadata[idx]
        results.append({
            "rank": i + 1,
            "file": metadata_entry.get("file", "Unknown"),
            "name": metadata_entry.get("name", "Unknown"),
            "abbreviation": metadata_entry.get("abbreviation", "Unknown"),
            "text": metadata_entry.get("text", "No text available"),
            "distance": float(distances[0][i]),
        })

    # partial matching from metadata
    partial_matches = []
    for entry in metadata:
        name = entry.get("name", "").lower()
        abbreviation = entry.get("abbreviation", "").lower()
        if user_query in name or user_query in abbreviation or is_similar(user_query, name) or is_similar(user_query, abbreviation):
            partial_matches.append({
                "rank": "Partial",
                "file": entry.get("file", "Unknown"),
                "name": entry.get("name", "Unknown"),
                "abbreviation": entry.get("abbreviation", "Unknown"),
                "text": entry.get("text", "No text available"),
                "distance": "N/A",
            })

    return results + partial_matches

In [14]:
def generate_summary(query, context, model_name="facebook/bart-large-cnn"):
    """Summarize context using BART."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    inputs = tokenizer(f"Query: {query} Context: {context}", return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = model.generate(inputs.input_ids, max_length=200, min_length=50, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [15]:
def query_system(index, metadata):
    """Interactive query system for retrieving legal cases."""
    print("Welcome to the Enhanced Legal Case Retrieval System!")
    print("Type 'exit' at any point to quit.\n")

    while True:
        # Loop until a valid choice is entered
        while True:
            print("Select a query type:")
            print("1. Search by Name")
            print("2. Search by Abbreviation")
            print("3. Search by Decision Date (YYYY-MM-DD): ")
            print("4. Search by Jurisdiction")
            print("5. Custom Query")
            choice = input("Enter choice (1-5): ").strip()

            if choice.lower() == "exit":
                print("Exiting the system. Goodbye!")
                return  # Exit the function entirely

            if choice in {"1", "2", "3", "4", "5"}:  # Valid choices
                break  # Exit the validation loop
            else:
                print("Invalid choice. Please enter a number between 1 and 5.")

        query = input("Enter your query: ").strip()
        if query.lower() == "exit":
            print("Exiting the system. Goodbye!")
            break

        query = preprocess_query(query)
        results = query_index(query, index, metadata)

        print("\nRetrieved Results:")
        for result in results:
            print(f"Rank: {result['rank']} - File: {result['file']} - Distance: {result['distance']}")
            print(f"Name: {result['name']}, Abbreviation: {result['abbreviation']}")
            print(f"Text Snippet: {result['text'][:200]}...\n")

        print("\nGenerating summary...")
        context = " ".join([result["text"] for result in results])
        summary = generate_summary(query, context)
        print(f"\nSummary:\n{summary}")

        print("\nWould you like to perform another query?")

In [17]:
index = faiss.read_index("data/legal_cases_index.faiss")
with open("data/metadata_faiss.json", "r") as f:
    metadata = json.load(f)

In [18]:
query_system(index, metadata)

Welcome to the Enhanced Legal Case Retrieval System!
Type 'exit' at any point to quit.

Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date (YYYY-MM-DD): 
4. Search by Jurisdiction
5. Custom Query


Enter choice (1-5):  1
Enter your query:  pratt



Retrieved Results:
Rank: 1 - File: 0005-01.json - Distance: 69.22002410888672
Name: united states v the northwest trading co et al, Abbreviation: united states v northwest trading co
Text Snippet: DAWSON, District Judge.
On June 6, 1888, the United States District Attorney for the District of Alaska, representing the United States, filed his petition, in which it is alleged, in substance, that ...

Rank: 2 - File: 0264-01.json - Distance: 69.25569915771484
Name: heman v griffith et al, Abbreviation: heman v griffith
Text Snippet: WICKERSHAM, District Judge.
The court is relieved from the consideration of some questions in this case by having determined similar ones in the case of Steen v. The Wild Goose Mining Co., ante, 255, ...

Rank: 3 - File: 0664-01.json - Distance: 69.75025939941406
Name: american gold min co v giant powder co et al, Abbreviation: american gold min co v giant powder co
Text Snippet: BROWN, District Judge.
The laws of the District of Alaska provide how summons sh

Enter choice (1-5):  3
Enter your query:  1807-10-17



Retrieved Results:
Rank: 1 - File: 0439-01.json - Distance: 46.82063293457031
Name: the tyee consol min co v langstedt et al, Abbreviation: tyee consol min co v langstedt
Text Snippet: BROWN, District Judge.
In this case, and other cases involving practically the same question, there are about no defendants. The plaintiff is a foreign corporation that has complied with all the laws ...

Rank: 2 - File: 0361-01.json - Distance: 46.91706848144531
Name: brace v solner treasurer of nome, Abbreviation: brace v solner
Text Snippet: WICKERSHAM, District Judge.
In a former action this court had occasion to pass upon the power of the council to expend this fund in payment of salaries to the town clerk and treasurer, and in denying ...

Rank: 3 - File: 0104-01.json - Distance: 46.97916030883789
Name: moody v the first bank of skagway, Abbreviation: moody v first bank of skagway
Text Snippet: BROWN, District Judge.
It seems,that the matter now before the court had its origin in the Commissioner’

Enter choice (1-5):  4
Enter your query:  alaska



Retrieved Results:
Rank: 1 - File: 0005-01.json - Distance: 52.78784942626953
Name: united states v the northwest trading co et al, Abbreviation: united states v northwest trading co
Text Snippet: DAWSON, District Judge.
On June 6, 1888, the United States District Attorney for the District of Alaska, representing the United States, filed his petition, in which it is alleged, in substance, that ...

Rank: 2 - File: 0286-01.json - Distance: 53.47654724121094
Name: price et al v mcintosh et al, Abbreviation: price v mcintosh
Text Snippet: WICKERSHAM, District Judge
(after stating the facts as above). Under the admitted facts in this case these questions arise for determination by the court: (i) Was Thorolf Kjelsberg’s placer location v...

Rank: 3 - File: 0111-01.json - Distance: 53.93035125732422
Name: in re burton, Abbreviation: in re burton
Text Snippet: BROWN, District Judge.
The petition of Samuel Burton states that he is a native of British Columbia, now and for many years a reside

Enter choice (1-5):  3
Enter your query:  1991-10-25



Retrieved Results:
Rank: 1 - File: 0104-01.json - Distance: 39.51934814453125
Name: moody v the first bank of skagway, Abbreviation: moody v first bank of skagway
Text Snippet: BROWN, District Judge.
It seems,that the matter now before the court had its origin in the Commissioner’s Court, before C. A. Sehlbrede, United States Commissioner, on the nth day of March, 1899, in A...

Rank: 2 - File: 0598-01.json - Distance: 40.244163513183594
Name: ames v kruzner et al, Abbreviation: ames v kruzner
Text Snippet: WICKERSHAM, District Judge.
On October 16, 1900, the defendants, Kruzner and Woodruff, at Nome, made and delivered their promissory note in the sum of $1,514.90 to the Ames Mercantile Company, a forei...

Rank: 3 - File: 0264-01.json - Distance: 40.920753479003906
Name: heman v griffith et al, Abbreviation: heman v griffith
Text Snippet: WICKERSHAM, District Judge.
The court is relieved from the consideration of some questions in this case by having determined similar ones in the c

Enter choice (1-5):  exit


Exiting the system. Goodbye!
