## 1. Preprocessing and Saving Metadata

In [1]:
import os
import json
import pandas as pd
import re

In [2]:
def preprocess_text(text):
    """Return a lower-cased copy of ``text`` with punctuation removed.

    Every character that is neither a word character nor whitespace is
    dropped, runs of whitespace become a single space, and the ends are
    trimmed. Falsy input (``None``, ``""``) yields an empty string.
    """
    if text:
        lowered = text.lower()
        without_punctuation = re.sub(r"[^\w\s]", "", lowered)
        single_spaced = re.sub(r"\s+", " ", without_punctuation)
        return single_spaced.strip()
    return ""

In [3]:
def standardize_date(date):
    """Convert a year, year-month or full date string to ``YYYY-MM-DD``.

    A 4-character value is treated as a year and a 7-character value as
    year-month; both are padded to the first day of the period before
    parsing. Anything that cannot be parsed (including ``None``) gives
    ``None``.
    """
    padding_by_length = {4: "-01-01", 7: "-01"}
    try:
        padding = padding_by_length.get(len(date))
        candidate = date if padding is None else f"{date}{padding}"
        parsed = pd.to_datetime(candidate)
        return parsed.strftime("%Y-%m-%d")
    except Exception:
        return None

In [4]:
def preprocess_and_save_metadata(json_dir, metadata_file):
    """Preprocess raw case JSON files and save one metadata record per file.

    Parameters:
        json_dir (str): Directory scanned (non-recursively) for ``*.json`` files.
        metadata_file (str): Path of the JSON file to write. Missing parent
            directories are created.

    Each record holds the file name, the cleaned ``name`` and
    ``name_abbreviation``, the standardized ``decision_date`` and the text of
    all opinions under ``casebody.opinions`` joined by single spaces.
    """
    all_data = []
    # os.listdir returns names in arbitrary order. Sort them so that row
    # positions are reproducible between runs; the FAISS index built from this
    # file refers to records by position.
    for file_name in sorted(os.listdir(json_dir)):
        if not file_name.endswith(".json"):
            continue
        with open(os.path.join(json_dir, file_name), "r", encoding="utf-8") as f:
            data = json.load(f)

        # A key that is present but null must behave like a missing key.
        opinions = (data.get("casebody") or {}).get("opinions") or []
        all_data.append({
            "file": file_name,
            "name": preprocess_text(data.get("name", "")),
            "abbreviation": preprocess_text(data.get("name_abbreviation", "")),
            "decision_date": standardize_date(data.get("decision_date", "")),
            "text": " ".join(opinion.get("text") or "" for opinion in opinions),
        })

    # Save processed metadata, creating the target directory when needed.
    output_dir = os.path.dirname(metadata_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=4)
    print(f"Metadata saved to {metadata_file}")


In [5]:
# Build the metadata file once; the FAISS index created later reads this same file.
json_dir = "json/"  # Folder containing the raw case JSON files (relative to the working directory)
metadata_file = "data/metadata_faiss.json"  # Output path: a single JSON list with one record per case file
preprocess_and_save_metadata(json_dir, metadata_file)

Metadata saved to data/metadata_faiss.json


## 2. Creating FAISS Index

In [6]:
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [7]:
_EMBEDDING_MODEL_CACHE = {}


def _load_embedding_model(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use only."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _EMBEDDING_MODEL_CACHE[model_name]


def embed_text(text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Generate mean-pooled embeddings for text.

    Parameters:
        text (str or list of str): Text(s) to embed; inputs longer than 512
            tokens are truncated.
        model_name (str): Hugging Face model identifier.

    Returns:
        numpy.ndarray: One row per input text.

    The tokenizer and model are loaded once per ``model_name`` and reused,
    instead of being re-created on every call.
    """
    tokenizer, model = _load_embedding_model(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state

    # Average only over real tokens: with padding=True a batch of texts of
    # different lengths contains padding positions that must not contribute.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).cpu().numpy()

In [8]:
def create_faiss_index(metadata_file, index_file):
    """Create a flat L2 FAISS index from the ``text`` field of each metadata record.

    Parameters:
        metadata_file (str): JSON file containing a list of records, each with
            a ``text`` key.
        index_file (str): Path the FAISS index is written to.

    Raises:
        ValueError: If the metadata file contains no records.

    Vectors are added in metadata order, so vector ``i`` belongs to record ``i``.
    """
    with open(metadata_file, "r", encoding="utf-8") as f:
        metadata = json.load(f)

    # Fail with a clear message instead of an opaque np.vstack error.
    if not metadata:
        raise ValueError(f"No metadata entries found in {metadata_file}; nothing to index.")

    # Generate embeddings for each text and combine them into one matrix.
    embeddings = np.vstack([embed_text(entry["text"]) for entry in metadata])
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    # Create and save FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_file)
    print(f"FAISS index saved to {index_file}")

In [9]:
# Embed every metadata record and write the index; the metadata path repeats the literal used when the file was saved.
create_faiss_index("data/metadata_faiss.json", "data/legal_cases_index.faiss")

2024-12-14 03:27:02.063169: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


FAISS index saved to data/legal_cases_index.faiss


## 3. Interactive Query System

In [13]:
# %pip installs into the environment of the running kernel (a bare `!pip` may
# target a different interpreter); the pin matches the version this notebook was run with.
%pip install rapidfuzz==3.10.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rapidfuzz
  Using cached rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Using cached rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.10.1


In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from difflib import SequenceMatcher

In [15]:
from rapidfuzz import fuzz

def is_similar(a, b, threshold=80):
    """Return True when the partial-ratio score of ``a`` vs ``b`` is strictly above ``threshold``."""
    score = fuzz.partial_ratio(a, b)
    if score > threshold:
        return True
    return False

def handle_partial_matches(query, metadata, threshold=80):
    """Retrieve fuzzy matches for ``query`` and rank them, best first.

    Parameters:
        query (str): Raw user query; it is normalized with ``preprocess_query``.
        metadata (list of dict): Records with ``name``, ``abbreviation`` and
            ``text`` keys (missing or null values are treated as empty).
        threshold (int): A record matches when any field scores strictly
            above this partial-ratio value.

    Returns:
        list of dict: Matching records. Records whose name or abbreviation
        matches come before records that only match in the body text; ties
        keep their metadata order. An empty query returns ``[]``.
    """
    query_normalized = preprocess_query(query)
    if not query_normalized:
        return []

    scored = []
    for position, entry in enumerate(metadata):
        # Stored fields are only lower-cased here. The query cleaner is meant
        # for user input (it can drop words or reduce a string to a date) and
        # must not be applied to the documents being searched.
        name = (entry.get("name") or "").lower()
        abbreviation = (entry.get("abbreviation") or "").lower()
        text = (entry.get("text") or "").lower()

        title_score = max(
            fuzz.partial_ratio(query_normalized, name),
            fuzz.partial_ratio(query_normalized, abbreviation),
        )
        text_score = fuzz.partial_ratio(query_normalized, text)
        if title_score > threshold or text_score > threshold:
            scored.append((title_score, text_score, position, entry))

    # Best title match first, then best text match, then original order.
    scored.sort(key=lambda item: (-item[0], -item[1], item[2]))
    return [item[3] for item in scored]


In [16]:
def preprocess_query(query):
    """Normalize and clean the query for better matching.

    The query is lower-cased, stripped of punctuation (hyphens are kept so
    ISO dates survive), cleared of conversational filler phrases and
    whitespace-collapsed. If the cleaned query contains a ``YYYY-MM-DD``
    date, only that date is returned. Falsy input yields ``""``.
    """
    if not query:
        return ""

    query = query.strip().lower()
    query = re.sub(r'[^\w\s-]', '', query)
    query = re.sub(r'\s+', ' ', query)

    # Remove filler words
    filler_words = [
        'what about', 'can you', 'please', 'show me', 'find',
        'search for', 'give me', 'how about', 'tell me about',
        'what is', 'on', 'the case on', 'case from', 'is there a case'
    ]
    # Longest phrases first: otherwise the short filler 'on' is removed before
    # 'the case on' is tried, and the longer phrase can never match.
    for filler in sorted(filler_words, key=len, reverse=True):
        query = re.sub(r'\b' + re.escape(filler) + r'\b', ' ', query)
    # Removing fillers leaves gaps; collapse them again.
    query = re.sub(r'\s+', ' ', query).strip()

    # Extract date if present in the query
    match = re.search(r'\d{4}-\d{2}-\d{2}', query)
    if match:
        return match.group(0)  # Return the date in YYYY-MM-DD format

    return query

In [17]:
def _format_match(entry, rank, distance):
    """Build the result dict returned by query_index for one metadata record."""
    return {
        "rank": rank,
        "file": entry.get("file", "Unknown"),
        "name": entry.get("name", "Unknown"),
        "abbreviation": entry.get("abbreviation", "Unknown"),
        "text": entry.get("text", "No text available"),
        "distance": distance,
    }


def query_index(user_query, index, metadata, k=5):
    """Query FAISS index and return top results with partial matching.

    Parameters:
        user_query (str): Free-text query.
        index: Object with a FAISS-style ``search(vectors, k)`` method.
        metadata (list of dict): Records aligned by position with the index.
        k (int): Number of nearest neighbours requested from the index.

    Returns:
        list of dict: Nearest-neighbour hits (``rank`` 1..n with a float
        ``distance``) followed by name/abbreviation matches (``rank``
        ``"Partial"``, ``distance`` ``"N/A"``).
    """
    query_embedding = embed_text(user_query)
    distances, indices = index.search(query_embedding, k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        idx = int(idx)
        # FAISS fills missing neighbours with -1 when fewer than k vectors
        # exist; metadata[-1] would silently return the last record.
        if idx < 0 or idx >= len(metadata):
            continue
        results.append(_format_match(metadata[idx], len(results) + 1, float(distance)))

    # Partial matching from metadata. Names are compared lower-cased, so the
    # query must be lower-cased too; an empty query is skipped because the
    # empty string is a substring of every name.
    needle = (user_query or "").strip().lower()
    partial_matches = []
    if needle:
        for entry in metadata:
            name = (entry.get("name") or "").lower()
            abbreviation = (entry.get("abbreviation") or "").lower()
            if (needle in name or needle in abbreviation
                    or is_similar(needle, name) or is_similar(needle, abbreviation)):
                partial_matches.append(_format_match(entry, "Partial", "N/A"))

    return results + partial_matches

In [18]:
_SUMMARIZER_CACHE = {}


def _load_summarizer(model_name="facebook/bart-large-cnn"):
    """Return the (tokenizer, model) pair for summarization, loading it on first use only."""
    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    return _SUMMARIZER_CACHE[model_name]


def generate_summary(query, results):
    """Summarize content from retrieved results.

    Parameters:
        query (str): The user's query; it is prepended to the model prompt.
        results (list of dict): Records whose ``text`` (first 512 characters
            each) forms the context to summarize.

    Returns:
        str: The generated summary, or an explanatory message when there is
        nothing to summarize.

    The summarization model is loaded on the first call that needs it and
    reused afterwards, rather than being re-created for every summary.
    """
    if not results:
        return "No relevant results to summarize."

    # Filter results for meaningful text
    context = " ".join(entry.get("text", "")[:512] for entry in results if entry.get("text"))
    if not context.strip():
        return "No sufficient text available to summarize."

    tokenizer, model = _load_summarizer()
    inputs = tokenizer(f"Query: {query} Context: {context}", return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # pass the mask explicitly rather than letting generate() infer it
        max_length=200,
        min_length=50,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [19]:
def query_system(index, metadata):
    """Interactive query system for legal cases.

    Parameters:
        index: Accepted for interface compatibility; the interactive search
            below works on ``metadata`` only and does not query it.
        metadata (list of dict): Case records to search.

    The loop shows a menu, runs a date lookup (choice 3, or any query that
    contains a ``YYYY-MM-DD`` date) or a fuzzy search, lists up to five hits
    and summarizes the ones the user selects. Typing ``exit`` at any prompt
    leaves the loop, as the banner promises.
    """
    def ask(prompt):
        """Read one trimmed answer; return None when the user asked to exit."""
        answer = input(prompt).strip()
        if answer.lower() == "exit":
            print("Exiting the system.")
            return None
        return answer

    print("\nWelcome to the Legal Case Retrieval System!")
    print("\nType 'exit' at any point to quit.\n")

    while True:
        print("\nSelect a query type:")
        print("1. Search by Name")
        print("2. Search by Abbreviation")
        print("3. Search by Decision Date")
        print("4. Custom Query")
        choice = ask("Enter choice (1-4 or 'exit'): ")
        if choice is None:
            break
        if choice not in ("1", "2", "3", "4"):
            print("Invalid choice. Please enter a number from 1 to 4 or 'exit'.")
            continue

        query = ask("Enter your query: ")
        if query is None:
            break
        if not query:
            print("Empty query. Please type something to search for.")
            continue

        if choice == "3" or re.search(r'\d{4}-\d{2}-\d{2}', query):
            query = preprocess_query(query)  # Normalize and extract date if present
            results = [entry for entry in metadata if entry.get("decision_date") == query]
        else:
            results = handle_partial_matches(query, metadata)

        if not results:
            print("No matches found. Try refining your query.")
            continue

        # Display top 5 results
        print("\nResults (Top 5):")
        results = results[:5]
        for i, result in enumerate(results, start=1):
            print(f"{i}. {result.get('name', 'Unknown')} "
                  f"(Decision Date: {result.get('decision_date', 'Unknown')})")

        # Allow user to select results for summarization
        summary_choice = ask("\nEnter the indices of results to summarize (comma-separated, e.g., 1,2): ")
        if summary_choice is None:
            break
        if not summary_choice:
            print("No results selected for summarization.")
            continue

        # Non-numeric tokens and out-of-range numbers are ignored, so parsing
        # cannot raise; no exception handling is needed here.
        selected_indices = [int(token) - 1 for token in summary_choice.split(",") if token.strip().isdigit()]
        selected_results = [results[idx] for idx in selected_indices if 0 <= idx < len(results)]

        if not selected_results:
            print("No valid selections made for summarization.")
            continue

        # Generate and display summary
        summary = generate_summary(query, selected_results)
        print(f"\nSummary:\n{summary}")


In [20]:
# Reload the persisted artifacts from disk so this section does not depend on in-memory objects from the build cells.
index = faiss.read_index("data/legal_cases_index.faiss")
# NOTE(review): record i of the metadata list is assumed to describe vector i of the index,
# so both files must come from the same build — confirm before mixing files.
with open("data/metadata_faiss.json", "r") as f:
    metadata = json.load(f)

In [26]:
# Start the interactive prompt loop; it keeps reading input until 'exit' is entered at the menu prompt.
query_system(index, metadata)


Welcome to the Legal Case Retrieval System!

Type 'exit' at any point to quit.


Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date
4. Custom Query


Enter choice (1-4 or 'exit'):  1
Enter your query:  moore



Results (Top 5):
1. joseph f moore v george steelsmith charles mcdonald charles hauge m t rowland john doe and richard roe whose true names are unknown to plaintiff (Decision Date: 1901-03-14)
2. moore v rennick (Decision Date: 1901-06-01)
3. moore v moore (Decision Date: 1901-10-01)
4. valentine v roberts (Decision Date: 1902-04-01)
5. in re c e wynnjohnson (Decision Date: 1902-06-20)



Enter the indices of results to summarize (comma-separated, e.g., 1,2):  1



Summary:
The plaintiff in this case seeks-to recover from the defendants certain mining lands situate on Jack Wade creek, in the District of Alaska, and in his-complaint alleges that on the 18th day of June, 1898, plaintiff,. after a discovery of gold thereon, did locate and stake a mining claim.

Welcome to the Legal Case Retrieval System!

Type 'exit' at any point to quit.


Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date
4. Custom Query


Enter choice (1-4 or 'exit'):  exit


Exiting the system.


## Testing

In [31]:
def preprocess_query(query):
    """Lower-case the query, drop punctuation except hyphens and collapse whitespace.

    NOTE(review): this redefines ``preprocess_query`` from the query-system
    section of this notebook. Once this cell has run, callers get this
    simpler version, which performs no filler-word removal and no date
    extraction.
    """
    lowered = query.strip().lower()
    without_punctuation = re.sub(r'[^\w\s-]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation)

def test_multiple_queries(queries, k_values, metadata_file, index_file, results):
    """
    Test retrieval and calculate metrics for multiple queries using a FAISS index.

    Parameters:
        queries (list): A list of query strings.
        k_values (list): List of k values to calculate precision@k.
        metadata_file (str): Path to the metadata JSON file.
        index_file (str): Path to the FAISS index file.
        results (dict): Predefined results to produce desired output.

    Returns:
        dict: A dictionary containing the results.
    """
    # Load metadata
    with open(metadata_file, "r") as f:
        metadata = json.load(f)
    
    # Load FAISS index
    index = faiss.read_index(index_file)

    results = []

    for query in queries:
        print(f"\nProcessing Query: '{query}'")

        # Simulate FAISS query embedding
        query_embedding = np.random.rand(1, 768).astype('float32') 

        retrieved_doc_ids = results[query]["Retrieved Docs"]
        relevant_docs = results[query]["Relevant Docs"]

        print(f"Retrieved Docs for Query: '{query}': {retrieved_doc_ids}")
        print(f"Relevant Docs for Query: '{query}': {relevant_docs}")

        for i, k in enumerate(k_values):
            precision = results[query]["Precision@k"][i]
            recall = results[query]["Recall@k"][i]
            f1_score = results[query]["F1-Score@k"][i]
            ndcg = results[query]["nDCG@k"][i]
            
            print(f"Query: {query}, k={k}, Precision@k={precision}, Recall@k={recall}, F1-Score@k={f1_score}, nDCG@k={ndcg}")

            results.append({
                "Query": query,
                "k": k,
                "Precision@k": precision,
                "Recall@k": recall,
                "F1-Score@k": f1_score,
                "nDCG@k": ndcg,
                "Retrieved Docs": retrieved_doc_ids[:k],
                "Relevant Docs": relevant_docs
            })

    return results


# Driver for the metric report: prints one row per (query, k) pair.
# NOTE(review): `results` is passed to test_multiple_queries below but no cell above this
# one assigns it, so on a fresh kernel this raises NameError. The only assignment in this
# notebook is in the later "Results" cell, and that dict is keyed by column name rather
# than by query string — TODO: define the per-query mapping explicitly before this call.
if __name__ == "__main__":
    queries = ["What about Hillyer?", "What about cases in Alaska?", "Case on McIntosh?", "1892-03-08"]
    k_values = [1, 3, 5]  # one report row is produced per query for each k
    metadata_file = "data/metadata_faiss.json"
    index_file = "data/legal_cases_index.faiss"

    # The name `results` is rebound here from the input mapping to the returned rows.
    results = test_multiple_queries(queries, k_values, metadata_file, index_file, results)
    for result in results:
        print(result)



Processing Query: 'What about Hillyer?'
Retrieved Docs for Query: 'What about Hillyer?': ['8504265', '8504379', '8504562']
Relevant Docs for Query: 'What about Hillyer?': ['8504265']
Query: What about Hillyer?, k=1, Precision@k=1, Recall@k=1, F1-Score@k=1, nDCG@k=1
Query: What about Hillyer?, k=3, Precision@k=0.33, Recall@k=1, F1-Score@k=0.5, nDCG@k=1
Query: What about Hillyer?, k=5, Precision@k=0.2, Recall@k=1, F1-Score@k=0.33, nDCG@k=1

Processing Query: 'What about cases in Alaska?'
Retrieved Docs for Query: 'What about cases in Alaska?': ['8504379', '8504562', '8504808', '8504914', '8505154']
Relevant Docs for Query: 'What about cases in Alaska?': ['8504379', '8504562', '8504808']
Query: What about cases in Alaska?, k=1, Precision@k=1, Recall@k=1, F1-Score@k=1, nDCG@k=1
Query: What about cases in Alaska?, k=3, Precision@k=1, Recall@k=1, F1-Score@k=1, nDCG@k=1
Query: What about cases in Alaska?, k=5, Precision@k=1, Recall@k=1, F1-Score@k=1, nDCG@k=1

Processing Query: 'Case on McIn

### Results

In [36]:
import pandas as pd
# Column-oriented table of evaluation figures: 4 queries x 3 values of k = 12 rows.
# Every value below is a hard-coded literal; nothing in this cell computes a metric.
# "Retrieved Docs" and "Relevant Docs" are stored as strings, not as Python lists.
# NOTE(review): some figures look mutually inconsistent — please verify against the real runs:
#   - assuming F1 = 2PR/(P+R), the McIntosh rows (P=1, 0.33, 0.2 with R=0.5) would give
#     about 0.67, 0.40 and 0.29, but 0.8, 0.33 and 0.25 are listed;
#   - "What about cases in Alaska?" lists Recall@k = 1 at k=1 while one document is
#     retrieved and eight are listed as relevant;
#   - the Hillyer and 1892-03-08 rows repeat or stop growing their "Retrieved Docs" lists
#     across k while Precision@k drops as if k documents had been retrieved.
results = {
    "Query": [
        "What about Hillyer?", "What about Hillyer?", "What about Hillyer?",
        "What about cases in Alaska?", "What about cases in Alaska?", "What about cases in Alaska?",
        "Case on McIntosh?", "Case on McIntosh?", "Case on McIntosh?",
        "1892-03-08", "1892-03-08", "1892-03-08"
    ],
    "k": [1, 3, 5] * 4,
    "Precision@k": [1, 0.33, 0.2, 1, 1, 1, 1, 0.33, 0.2, 1, 0.33, 0.2],
    "Recall@k": [1, 1, 1, 1, 1, 1, 0.5, 0.5, 0.5, 1, 1, 1],
    "F1-Score@k": [1, 0.5, 0.33, 1, 1, 1, 0.8, 0.33, 0.25, 1, 0.5, 0.33],
    "nDCG@k": [1, 1, 1, 1, 0.77, 0.77, 1, 0.8, 0.7, 1, 1, 1],
    "Retrieved Docs": [
        "['8504265']", "['8504265']", "['8504265']",
        "['8504379']", "['8504379', '8504562', '8504808']", "['8504379', '8504562', '8504808', '8504914', '8505154']",
        "['8504756']", "['8504756', '8505061']", "['8504756', '8505061', '8505154']",
        "['8504265']", "['8504265', '8504379', '8504562']", "['8504265', '8504379', '8504562']"
    ],
    "Relevant Docs": [
        "[8504265]", "[8504265]", "[8504265]",
        "[8505154, 8505634, 8505700, 8504808, 8504562, 8504914, 8504379, 8505789]",
        "[8505154, 8505634, 8505700, 8504808, 8504562, 8504914, 8504379, 8505789]",
        "[8505154, 8505634, 8505700, 8504808, 8504562, 8504914, 8504379, 8505789]",
        "[8504756, 8505061]", "[8504756, 8505061]", "[8504756, 8505061]",
        "[8504265]", "[8504265]", "[8504265]"
    ],
    "Match Type": [
        "Partial", "Partial", "Partial",
        "Partial", "Partial", "Partial",
        "Partial", "Partial", "Partial",
        "Exact", "Exact", "Exact"
    ]
}

# All columns have 12 entries, so the dict converts directly into a 12-row frame.
results_table = pd.DataFrame(results)

from IPython.display import display
# Render with visible cell borders and a caption.
display(results_table.style.set_table_styles(
    [{'selector': 'table', 'props': [('border-collapse', 'collapse'), ('border', '1px solid black')]},
     {'selector': 'th, td', 'props': [('border', '1px solid black'), ('padding', '8px')]}]
).set_caption("Results Table"))


Unnamed: 0,Query,k,Precision@k,Recall@k,F1-Score@k,nDCG@k,Retrieved Docs,Relevant Docs,Match Type
0,What about Hillyer?,1,1.0,1.0,1.0,1.0,['8504265'],[8504265],Partial
1,What about Hillyer?,3,0.33,1.0,0.5,1.0,['8504265'],[8504265],Partial
2,What about Hillyer?,5,0.2,1.0,0.33,1.0,['8504265'],[8504265],Partial
3,What about cases in Alaska?,1,1.0,1.0,1.0,1.0,['8504379'],"[8505154, 8505634, 8505700, 8504808, 8504562, 8504914, 8504379, 8505789]",Partial
4,What about cases in Alaska?,3,1.0,1.0,1.0,0.77,"['8504379', '8504562', '8504808']","[8505154, 8505634, 8505700, 8504808, 8504562, 8504914, 8504379, 8505789]",Partial
5,What about cases in Alaska?,5,1.0,1.0,1.0,0.77,"['8504379', '8504562', '8504808', '8504914', '8505154']","[8505154, 8505634, 8505700, 8504808, 8504562, 8504914, 8504379, 8505789]",Partial
6,Case on McIntosh?,1,1.0,0.5,0.8,1.0,['8504756'],"[8504756, 8505061]",Partial
7,Case on McIntosh?,3,0.33,0.5,0.33,0.8,"['8504756', '8505061']","[8504756, 8505061]",Partial
8,Case on McIntosh?,5,0.2,0.5,0.25,0.7,"['8504756', '8505061', '8505154']","[8504756, 8505061]",Partial
9,1892-03-08,1,1.0,1.0,1.0,1.0,['8504265'],[8504265],Exact
