# Legal Case Indexing and Querying
- Generating embeddings using Hugging Face models
- Indexing using FAISS
- Querying indexed embeddings

## Section 1: Embedding Generation (Run outside `js2`)

In [1]:
!pip install transformers torch
import os
import json
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch



In [2]:
# Load embedding model
# The tokenizer and the model are both loaded from the same checkpoint name.
# `embed_text` (defined below) reads `tokenizer` and `model` as notebook-level
# globals, so this cell must run before any call to it.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def embed_text(text):
    """Generate mean-pooled embeddings for one text or a batch of texts.

    Args:
        text: A string, or a list of strings, to embed. The tokenizer
            truncates each input to at most 512 tokens.

    Returns:
        A NumPy array with one row per input text, holding the average of
        the model's last-layer token vectors for that text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        token_vectors = model(**inputs).last_hidden_state
        # Average over real tokens only. With `padding=True`, shorter texts in
        # a batch are padded, and a plain `.mean(dim=1)` would blend those
        # padding positions into the embedding. For a single text the mask is
        # all ones, so the result matches an unmasked mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
        pooled = (token_vectors * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.cpu().numpy()

2024-12-05 13:49:34.166777: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Directory containing JSON files
json_dir = "./json"
# Passage length, in characters, used when splitting opinion text
PASSAGE_CHARS = 300
metadata = []
embeddings_list = []

# Process JSON files. `os.listdir` returns names in arbitrary order, so sort
# them to make the row order of the saved embeddings reproducible across runs.
for file_name in sorted(os.listdir(json_dir)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(json_dir, file_name)
    try:
        # Decode as UTF-8 explicitly instead of relying on the platform's
        # default text encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Extract and process opinions
        for opinion in data.get("casebody", {}).get("opinions", []):
            text = opinion.get("text", "")
            if not text:
                continue

            # Split long text into fixed-size character passages
            passages = [text[i:i + PASSAGE_CHARS] for i in range(0, len(text), PASSAGE_CHARS)]
            for passage in passages:
                embedding = embed_text(passage)
                # Appended as a pair so that entry i of `embeddings_list`
                # always describes `metadata[i]`.
                embeddings_list.append(embedding)
                metadata.append({"file": file_name, "text": passage})
    except Exception as e:
        # Best-effort: report the failing file and continue with the rest
        print(f"Error processing {file_name}: {e}")


In [4]:
# Save embeddings and metadata
if not embeddings_list:
    # Fail with a clear message instead of np.vstack's generic error
    raise ValueError("No embeddings were generated; nothing to save.")

# Stack the per-passage arrays into one 2-D array (one row per passage).
# float32 is the dtype FAISS consumes; the cast is a no-op if already float32.
embeddings_array = np.vstack(embeddings_list).astype(np.float32, copy=False)

# The index is queried by row number, so rows and metadata entries must line up.
if embeddings_array.shape[0] != len(metadata):
    raise ValueError(
        f"Embedding rows ({embeddings_array.shape[0]}) do not match "
        f"metadata entries ({len(metadata)})."
    )

np.save("embeddings.npy", embeddings_array)

with open("metadata_new.json", "w") as f:
    json.dump(metadata, f)

print("Embeddings and metadata saved!")

Embeddings and metadata saved!


## Section 2: Indexing and Querying (Run inside `js2`)

In [1]:
!pip install faiss-cpu
import faiss
import numpy as np
import json

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting numpy<3.0,>=1.25.0 (from faiss-cpu)
  Downloading numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m143.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m225.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: numpy, fai

In [4]:
# Load the saved embeddings. Pickle loading is left disabled (NumPy's
# default): the file holds a plain numeric array, and unpickling would allow
# arbitrary code execution if the file had been tampered with.
embeddings = np.load("embeddings.npy")

with open("metadata_new.json", "r") as f:
    metadata = json.load(f)

# Search results are row numbers into `embeddings`, looked up in `metadata`,
# so the two must have the same length.
if len(metadata) != embeddings.shape[0]:
    raise ValueError(
        f"Embedding rows ({embeddings.shape[0]}) do not match "
        f"metadata entries ({len(metadata)})."
    )

In [5]:
# Initialize FAISS index
# A flat L2 index sized to the width of the embedding vectors.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
# FAISS's Python API expects C-contiguous float32 input; this conversion is a
# no-op when the loaded array is already in that form.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

In [7]:
# Persist the built index to disk so it can be reloaded later
faiss.write_index(
    index,
    "legal_cases_index.faiss",
)

In [8]:
# Query example: use the first stored embedding as the query vector
query_vector = embeddings[0]
k = 5  # Top-5 results
# `search` expects a 2-D batch of queries, hence the added leading axis
distances, indices = index.search(query_vector[np.newaxis, :], k)

print("Query results:")
for i, idx in enumerate(indices[0]):
    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # k vectors; `metadata[-1]` would silently print the last passage instead.
    if idx < 0:
        continue
    print(f"Rank {i+1}: {metadata[idx]}")

Query results:
Rank 1: {'file': '0001-01.json', 'text': 'DAWSON, District Judge.\nPetitioner, by his guardian, ad litem, sets forth that he is unlawfully restrained of his liberty by Lieutenant Commander J. S. Newell, naval officer in charge at this station, and in command of the United States steamer and man-of-war Pinta. He states that he was enlisted in'}
Rank 2: {'file': '0001-01.json', 'text': 'f inducing another person to contract with him, he is estopped from afterwards denying it.. See Bigelow on Estoppel, pp. 486, 487.\nIt follows that the prayer of the petitioner must be denied, and that he be remanded to the custody of Lieutenant Commander Newell and his successors until he is 21 year'}
Rank 3: {'file': '0407-01.json', 'text': 'lified terms of the requests. The plaintiff, as a sailor, was amenable to rigid discipline for disobedience of orders. He was injured while discharging a duty to which he had been assigned by his superior officer, and which he was performing under the

## Section 3: Querying the Indexed Embeddings

In [17]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [18]:
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import json

In [19]:
# Load the embedding model and tokenizer
# This is the same checkpoint name used in Section 1 to produce the stored
# embeddings. `embed_text` (defined below) reads `tokenizer` and `model` as
# notebook-level globals, so this cell must run before any query.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 1043, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.p

AttributeError: _ARRAY_API not found

RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):
numpy.core.multiarray failed to import

In [None]:
# Reload the persisted index and the passage metadata it refers to
index = faiss.read_index("legal_cases_index.faiss")
with open("metadata_new.json", "r") as f:
    metadata = json.load(f)

# Search results are positions in the index that are looked up in `metadata`;
# a size mismatch means the two files come from different runs.
if index.ntotal != len(metadata):
    raise ValueError(
        f"Index holds {index.ntotal} vectors but metadata has "
        f"{len(metadata)} entries."
    )

In [10]:
def embed_text(text):
    """Generate mean-pooled embeddings for one text or a batch of texts.

    Args:
        text: A string, or a list of strings, to embed. The tokenizer
            truncates each input to at most 512 tokens.

    Returns:
        A NumPy array with one row per input text, holding the average of
        the model's last-layer token vectors for that text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        token_vectors = model(**inputs).last_hidden_state
        # Average over real tokens only. With `padding=True`, shorter texts in
        # a batch are padded, and a plain `.mean(dim=1)` would blend those
        # padding positions into the embedding. For a single text the mask is
        # all ones, so the result matches an unmasked mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
        pooled = (token_vectors * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.cpu().numpy()

In [None]:
# Query function
def query_index(user_query, index, metadata, k=5):
    """Return the passages closest to a free-text query.

    Args:
        user_query: Text to embed with `embed_text` and search for.
        index: Index object whose `search(vectors, k)` returns a
            `(distances, indices)` pair, one row per query vector.
        metadata: Sequence of dicts with "file" and "text" keys, where entry
            i describes the vector at position i of the index.
        k: Maximum number of results to return (default 5).

    Returns:
        A list of dicts with keys "rank" (1-based), "file", "text" and
        "distance" (a Python float), in the order returned by the index.
        The list is shorter than `k` when the index has fewer than `k` vectors.
    """
    # FAISS expects C-contiguous float32 queries; no-op if already in that form
    query_embedding = np.ascontiguousarray(embed_text(user_query), dtype=np.float32)
    distances, indices = index.search(query_embedding, k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # Missing neighbours are reported as -1; indexing `metadata` with -1
        # would silently return the last passage, so skip them.
        if idx < 0:
            continue
        entry = metadata[idx]
        results.append({
            "rank": len(results) + 1,
            "file": entry["file"],
            "text": entry["text"],
            # Plain float so the result can be serialised (e.g. with json)
            "distance": float(distance),
        })
    return results

In [None]:
# Example query
# Runs a free-text question through `query_index`; `results` holds the
# returned list of dicts (rank, file, text, distance) and is read by the
# display and save cells below.
user_query = "What legal precedents are there for unlawful restraint?"
results = query_index(user_query, index, metadata)

In [None]:
# Display results: one four-line report per hit, followed by a blank line
for result in results:
    report_lines = (
        f"Rank {result['rank']}:",
        f"File: {result['file']}",
        f"Text: {result['text']}",
        f"Distance: {result['distance']:.4f}\n",
    )
    # A single print call; the newline separator puts each field on its own line
    print(*report_lines, sep="\n")

In [None]:
# Save the formatted results. UTF-8 is set explicitly so passages containing
# non-ASCII characters are written the same way on every platform instead of
# depending on the default text encoding.
with open("query_results.txt", "w", encoding="utf-8") as f:
    for result in results:
        f.write(f"Rank {result['rank']}:\n")
        f.write(f"File: {result['file']}\n")
        f.write(f"Text: {result['text']}\n")
        f.write(f"Distance: {result['distance']:.4f}\n\n")