In [1]:
!pip install torch transformers llama-index scikit-learn numpy


Collecting llama-index
  Downloading llama_index-0.12.3-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.0-py3-none-any.whl.metadata (726 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.3 (from llama-index)
  Downloading llama_index_core-0.12.3-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.3-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_legacy-0.9.48.post4-py3-none-any.whl.metadata (8.5 kB)
Collecting 

In [32]:
# json to csv

import os
import json
import pandas as pd
def json_to_csv(json_dir, output_csv):
    """
    Converts all JSON files in a directory into a single CSV file.

    Every ``*.json`` file directly inside ``json_dir`` becomes one CSV row.
    Files are processed in sorted filename order so the row order is
    reproducible (``os.listdir`` itself returns names in arbitrary order).
    Missing or null nested sections (``court``, ``jurisdiction``,
    ``analysis``, ``casebody``) and opinions without a ``text`` entry yield
    empty values instead of raising.

    Parameters:
        json_dir (str): Path to the directory containing JSON files.
        output_csv (str): Path to save the output CSV file.

    Returns:
        None
    """
    # Fixed column list so that an empty directory still produces a header row.
    columns = [
        "id",
        "name",
        "abbreviation",
        "decision_date",
        "court_name",
        "jurisdiction_name",
        "word_count",
        "char_count",
        "ocr_confidence",
        "case_text",
    ]
    all_data = []

    # Iterate over all JSON files in the directory
    for file_name in sorted(os.listdir(json_dir)):
        if not file_name.endswith(".json"):
            continue

        file_path = os.path.join(json_dir, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        # `or {}` / `or []` also covers keys that are present but null.
        court = data.get("court") or {}
        jurisdiction = data.get("jurisdiction") or {}
        analysis = data.get("analysis") or {}
        opinions = (data.get("casebody") or {}).get("opinions") or []

        # Flatten the JSON structure and extract relevant data
        all_data.append({
            "id": data.get("id"),
            "name": data.get("name"),
            "abbreviation": data.get("name_abbreviation"),
            "decision_date": data.get("decision_date"),
            "court_name": court.get("name"),
            "jurisdiction_name": jurisdiction.get("name"),
            "word_count": analysis.get("word_count"),
            "char_count": analysis.get("char_count"),
            "ocr_confidence": analysis.get("ocr_confidence"),
            # An opinion without text contributes an empty string rather than a KeyError.
            "case_text": " ".join(opinion.get("text") or "" for opinion in opinions),
        })

    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(all_data, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)

    print(f"CSV file saved at: {output_csv}")

# Input folder (one JSON document per case) and the CSV file to produce
json_dir = "json/"
output_csv = "output_cases.csv"

# Run the conversion
json_to_csv(json_dir=json_dir, output_csv=output_csv)


CSV file saved at: output_cases.csv


In [52]:
import os
import json

def load_and_inspect_json(folder_path):
    """Pretty-print the first ``.json`` file that ``os.listdir`` yields for ``folder_path``.

    Only one file is shown; nothing is printed when the folder holds no
    ``.json`` file.
    """
    first_json = next(
        (entry for entry in os.listdir(folder_path) if entry.endswith('.json')),
        None,
    )
    if first_json is None:
        return

    with open(os.path.join(folder_path, first_json), 'r') as handle:
        parsed = json.load(handle)
        print(f"File: {first_json}")
        # indent=4 gives a readable view of the nested structure
        print(json.dumps(parsed, indent=4))

# Folder holding the case JSON files to peek into
folder_path = "json/"
load_and_inspect_json(folder_path=folder_path)


File: 0001-01.json
{
    "id": 8503986,
    "name": "In re JESSE SCOTT OLIVER, Minor",
    "name_abbreviation": "In re Oliver",
    "decision_date": "1887-10-31",
    "docket_number": "No. 95",
    "first_page": "1",
    "last_page": "4",
    "citations": [
        {
            "type": "official",
            "cite": "1 Alaska 1"
        }
    ],
    "court": {
        "name_abbreviation": "Alaska Dist. Ct.",
        "id": 23837,
        "name": "Alaska District Court"
    },
    "jurisdiction": {
        "id": 53,
        "name_long": "Alaska",
        "name": "Alaska"
    },
    "cites_to": [
        {
            "cite": "6 Am. Dec. 156",
            "category": "reporters:federal",
            "reporter": "Am. Dec.",
            "opinion_index": 0
        },
        {
            "cite": "11 Mass. 67",
            "category": "reporters:state",
            "reporter": "Mass.",
            "case_ids": [
                2053436
            ],
            "opinion_index": 0,
        

In [53]:
import os
import json
import re
import numpy as np
import os
import json
import re
import torch
import numpy as np
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics.pairwise import cosine_similarity

# Data Loading
def load_json_files(folder_path):
    """Load all JSON files from a given folder into a list of dictionaries.

    Files are read in sorted filename order. ``os.listdir`` returns names in
    arbitrary order, and later code in this notebook pairs metadata records
    with precomputed embeddings by list position, so the order must be the
    same on every run.
    NOTE(review): if an existing embeddings file was built from a differently
    ordered listing, regenerate it so both stay aligned.

    Parameters:
        folder_path (str): Directory whose ``*.json`` files are loaded.

    Returns:
        list: The parsed content of each JSON file, in sorted filename order.
    """
    all_data = []
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.json'):
            continue
        # Explicit UTF-8 so parsing does not depend on the platform's default encoding.
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as f:
            all_data.append(json.load(f))
    return all_data

# Preprocessing Functions
def preprocess_case_text(text):
    """Return ``text`` with whitespace runs collapsed and special characters dropped.

    Word characters, whitespace and the punctuation ``. , ; :`` are kept;
    every other character is deleted. The result is stripped at both ends.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    # Deleting a character does not re-collapse the spaces that surrounded it.
    filtered = re.sub(r'[^\w\s.,;:]', '', single_spaced)
    return filtered.strip()

def preprocess_data_with_casebody(data):
    """Build one flat metadata record per case, including its cleaned opinion text.

    The text of all entries under ``casebody > opinions`` is joined with
    single spaces and passed through ``preprocess_case_text``. A case with no
    opinion text gets the placeholder ``"No text available"``.

    Parameters:
        data (list): Case dictionaries as loaded from the JSON files.

    Returns:
        list: Dictionaries with the keys ``id``, ``name``, ``abbreviation``,
        ``decision_date``, ``jurisdiction`` and ``cleaned_text``.
    """
    records = []
    for entry in data:
        opinions = entry.get("casebody", {}).get("opinions", [])
        full_text = " ".join(op.get("text", "") for op in opinions)

        record = {
            "id": entry.get("id"),
            "name": entry.get("name", "").strip(),
            "abbreviation": entry.get("name_abbreviation", "").strip(),
            "decision_date": entry.get("decision_date", "").strip(),
            "jurisdiction": entry.get("jurisdiction", {}).get("name", "").strip(),
        }
        if full_text:
            record["cleaned_text"] = preprocess_case_text(full_text)
        else:
            record["cleaned_text"] = "No text available"
        records.append(record)
    return records

# Main Execution
if __name__ == "__main__":
    folder_path = "json/"  # Path to your folder containing JSON files
    data = load_json_files(folder_path)  # Load JSON files
    preprocessed_data = preprocess_data_with_casebody(data)  # Preprocess data

    # Save to metadata file for further processing
    output_metadata_file = "data/metadata.json"
    # open(..., "w") does not create missing parent folders, so make sure
    # the target directory exists before writing.
    os.makedirs(os.path.dirname(output_metadata_file), exist_ok=True)
    with open(output_metadata_file, "w") as f:
        json.dump(preprocessed_data, f)

    print(f"Updated metadata saved to {output_metadata_file}")


Updated metadata saved to data/metadata.json


In [54]:
# ColBERT Class
class ColBERT:
    """Wrap a pretrained BERT tokenizer and encoder to produce per-token embeddings."""

    def __init__(self, pretrained_model_name='bert-base-uncased'):
        """Load the tokenizer and encoder for ``pretrained_model_name`` and switch the encoder to eval mode."""
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        encoder = BertModel.from_pretrained(pretrained_model_name)
        encoder.eval()
        self.model = encoder

    def generate_embeddings(self, text):
        """Return the encoder's last hidden states for ``text`` as a NumPy array.

        The input is truncated to at most 512 tokens. Only positions whose
        attention-mask entry is set are kept, giving one row per kept token.
        """
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state.squeeze(0)
            keep = encoded['attention_mask'].squeeze(0).bool()
            kept_rows = hidden_states[keep]
            return kept_rows.numpy()

       

In [55]:
# Query Preprocessing
def preprocess_query(query):
    """Normalize and clean the query for better matching.

    Steps, in order:
      1. collapse whitespace, drop characters other than word characters,
         whitespace and ``. , ; :``, strip and lowercase;
      2. remove common filler phrases (whole-word matches only);
      3. rewrite the standalone tokens ``v``, ``v.``, ``vs`` and ``vs.`` to ``v``;
      4. collapse the whitespace left behind.

    Parameters:
        query (str): Raw user query.

    Returns:
        str: The normalized query (may be empty).
    """
    query = re.sub(r'\s+', ' ', query)  # Remove extra whitespace
    query = re.sub(r'[^\w\s.,;:]', '', query)  # Remove special characters
    query = query.strip().lower()

    # List of common filler words or phrases to remove
    filler_words = [
        'what about', 'can you', 'could you', 'please', 'tell me',
        'show me', 'find', 'search for', 'give me', 'how about',
        'do you know', 'any info on', 'what is', 'can you tell me about',
        'let me know', 'is there', 'is it', 'is this', 'i want to know',
        'i am looking for', 'can you find', 'what is the status of',
        'what do you know', 'have you heard of'
    ]

    # Try the longest phrases first. Otherwise a short filler such as
    # "can you" is removed before "can you tell me about" can match, and the
    # tail ("about") is left behind in the query.
    longest_first = sorted(filler_words, key=len, reverse=True)
    filler_pattern = r'\b(?:' + '|'.join(re.escape(filler) for filler in longest_first) + r')\b'
    query = re.sub(filler_pattern, '', query)

    # Normalize the party separator to a bare "v". The word boundaries keep
    # "vs" inside longer words untouched, and the optional trailing period
    # makes "vs." become "v" rather than "v.".
    query = re.sub(r'\bvs?\b\.?', 'v', query)

    # Clean up any remaining extra whitespace after removing filler words
    query = re.sub(r'\s+', ' ', query).strip()

    return query



In [56]:
# Update in `colbert_retrieve` for enhanced exact match
def colbert_retrieve(query, embeddings_file, metadata_file, top_k=5):
    """Rank the stored cases against ``query`` and return the best ``top_k``.

    Each document's score is the sum, over the query token embeddings, of the
    maximum cosine similarity to any of that document's token embeddings. A
    bonus of 50 is added when the normalized query equals the document's
    normalized name or abbreviation.

    Parameters:
        query (str): Raw user query; it is passed through ``preprocess_query``.
        embeddings_file (str): Path given to ``np.load``. Entry ``i`` is
            compared against the query embeddings for ``metadata[i]``
            (presumably a 2-D token-embedding matrix per document - confirm
            against the code that wrote the file).
        metadata_file (str): Path to the JSON list of case records.
        top_k (int): Number of ranked results to return.

    Returns:
        tuple: ``(results, filtered_results)``. ``results`` holds the
        ``top_k`` highest-scoring records, each with an added ``score``;
        ``filtered_results`` holds the metadata records that matched the
        query exactly.

    Raises:
        ValueError: If the embeddings file has fewer entries than the metadata.
    """
    def _normalize_versus(value):
        # Same substitutions for the query and the stored names, so both
        # sides of the exact-match comparison are transformed identically.
        return value.replace('v.', 'v').replace('V.', 'v').replace('vs', 'v')

    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
    # only load embedding files from a trusted source.
    embeddings = np.load(embeddings_file, allow_pickle=True)
    with open(metadata_file, 'r') as f:
        metadata = json.load(f)

    # Fail fast, before loading the model, instead of hitting an IndexError
    # part-way through the scoring loop.
    if len(embeddings) < len(metadata):
        raise ValueError(
            f"{embeddings_file} has {len(embeddings)} entries but "
            f"{metadata_file} has {len(metadata)}; they must be aligned by position."
        )

    colbert = ColBERT()
    query = preprocess_query(query)  # Normalize the query

    # Reuse the class's embedding routine instead of duplicating it here.
    query_embeddings = colbert.generate_embeddings(query)

    # Computed once: the exact-match key does not change between documents.
    match_key = _normalize_versus(query)

    # Exact matching boost
    exact_match_weight = 50

    scores = []
    filtered_results = []

    for i, doc in enumerate(metadata):
        case_name = _normalize_versus((doc.get("name") or "").lower())
        abbreviation = _normalize_versus((doc.get("abbreviation") or "").lower())

        # An empty query must not "exactly match" documents lacking a name.
        exact_match_score = 0
        if match_key and match_key in (case_name, abbreviation):
            exact_match_score = exact_match_weight
            print(f"Exact Match Found: {case_name}")
            filtered_results.append(doc)

        # For each query token take its best-matching document token, then sum.
        similarity_matrix = cosine_similarity(query_embeddings, embeddings[i])
        embedding_score = similarity_matrix.max(axis=1).sum()

        # Final score: combining exact match score and embedding similarity score
        scores.append((exact_match_score + embedding_score, i))

    # Sort the scores in descending order
    scores = sorted(scores, key=lambda x: x[0], reverse=True)

    # Generate results based on the sorted scores
    results = [
        {
            "id": metadata[i]["id"],
            "name": metadata[i]["name"],
            "abbreviation": metadata[i]["abbreviation"],
            "decision_date": metadata[i]["decision_date"],
            "jurisdiction": metadata[i]["jurisdiction"],
            "cleaned_text": metadata[i].get("cleaned_text", "No text available"),
            "score": final_score,
        }
        for final_score, i in scores[:top_k]
    ]

    return results, filtered_results


In [57]:
def generate_summary(query, retrieved_docs):
    """Summarize the text of the retrieved documents with ``facebook/bart-large-cnn``.

    The ``cleaned_text`` of all documents is joined, prefixed with the query,
    truncated to 1024 tokens and summarized with beam search.

    Parameters:
        query (str): The user's query, included in the model input.
        retrieved_docs (list): Dictionaries that may carry a ``cleaned_text`` entry.

    Returns:
        str: The generated summary, or a fixed message when none of the
        documents has any text.
    """
    # Combine cleaned_text of retrieved documents
    texts = [doc.get('cleaned_text', '') for doc in (retrieved_docs or []) if doc.get('cleaned_text')]
    context = " ".join(texts)

    # Check before loading the tokenizer and model, so nothing is loaded
    # when there is nothing to summarize.
    if not context.strip():
        return "No relevant document content found for summarization."

    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

    # Prepare input for the summarization model
    input_text = f"Query: {query} Context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate summary; the attention mask is passed explicitly so generate()
    # does not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=200,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary



In [None]:
# Query System
def query_system():
    """Interactive entry point: ask for a query type and value, retrieve, then summarize.

    Prints the menu, reads the choice and the query text from standard input,
    runs ``colbert_retrieve`` against ``data/embeddings.npy`` and
    ``data/metadata.json``, prints the hits and finally prints a summary
    produced by ``generate_summary``. An unknown choice prints a message and
    returns without searching.
    """
    menu_lines = (
        "Select a query type:",
        "1. Search by Name",
        "2. Search by Abbreviation",
        "3. Search by Decision Date",
        "4. Search by Jurisdiction",
        "5. Custom Legal Query",
    )
    for line in menu_lines:
        print(line)

    # Menu choice -> prompt shown when asking for the query text
    prompts = {
        "1": "Enter case name: ",
        "2": "Enter case abbreviation: ",
        "3": "Enter decision date (YYYY-MM-DD): ",
        "4": "Enter jurisdiction: ",
        "5": "Enter your query: ",
    }

    choice = input("Enter choice (1-5): ").strip()
    if choice not in prompts:
        print("Invalid choice. Exiting.")
        return
    query = input(prompts[choice]).strip()

    results, _exact_matches = colbert_retrieve(query, "data/embeddings.npy", "data/metadata.json")

    # Print results
    print("\nRetrieved Cleaned Texts (or Detailed Texts):")
    for hit in results:
        print(f"Document ID: {hit['id']}, Name: {hit['name']}, Cleaned Text Snippet: {hit['cleaned_text'][:100]}...")

    print("\nTop results:")
    for hit in results:
        print(f"ID: {hit['id']}, Name: {hit['name']}, Score: {hit['score']:.4f}")

    # Advanced summary if any results
    if not results:
        print("No relevant documents found. Refine your query.")
        return

    print("\nGenerating advanced summary...\n")
    advanced_summary = generate_summary(query, results)
    print("Generated Summary:")
    print(advanced_summary)


# Main Execution
# Start the interactive query flow when this code runs as the main module.
# query_system() reads from standard input, so this waits for user input.
if __name__ == "__main__":
    query_system()

Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date
4. Search by Jurisdiction
5. Custom Legal Query


Enter choice (1-5):  1
Enter case name:  Noon



Retrieved Cleaned Texts (or Detailed Texts):
Document ID: 8503986, Name: In re JESSE SCOTT OLIVER, Minor, Cleaned Text Snippet: DAWSON, District Judge. Petitioner, by his guardian, ad litem, sets forth that he is unlawfully rest...
Document ID: 8504008, Name: UNITED STATES v. THE NORTH-WEST TRADING CO. et al., Cleaned Text Snippet: DAWSON, District Judge. On June 6, 1888, the United States District Attorney for the District of Ala...
Document ID: 8504024, Name: MYERS v. SWINEFORD, Cleaned Text Snippet: DAAVSON, District Judge. This was an action of assumpsit, brought by plaintiff against the defendant...
Document ID: 8504052, Name: Ex parte DUBUQUE, Cleaned Text Snippet: KEATEEY, District Judge. It appears that on the 2ist day of August, 1888, Eouis E. Williams, a Unite...
Document ID: 8504080, Name: GARSIDE v. NORVAL, Cleaned Text Snippet: KFATUFY, District Judge. On the 3d of October, 1888, the plaintiff filed a complaint in the office o...

Top results:
ID: 8503986, Name: In re JES

