In [1]:
!pip install torch transformers llama-index scikit-learn numpy


Collecting llama-index
  Downloading llama_index-0.12.3-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.0-py3-none-any.whl.metadata (726 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.3 (from llama-index)
  Downloading llama_index_core-0.12.3-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.3-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_legacy-0.9.48.post4-py3-none-any.whl.metadata (8.5 kB)
Collecting 

import os
import json

def load_json_files(folder_path):
    """Load all JSON files from a given folder into a list of dictionaries."""
    all_data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.json'):
            with open(os.path.join(folder_path, file_name), 'r') as f:
                data = json.load(f)
                all_data.append(data)
    return all_data

# Example usage
folder_path = "json/"
data = load_json_files(folder_path)
print(f"Loaded {len(data)} files.")

import re

def preprocess_case_text(text):
    """Clean and standardize case text."""
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters (keep alphanumeric and legal punctuations)
    text = re.sub(r'[^\w\s.,;:]', '', text)
    return text.strip()

def preprocess_data(data):
    """Preprocess data by cleaning text and standardizing metadata."""
    preprocessed_data = []
    for case in data:
        processed_case = {
            "id": case.get("id"),
            "name": case.get("name", "").strip(),
            "abbreviation": case.get("abbreviation", "").strip(),
            "decision_date": case.get("decision_date", "").strip(),
            "jurisdiction": case.get("jurisdiction_name", "").strip(),
            "cleaned_text": preprocess_case_text(case.get("case_text", "")),
        }
        preprocessed_data.append(processed_case)
    return preprocessed_data

# Example usage
preprocessed_data = preprocess_data(data)
print(f"Preprocessed {len(preprocessed_data)} cases.")



from transformers import BertTokenizer, BertModel
import torch
import numpy as np

class ColBERT:
    def __init__(self, pretrained_model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        self.model = BertModel.from_pretrained(pretrained_model_name)
        self.model.eval()

    def generate_embeddings(self, text):
        """Generate dense embeddings for a given text."""
        tokens = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**tokens)
            token_embeddings = outputs.last_hidden_state.squeeze(0)
            mask = tokens['attention_mask'].squeeze(0).bool()
            return token_embeddings[mask].numpy()

def generate_embeddings_for_cases(preprocessed_data, output_embedding_file, output_metadata_file):
    """Generate embeddings for all cases and save to file."""
    colbert = ColBERT()
    embeddings = []
    metadata = []

    for case in preprocessed_data:
        embeddings.append(colbert.generate_embeddings(case["cleaned_text"]))
        metadata.append({
            "id": case["id"],
            "name": case["name"],
            "abbreviation": case["abbreviation"],
            "decision_date": case["decision_date"],
            "jurisdiction": case["jurisdiction"],
            "cleaned_text": case["cleaned_text"],  # Include cleaned_text here
        })

    np.save(output_embedding_file, embeddings, allow_pickle=True)
    with open(output_metadata_file, 'w') as f:
        json.dump(metadata, f)

    print(f"Embeddings saved to {output_embedding_file}")
    print(f"Metadata saved to {output_metadata_file}")

# Generate embeddings
output_embedding_file = "data/embeddings.npy"
output_metadata_file = "data/metadata.json"
generate_embeddings_for_cases(preprocessed_data, output_embedding_file, output_metadata_file)


from sklearn.metrics.pairwise import cosine_similarity

def colbert_retrieve(query, embeddings_file, metadata_file, top_k=5):
    embeddings = np.load(embeddings_file, allow_pickle=True)
    with open(metadata_file, 'r') as f:
        metadata = json.load(f)

    colbert = ColBERT()
    
    # Correct tokenization process
    query_tokens = colbert.tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        query_outputs = colbert.model(**query_tokens)
        query_embeddings = query_outputs.last_hidden_state.squeeze(0)
        mask = query_tokens['attention_mask'].squeeze(0).bool()
        query_embeddings = query_embeddings[mask].numpy()

    scores = []
    for doc_embeddings in embeddings:
        similarity_matrix = cosine_similarity(query_embeddings, doc_embeddings)
        max_similarities = similarity_matrix.max(axis=1)
        scores.append(max_similarities.sum())

    top_indices = np.argsort(scores)[-top_k:][::-1]
    results = [
        {
            "id": metadata[i]["id"],
            "name": metadata[i]["name"],
            "abbreviation": metadata[i]["abbreviation"],
            "decision_date": metadata[i]["decision_date"],
            "jurisdiction": metadata[i]["jurisdiction"],
            "score": scores[i],
        }
        for i in top_indices
    ]
    return results

# Example usage
query = "THE ALASKA GOLD MIN. CO. v. BARBRIDGE et al."
results = colbert_retrieve(query, "data/embeddings.npy", "data/metadata.json")
print("Top results:")
for res in results:
    print(res)


def query_system():
    print("Select a query type:")
    print("1. Search by Name")
    print("2. Search by Abbreviation")
    print("3. Search by Decision Date")
    print("4. Search by Jurisdiction")
    print("5. Custom Legal Query")
    
    choice = input("Enter choice (1-5): ")
    query = ""

    if choice == "1":
        query = input("Enter case name: ")
    elif choice == "2":
        query = input("Enter case abbreviation: ")
    elif choice == "3":
        query = input("Enter decision date (YYYY-MM-DD): ")
    elif choice == "4":
        query = input("Enter jurisdiction: ")
    elif choice == "5":
        query = input("Enter custom query: ")
    else:
        print("Invalid choice. Exiting.")
        return

    results = colbert_retrieve(query, "data/embeddings.npy", "data/metadata.json")
    print("Top results:")
    for res in results:
        print(f"ID: {res['id']}, Name: {res['name']}, Score: {res['score']:.4f}")

# Launch the query system
query_system()


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def generate_summary(query, retrieved_docs):
    """Generate an advanced summary using RAG."""
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

    # Use 'cleaned_text' if available; otherwise, fall back to 'name'
    context = " ".join([doc.get('cleaned_text', doc.get('name', '')) for doc in retrieved_docs])

    # Prepare input for the model
    input_text = f"Query: {query} Context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate summary
    summary_ids = model.generate(inputs.input_ids, max_length=200, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary



# Use the retrieved results for summarization
retrieved_docs = [res for res in results]  # Retrieve all document metadata
if retrieved_docs:
    print("\nGenerating advanced summary...\n")
    advanced_summary = generate_summary(query, retrieved_docs)
    print("Generated Summary:")
    print(advanced_summary)
else:
    print("No relevant documents found. Refine your query.")


print("Retrieved Document Example:")
print(retrieved_docs[0])  # Print a sample document to check keys


In [32]:
# json to csv

import os
import json
import pandas as pd
def json_to_csv(json_dir, output_csv):
    """
    Converts all JSON files in a directory into a single CSV file.

    Parameters:
        json_dir (str): Path to the directory containing JSON files.
        output_csv (str): Path to save the output CSV file.

    Returns:
        None
    """
    all_data = []

    # Iterate over all JSON files in the directory
    for file_name in os.listdir(json_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(json_dir, file_name)
            with open(file_path, "r") as file:
                # Load the JSON data
                data = json.load(file)
                
                # Flatten the JSON structure and extract relevant data
                row = {
                    "id": data.get("id"),
                    "name": data.get("name"),
                    "abbreviation": data.get("name_abbreviation"),
                    "decision_date": data.get("decision_date"),
                    "court_name": data.get("court", {}).get("name"),
                    "jurisdiction_name": data.get("jurisdiction", {}).get("name"),
                    "word_count": data.get("analysis", {}).get("word_count"),
                    "char_count": data.get("analysis", {}).get("char_count"),
                    "ocr_confidence": data.get("analysis", {}).get("ocr_confidence"),
                    "case_text": " ".join([opinion["text"] for opinion in data.get("casebody", {}).get("opinions", [])]),
                }
                all_data.append(row)
    
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(all_data)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)

    print(f"CSV file saved at: {output_csv}")

# Specify the path to the JSON directory and output CSV file
json_dir = "json/"
output_csv = "output_cases.csv"

# Convert JSON files to CSV
json_to_csv(json_dir, output_csv)


CSV file saved at: output_cases.csv


import os
import json
import re
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSeq2SeqLM


# Data Loading
def load_json_files(folder_path):
    """Load all JSON files from a given folder into a list of dictionaries."""
    all_data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.json'):
            with open(os.path.join(folder_path, file_name), 'r') as f:
                data = json.load(f)
                all_data.append(data)
    return all_data


# Data Preprocessing
def preprocess_case_text(text):
    """Clean and standardize case text."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,;:]', '', text)
    return text.strip()


def preprocess_data(data):
    """Preprocess data by cleaning text and standardizing metadata."""
    preprocessed_data = []
    for case in data:
        processed_case = {
            "id": case.get("id"),
            "name": case.get("name", "").strip(),
            "abbreviation": case.get("abbreviation", "").strip(),
            "decision_date": case.get("decision_date", "").strip(),
            "jurisdiction": case.get("jurisdiction_name", "").strip(),
            "cleaned_text": preprocess_case_text(case.get("case_text", "")),
        }
        preprocessed_data.append(processed_case)
    return preprocessed_data


# ColBERT Class
class ColBERT:
    def __init__(self, pretrained_model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        self.model = BertModel.from_pretrained(pretrained_model_name)
        self.model.eval()

    def generate_embeddings(self, text):
        """Generate dense embeddings for a given text."""
        tokens = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**tokens)
            token_embeddings = outputs.last_hidden_state.squeeze(0)
            mask = tokens['attention_mask'].squeeze(0).bool()
            return token_embeddings[mask].numpy()


# Generate Embeddings for Cases
def generate_embeddings_for_cases(preprocessed_data, output_embedding_file, output_metadata_file):
    """Generate embeddings for all cases and save to file."""
    colbert = ColBERT()
    embeddings = []
    metadata = []

    for case in preprocessed_data:
        embeddings.append(colbert.generate_embeddings(case["cleaned_text"]))
        metadata.append({
            "id": case["id"],
            "name": case["name"],
            "abbreviation": case["abbreviation"],
            "decision_date": case["decision_date"],
            "jurisdiction": case["jurisdiction"],
            "cleaned_text": case["cleaned_text"],  # Include cleaned_text here
        })

    np.save(output_embedding_file, embeddings, allow_pickle=True)
    with open(output_metadata_file, 'w') as f:
        json.dump(metadata, f)

    print(f"Embeddings saved to {output_embedding_file}")
    print(f"Metadata saved to {output_metadata_file}")


# ColBERT Retrieval
def colbert_retrieve(query, embeddings_file, metadata_file, top_k=5):
    embeddings = np.load(embeddings_file, allow_pickle=True)
    with open(metadata_file, 'r') as f:
        metadata = json.load(f)

    colbert = ColBERT()
    query_tokens = colbert.tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        query_outputs = colbert.model(**query_tokens)
        query_embeddings = query_outputs.last_hidden_state.squeeze(0)
        mask = query_tokens['attention_mask'].squeeze(0).bool()
        query_embeddings = query_embeddings[mask].numpy()

    scores = []
    for doc_embeddings in embeddings:
        similarity_matrix = cosine_similarity(query_embeddings, doc_embeddings)
        max_similarities = similarity_matrix.max(axis=1)
        scores.append(max_similarities.sum())

    top_indices = np.argsort(scores)[-top_k:][::-1]
    results = [
        {
            "id": metadata[i]["id"],
            "name": metadata[i]["name"],
            "abbreviation": metadata[i]["abbreviation"],
            "decision_date": metadata[i]["decision_date"],
            "jurisdiction": metadata[i]["jurisdiction"],
            "cleaned_text": metadata[i]["cleaned_text"],
            "score": scores[i],
        }
        for i in top_indices
    ]
    return results


# Summarization with RAG
def generate_summary(query, retrieved_docs):
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

    context = " ".join([doc.get('cleaned_text', '') for doc in retrieved_docs])
    input_text = f"Query: {query} Context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

    summary_ids = model.generate(inputs.input_ids, max_length=200, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


# Query System
def query_system():
    print("Select a query type:")
    print("1. Search by Name")
    print("2. Search by Abbreviation")
    print("3. Search by Decision Date")
    print("4. Search by Jurisdiction")
    print("5. Custom Legal Query")

    choice = input("Enter choice (1-5): ")
    query = ""

    if choice == "1":
        query = input("Enter case name: ")
    elif choice == "2":
        query = input("Enter case abbreviation: ")
    elif choice == "3":
        query = input("Enter decision date (YYYY-MM-DD): ")
    elif choice == "4":
        query = input("Enter jurisdiction: ")
    elif choice == "5":
        query = input("Enter custom query: ")
    else:
        print("Invalid choice. Exiting.")
        return

    results = colbert_retrieve(query, "data/embeddings.npy", "data/metadata.json")
    print("Top results:")
    for res in results:
        print(f"ID: {res['id']}, Name: {res['name']}, Score: {res['score']:.4f}")

    if results:
        print("\nGenerating advanced summary...\n")
        advanced_summary = generate_summary(query, results)
        print("Generated Summary:")
        print(advanced_summary)
    else:
        print("No relevant documents found. Refine your query.")


# Main Execution
if __name__ == "__main__":
    folder_path = "json/"
    data = load_json_files(folder_path)
    preprocessed_data = preprocess_data(data)
    generate_embeddings_for_cases(preprocessed_data, "data/embeddings.npy", "data/metadata.json")
    query_system()


In [40]:
import os
import json
import re
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSeq2SeqLM


# Data Loading
def load_json_files(folder_path):
    """Load all JSON files from a given folder into a list of dictionaries."""
    all_data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.json'):
            with open(os.path.join(folder_path, file_name), 'r') as f:
                data = json.load(f)
                all_data.append(data)
    return all_data


# Extract and Clean Case Text
def extract_case_text(case):
    """Extract and clean the case text."""
    opinions = case.get("casebody", {}).get("opinions", [])
    if opinions:
        return re.sub(r'\s+', ' ', opinions[0].get("text", "").strip())  # Get first opinion's text
    return ""


# Data Preprocessing
def preprocess_data(data):
    """Preprocess data by cleaning text and standardizing metadata."""
    preprocessed_data = []
    for case in data:
        processed_case = {
            "id": case.get("id"),
            "name": case.get("name", "").strip(),
            "abbreviation": case.get("name_abbreviation", "").strip(),
            "decision_date": case.get("decision_date", "").strip(),
            "jurisdiction": case.get("jurisdiction", {}).get("name", "").strip(),
            "cleaned_text": extract_case_text(case),
        }
        preprocessed_data.append(processed_case)
    return preprocessed_data


In [41]:
# ColBERT Class
class ColBERT:
    def __init__(self, pretrained_model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        self.model = BertModel.from_pretrained(pretrained_model_name)
        self.model.eval()

    def generate_embeddings(self, text):
        """Generate dense embeddings for a given text."""
        tokens = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**tokens)
            token_embeddings = outputs.last_hidden_state.squeeze(0)
            mask = tokens['attention_mask'].squeeze(0).bool()
            return token_embeddings[mask].numpy()


# ColBERT Retrieval
def colbert_retrieve(query, embeddings_file, metadata_file, top_k=5):
    embeddings = np.load(embeddings_file, allow_pickle=True)
    with open(metadata_file, 'r') as f:
        metadata = json.load(f)

    colbert = ColBERT()
    query_tokens = colbert.tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        query_outputs = colbert.model(**query_tokens)
        query_embeddings = query_outputs.last_hidden_state.squeeze(0)
        mask = query_tokens['attention_mask'].squeeze(0).bool()
        query_embeddings = query_embeddings[mask].numpy()

    scores = []
    for doc_embeddings in embeddings:
        similarity_matrix = cosine_similarity(query_embeddings, doc_embeddings)
        max_similarities = similarity_matrix.max(axis=1)
        scores.append(max_similarities.sum())

    top_indices = np.argsort(scores)[-top_k:][::-1]
    results = [
        {
            "id": metadata[i]["id"],
            "name": metadata[i]["name"],
            "abbreviation": metadata[i]["abbreviation"],
            "decision_date": metadata[i]["decision_date"],
            "jurisdiction": metadata[i]["jurisdiction"],
            "cleaned_text": metadata[i]["cleaned_text"],
            "score": scores[i],
        }
        for i in top_indices
    ]
    return results

In [42]:
# Summarization with RAG
def generate_summary(query, retrieved_docs):
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

    context = " ".join([doc.get('cleaned_text', '') for doc in retrieved_docs])
    input_text = f"Query: {query} Context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

    summary_ids = model.generate(inputs.input_ids, max_length=200, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [43]:
# Query System
def query_system():
    print("Select a query type:")
    print("1. Search by Name")
    print("2. Search by Abbreviation")
    print("3. Search by Decision Date")
    print("4. Search by Jurisdiction")
    print("5. Custom Legal Query")

    choice = input("Enter choice (1-5): ")
    query = ""

    if choice == "1":
        query = input("Enter case name: ")
    elif choice == "2":
        query = input("Enter case abbreviation: ")
    elif choice == "3":
        query = input("Enter decision date (YYYY-MM-DD): ")
    elif choice == "4":
        query = input("Enter jurisdiction: ")
    elif choice == "5":
        query = input("Enter custom query: ")
    else:
        print("Invalid choice. Exiting.")
        return

    results = colbert_retrieve(query, "data/embeddings.npy", "data/metadata.json")
    print("Top results:")
    for res in results:
        print(f"ID: {res['id']}, Name: {res['name']}, Score: {res['score']:.4f}")

    if results:
        print("\nGenerating advanced summary...\n")
        advanced_summary = generate_summary(query, results)
        print("Generated Summary:")
        print(advanced_summary)
    else:
        print("No relevant documents found. Refine your query.")


# Main Execution
if __name__ == "__main__":
    folder_path = "json/"
    data = load_json_files(folder_path)
    preprocessed_data = preprocess_data(data)
    # Assuming embeddings and metadata are already generated
    query_system()

Select a query type:
1. Search by Name
2. Search by Abbreviation
3. Search by Decision Date
4. Search by Jurisdiction
5. Custom Legal Query


Enter choice (1-5):  5
Enter custom query:  What about GARSIDE v. NORVAL?


Top results:
ID: 8503986, Name: In re JESSE SCOTT OLIVER, Minor, Score: 3.6140
ID: 8506137, Name: UNITED STATES v. SHEEP CREEK JOHN, Score: 3.6140
ID: 8504379, Name: PRATT et al. v. UNITED ALASKA MIN. CO., Score: 3.6140
ID: 8504361, Name: SUTTER et al. v. HECKMAN et al., Score: 3.6140
ID: 8504334, Name: In re THOMPKINS McINTIRE ESTATE, Score: 3.6140

Generating advanced summary...





Generated Summary:
Query: What about GARSIDE v. NORVAL? Context:    v. GARSide v.NORVAL? context:  GARSIDE vs. NOR VAL. context: Garside vs. Norval. Context: NORVAL vs. Garside. context : 
