In [None]:

# %% [code]
import os
import pickle
import numpy as np
import concurrent.futures
from tqdm import tqdm

# Import your existing functions.
# If you have them in a separate module, you could do for example:
# from my_app_module import load_models, load_documents, find_most_similar_cases, safe_summarize_first_part_by_tokens

# Otherwise, include the functions inline.
from sentence_transformers import SentenceTransformer, util  # for cosine similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# Define functions from your Streamlit app code (simplified for notebook use):

def load_models():
    """Instantiate the two models the notebook relies on, both placed on CPU.

    Returns:
        tuple: ``(embedding_model, summarizer)`` where ``embedding_model`` is the
        ``SentenceTransformer`` used to embed queries and cases, and
        ``summarizer`` is a ``transformers`` "summarization" pipeline wrapping
        the legal-pegasus model together with its tokenizer.
    """
    # Sentence encoder used for similarity search.
    embedding_model = SentenceTransformer(
        "Stern5497/sbert-legal-xlm-roberta-base",
        device="cpu",
    )

    # Seq2seq summarizer: tokenizer and weights come from the same checkpoint.
    pegasus_tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
    pegasus_weights = AutoModelForSeq2SeqLM.from_pretrained(
        "nsi319/legal-pegasus",
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32,
    )
    pegasus_on_cpu = pegasus_weights.to("cpu")
    summarizer = pipeline(
        "summarization",
        model=pegasus_on_cpu,
        tokenizer=pegasus_tokenizer,
    )

    print("Models loaded.")
    return embedding_model, summarizer

def load_documents(folder_path):
    """Load every ``*.txt`` file in ``folder_path`` into a dictionary.

    Args:
        folder_path: Directory that holds one text file per case.

    Returns:
        dict: Maps the file name without its extension (used as the case
        number) to the file's UTF-8 text with surrounding whitespace stripped.
        Keys are inserted in sorted file-name order. The dictionary is empty
        when no readable ``.txt`` file is found.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the document
    # order (and everything derived from it) reproducible between runs.
    text_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".txt"))
    documents = {}
    for text_file in tqdm(text_files, desc="Loading documents"):
        case_number = os.path.splitext(text_file)[0]
        file_path = os.path.join(folder_path, text_file)
        # isfile (rather than exists) also skips directories that merely end in
        # ".txt", which would otherwise make open() raise.
        if not os.path.isfile(file_path):
            print(f"Debug: Skipping {file_path}: not a regular file.")
            continue
        with open(file_path, "r", encoding="utf-8") as file:
            documents[case_number] = file.read().strip()
    if not documents:
        print("Debug: No documents were loaded! Check the folder path and file contents.")
    return documents

def find_most_similar_cases(query_text, embedding_model, case_embeddings, case_numbers, k=5):
    """Return the ``k`` cases whose embeddings are most similar to the query.

    Args:
        query_text: Free-text query to embed.
        embedding_model: Object exposing ``encode(text)``; used to embed the query.
        case_embeddings: One embedding per case, aligned with ``case_numbers``.
            May be a 2-D array/tensor or a list/tuple of 1-D NumPy arrays.
        case_numbers: Case identifiers, in the same order as ``case_embeddings``.
        k: Maximum number of cases to return.

    Returns:
        tuple: ``(top_cases, top_scores)`` - the case identifiers ordered from
        most to least similar, and their cosine-similarity scores (NumPy
        scalars) in the same order.
    """
    # A list of per-case ndarrays is stacked once up front. Passing the raw list
    # to util.cos_sim makes torch build the tensor from a list of arrays, which
    # is the slow conversion path.
    if (
        isinstance(case_embeddings, (list, tuple))
        and len(case_embeddings) > 0
        and all(isinstance(emb, np.ndarray) for emb in case_embeddings)
    ):
        case_embeddings = np.stack(case_embeddings)

    query_embedding = embedding_model.encode(query_text)
    similarities = util.cos_sim(query_embedding, case_embeddings)[0]

    # Convert to NumPy exactly once; detach/cpu keep this working when the
    # similarity tensor tracks gradients or lives on an accelerator.
    scores = similarities.detach().cpu().numpy()

    top_k_indices = np.argsort(scores)[::-1][:k]
    top_cases = [case_numbers[i] for i in top_k_indices]
    top_scores = [scores[i] for i in top_k_indices]
    return top_cases, top_scores

def safe_summarize_first_part_by_tokens(case_text, summarizer, max_tokens=1024,
                                        summary_max_length=1000, summary_min_length=50):
    """Summarize the leading part of a legal case without raising.

    The text is tokenized with the summarizer's own tokenizer; if it is longer
    than ``max_tokens`` only the leading tokens are kept and decoded back to
    text before summarization.

    Args:
        case_text: Full text of the case.
        summarizer: Callable summarization pipeline exposing a ``tokenizer``
            attribute and returning a list of ``{"summary_text": ...}`` dicts.
        max_tokens: Upper bound on the number of input tokens (special tokens
            included) handed to the summarizer.
        summary_max_length: ``max_length`` forwarded to the summarizer.
        summary_min_length: ``min_length`` forwarded to the summarizer.

    Returns:
        str: The stripped summary text; ``"[No summary generated.]"`` when the
        input is empty or the summarizer yields nothing usable; or a message
        starting with ``"⚠️ Failed to summarize case:"`` when the summarizer
        raises.
    """
    # Nothing to summarize: skip the (expensive) model call entirely.
    if not case_text or not case_text.strip():
        return "[No summary generated.]"

    tokenizer = summarizer.tokenizer
    tokens = tokenizer.encode(case_text, add_special_tokens=True)

    if len(tokens) > max_tokens:
        # Reserve room for the special tokens that get re-added when the
        # truncated text is tokenized again; never let the budget go negative,
        # which would turn the slice below into "drop from the end".
        special_tokens_count = tokenizer.num_special_tokens_to_add(pair=False)
        allowed_tokens = max(max_tokens - special_tokens_count, 0)
        truncated_text = tokenizer.decode(tokens[:allowed_tokens], skip_special_tokens=True)
    else:
        truncated_text = case_text

    try:
        # decode -> re-encode is not guaranteed to reproduce the same token
        # count, so also ask the pipeline to truncate as a safety net.
        summary_output = summarizer(
            truncated_text,
            max_length=summary_max_length,
            min_length=summary_min_length,
            do_sample=False,
            truncation=True,
        )
        if isinstance(summary_output, list) and summary_output:
            summary_text = summary_output[0].get("summary_text", "").strip()
            if summary_text:
                return summary_text
        return "[No summary generated.]"
    except Exception as e:
        # Deliberate best-effort: report the failure as text so one bad case
        # does not abort a whole batch.
        return f"⚠️ Failed to summarize case: {str(e)}"

def get_case_url(case_number):
    """Build the Justia URL for a case.

    The URL's directory segment is made of the first two dash-separated fields
    of ``case_number``; the file name is the full ``case_number`` plus ``.html``.
    """
    leading_fields = case_number.split('-')[:2]
    prefix = '-'.join(leading_fields)
    return f"https://law.justia.com/cases/federal/appellate-courts/cit/{prefix}/{case_number}.html"


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# %% [code]
# Set folder for text files (update the folder path as needed)
folder_path = "../data/extracted_texts"
documents = load_documents(folder_path)

# Remove empty documents
nonempty_documents = {k: v for k, v in documents.items() if v.strip() != ""}
print(f"Filtered: {len(nonempty_documents)} nonempty documents available.")

# Load models
embedding_model, summarizer = load_models()

# Prepare case numbers and texts.
case_numbers = list(nonempty_documents.keys())
case_texts = list(nonempty_documents.values())

# Load precomputed embeddings if available; otherwise, compute them.
pickle_filename = "../data/test_results/case_embeddings.pkl"
if os.path.exists(pickle_filename):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook itself wrote; never one from an untrusted source.
    with open(pickle_filename, "rb") as f:
        data = pickle.load(f)
    # Filter embeddings to only include nonempty documents.
    all_case_numbers = data["case_numbers"]
    all_case_embeddings = data["embeddings"]
    filtered_data = [(cn, emb) for cn, emb in zip(all_case_numbers, all_case_embeddings) if cn in nonempty_documents]
    if not filtered_data:
        raise ValueError("No embeddings found for nonempty documents!")
    case_numbers = [cn for cn, _ in filtered_data]
    # Stack into one 2-D array so both branches yield the same type as
    # embedding_model.encode(...) does below (assumes the cached rows are
    # equally sized NumPy vectors, as written by the else-branch).
    case_embeddings = np.stack([emb for _, emb in filtered_data])
    # A stale cache would otherwise silently hide documents from the search.
    missing_cases = set(nonempty_documents) - set(case_numbers)
    if missing_cases:
        print(
            f"Warning: {len(missing_cases)} nonempty documents have no cached embedding "
            f"and will not be searchable; delete '{pickle_filename}' to recompute."
        )
else:
    print("Computing case embeddings...")
    case_embeddings = embedding_model.encode(case_texts, show_progress_bar=True)
    # Save embeddings for future use; make sure the target folder exists first.
    os.makedirs(os.path.dirname(pickle_filename), exist_ok=True)
    with open(pickle_filename, "wb") as f:
        pickle.dump({"case_numbers": case_numbers, "embeddings": case_embeddings}, f)
    print("Embeddings computed and saved.")


Loading documents: 100%|██████████| 248/248 [00:00<00:00, 1566.17it/s]


Filtered: 245 nonempty documents available.


Device set to use cpu


Models loaded.


In [3]:
# %% [code]
# The five evaluation queries.
test_questions = [
    "Which cases relate to international patent law?",
    "Which cases relate to customs enforcement and trademark violations?",
    "Which cases relate to legal arguments involving antidumping duties and domestic industry protection measures?",
    "Which cases relate to licensing disputes and cross-border intellectual property rights issues?",
    "Which cases relate to the court's approach to arbitration and mediation in resolving international commercial disputes?"
]

# question -> {"cases": [...], "scores": [...]} for the 3 best matches.
test_question_results = {}
for question in test_questions:
    matched_cases, matched_scores = find_most_similar_cases(
        question, embedding_model, case_embeddings, case_numbers, k=3
    )
    test_question_results[question] = {"cases": matched_cases, "scores": matched_scores}

# case_number -> summary text for every retrieved case.
case_summaries = {}

# The same case can be retrieved for several questions; a set keeps each one once
# so it is only summarized a single time.
cases_to_summarize = {
    case_num
    for result in test_question_results.values()
    for case_num in result["cases"]
}

print(f"Total unique cases to summarize: {len(cases_to_summarize)}")

# Submit one summarization job per case to a thread pool and collect results
# as they finish.
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_case = {}
    for case_num in cases_to_summarize:
        pending = executor.submit(
            safe_summarize_first_part_by_tokens,
            nonempty_documents[case_num],
            summarizer,
            1024,
        )
        future_to_case[pending] = case_num

    finished = concurrent.futures.as_completed(future_to_case)
    for future in tqdm(finished, total=len(future_to_case), desc="Summarizing cases"):
        case_num = future_to_case[future]
        try:
            case_summaries[case_num] = future.result()
        except Exception as exc:
            case_summaries[case_num] = f"⚠️ Failed to summarize: {exc}"

print("Summary generation complete.")


  a = torch.tensor(a)
Your max_length is set to 300, but your input_length is only 201. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=100)


Total unique cases to summarize: 11


Your max_length is set to 300, but your input_length is only 160. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=80)
Summarizing cases:   0%|          | 0/11 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4545 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4983 > 1024). Running this sequence through the model will result in indexing errors
Summarizing cases: 100%|██████████| 11/11 [2:43:11<00:00, 890.16s/it]   

Summary generation complete.





In [None]:
# %% [code]
# Persist both result dictionaries with pickle. The confirmation messages print
# the path that was actually written, so they cannot drift from the file names.
similar_cases_pickle = "../data/test_results/test_results_similar_cases.pkl"
case_summaries_pickle = "../data/test_results/test_results_case_summaries.pkl"

# Make sure the output folder exists before opening files for writing.
os.makedirs(os.path.dirname(similar_cases_pickle), exist_ok=True)

# Save test_question_results to a pickle file.
with open(similar_cases_pickle, "wb") as f:
    pickle.dump(test_question_results, f)
print(f"Test question results saved to '{similar_cases_pickle}'.")

# Save case_summaries to a pickle file.
with open(case_summaries_pickle, "wb") as f:
    pickle.dump(case_summaries, f)
print(f"Case summaries saved to '{case_summaries_pickle}'.")


Test question results saved to 'test_question_results.pkl'.
Case summaries saved to 'case_summaries.pkl'.


{'Which cases relate to international patent law?': {'cases': ['11-00296-2023-09-20',
   '20-00110-2023-04-20',
   '21-00005-2023-09-06'],
  'scores': [0.7241118, 0.7166415, 0.702242]},
 'Which cases relate to customs enforcement and trademark violations?': {'cases': ['15-00124-2023-05-16',
   '21-00527-2023-09-14',
   '21-00361-2023-02-27'],
  'scores': [0.6722794, 0.65541726, 0.6290425]},
 'Which cases relate to legal arguments involving antidumping duties and domestic industry protection measures?': {'cases': ['21-00138-2023-06-07',
   '21-00532-2023-10-12',
   '21-00264-2023-03-16'],
  'scores': [0.7530558, 0.7288409, 0.72808427]},
 'Which cases relate to licensing disputes and cross-border intellectual property rights issues?': {'cases': ['21-00527-2023-09-14',
   '15-00124-2023-05-16',
   '20-00124-2023-02-27'],
  'scores': [0.71198833, 0.6892259, 0.68594384]},
 "Which cases relate to the court's approach to arbitration and mediation in resolving international commercial disputes

{'Which cases relate to international patent law?': {'cases': ['11-00296-2023-09-20',
   '20-00110-2023-04-20',
   '21-00005-2023-09-06'],
  'scores': [0.7241118, 0.7166415, 0.702242]},
 'Which cases relate to customs enforcement and trademark violations?': {'cases': ['15-00124-2023-05-16',
   '21-00527-2023-09-14',
   '21-00361-2023-02-27'],
  'scores': [0.6722794, 0.65541726, 0.6290425]},
 'Which cases relate to legal arguments involving antidumping duties and domestic industry protection measures?': {'cases': ['21-00138-2023-06-07',
   '21-00532-2023-10-12',
   '21-00264-2023-03-16'],
  'scores': [0.7530558, 0.7288409, 0.72808427]},
 'Which cases relate to licensing disputes and cross-border intellectual property rights issues?': {'cases': ['21-00527-2023-09-14',
   '15-00124-2023-05-16',
   '20-00124-2023-02-27'],
  'scores': [0.71198833, 0.6892259, 0.68594384]},
 "Which cases relate to the court's approach to arbitration and mediation in resolving international commercial disputes?": {'cases': ['21-00285-2023-12-22',
   '21-00138-2023-06-07',
   '20-00110-2023-04-20'],
  'scores': [0.74465966, 0.7436232, 0.7425722]}}

In [7]:
# Display the case_number -> summary dictionary filled in by the summarization cell.
case_summaries

{'21-00285-2023-12-22': "On December 22, 2023, Judge Miller Baker of the U.S. District Court for the Southern District of New York granted the Department of Commerce's request for a redetermination of the antidumping duty case against Brooklyn BedDING, LLC. The court previously ordered the Department of Commerce to verify a Thai mat- 6 tress importer's data insofar as the Department relied upon that data. The court subsequently ordered the Department to verify a Thai mat- 6 tress importer's data in both the Commerce proceedings and this liti- 8gation. The porter subsequently withdrew from both the Commerce proceedings and this liti- 8gation. No remaining party opposes the remand redetermination, and the court 9 therefore sustains it. A separate judgment will enter. See USCIT R. 58(a)",
 '11-00296-2023-09-20': "On September 20, 2018, the U.S. District Court for the Southern District of New York granted the motion of Lincoln General Insurance Company, Inc., a New York-based insurance com

In [9]:
import json
import numpy as np

def convert_types(obj):
    """``json.dumps`` fallback that converts NumPy values to built-in types.

    Intended to be passed as ``default=convert_types``; ``json`` calls it only
    for objects it cannot serialize itself.

    Args:
        obj: The object ``json`` could not serialize.

    Returns:
        A ``float`` for any NumPy floating scalar, an ``int`` for any NumPy
        integer scalar, a ``bool`` for ``np.bool_``, or a (nested) list for an
        ``np.ndarray``.

    Raises:
        TypeError: If ``obj`` is none of the supported NumPy types.
    """
    # The abstract scalar bases cover every width (float16/32/64, int8..int64,
    # unsigned variants) instead of enumerating a few concrete types.
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Type {type(obj)} not serializable")

# Pretty-print both result dictionaries as indented JSON, each under its own
# heading; convert_types takes care of the NumPy score values.
for heading, payload in (
    ("=== Top Cases for Each Test Question ===", test_question_results),
    ("\n=== Summaries for Retrieved Cases ===", case_summaries),
):
    print(heading)
    print(json.dumps(payload, default=convert_types, indent=2))



=== Top Cases for Each Test Question ===
{
  "Which cases relate to international patent law?": {
    "cases": [
      "11-00296-2023-09-20",
      "20-00110-2023-04-20",
      "21-00005-2023-09-06"
    ],
    "scores": [
      0.724111795425415,
      0.7166414856910706,
      0.7022420167922974
    ]
  },
  "Which cases relate to customs enforcement and trademark violations?": {
    "cases": [
      "15-00124-2023-05-16",
      "21-00527-2023-09-14",
      "21-00361-2023-02-27"
    ],
    "scores": [
      0.672279417514801,
      0.655417263507843,
      0.6290425062179565
    ]
  },
  "Which cases relate to legal arguments involving antidumping duties and domestic industry protection measures?": {
    "cases": [
      "21-00138-2023-06-07",
      "21-00532-2023-10-12",
      "21-00264-2023-03-16"
    ],
    "scores": [
      0.7530558109283447,
      0.7288408875465393,
      0.7280842661857605
    ]
  },
  "Which cases relate to licensing disputes and cross-border intellectual pro

Also export the results to an Excel file.

In [15]:
# Display the question -> {"cases", "scores"} dictionary produced by the retrieval cell.
test_question_results

{'Which cases relate to international patent law?': {'cases': ['11-00296-2023-09-20',
   '20-00110-2023-04-20',
   '21-00005-2023-09-06'],
  'scores': [0.7241118, 0.7166415, 0.702242]},
 'Which cases relate to customs enforcement and trademark violations?': {'cases': ['15-00124-2023-05-16',
   '21-00527-2023-09-14',
   '21-00361-2023-02-27'],
  'scores': [0.6722794, 0.65541726, 0.6290425]},
 'Which cases relate to legal arguments involving antidumping duties and domestic industry protection measures?': {'cases': ['21-00138-2023-06-07',
   '21-00532-2023-10-12',
   '21-00264-2023-03-16'],
  'scores': [0.7530558, 0.7288409, 0.72808427]},
 'Which cases relate to licensing disputes and cross-border intellectual property rights issues?': {'cases': ['21-00527-2023-09-14',
   '15-00124-2023-05-16',
   '20-00124-2023-02-27'],
  'scores': [0.71198833, 0.6892259, 0.68594384]},
 "Which cases relate to the court's approach to arbitration and mediation in resolving international commercial disputes

In [22]:
# Display the case_number -> summary dictionary again before exporting it.
case_summaries

{'21-00285-2023-12-22': "On December 22, 2023, Judge Miller Baker of the U.S. District Court for the Southern District of New York granted the Department of Commerce's request for a redetermination of the antidumping duty case against Brooklyn BedDING, LLC. The court previously ordered the Department of Commerce to verify a Thai mat- 6 tress importer's data insofar as the Department relied upon that data. The court subsequently ordered the Department to verify a Thai mat- 6 tress importer's data in both the Commerce proceedings and this liti- 8gation. The porter subsequently withdrew from both the Commerce proceedings and this liti- 8gation. No remaining party opposes the remand redetermination, and the court 9 therefore sustains it. A separate judgment will enter. See USCIT R. 58(a)",
 '11-00296-2023-09-20': "On September 20, 2018, the U.S. District Court for the Southern District of New York granted the motion of Lincoln General Insurance Company, Inc., a New York-based insurance com

In [None]:
import pandas as pd

queries = []
case_numbers_list = []

# Flatten {question: {"cases": [...]}} into one (question, case) pair per
# retrieved case.
for query, result in test_question_results.items():
    # Deliberately NOT named `case_numbers`: that global holds the corpus-wide
    # list passed to find_most_similar_cases, and rebinding it here would break
    # any later re-run of the retrieval cell.
    retrieved_cases = result["cases"]  # Top 3 case numbers for this query

    for case_number in retrieved_cases:
        queries.append(query)  # Repeat the same question for each case
        case_numbers_list.append(case_number)  # Append each individual case number

# One row per (query, retrieved case) pair.
df = pd.DataFrame({
    "Query": queries,
    "Case Number": case_numbers_list
})

# Save DataFrame to Excel file (written once; the duplicate write was redundant).
similar_cases_xlsx = "../data/test_results/test_results_similar_cases.xlsx"
df.to_excel(similar_cases_xlsx, index=False)
print(f"Excel file '{similar_cases_xlsx}' successfully created.")


Excel file 'test_questions_cases.xlsx' successfully created.
Excel file '../data/test_results_similar_cases.xlsx' successfully created.


In [38]:
# Tabulate the summaries: one row per case, with the case number and its summary.
case_summaries_df = pd.DataFrame(
    data=list(case_summaries.items()),
    columns=["Case Number", "Summary"],
)

In [39]:
# Display the summaries table (columns: "Case Number", "Summary").
case_summaries_df

Unnamed: 0,Case Number,Summary
0,21-00285-2023-12-22,"On December 22, 2023, Judge Miller Baker of th..."
1,11-00296-2023-09-20,"On September 20, 2018, the U.S. District Court..."
2,21-00532-2023-10-12,"On October 12, 2023, the U.S. District Court f..."
3,21-00361-2023-02-27,"On February 27, 2023, the U.S. District Court ..."
4,21-00138-2023-06-07,The U.S. District Court for the District of Co...
5,21-00264-2023-03-16,"On March 16, 2023, the U.S. District Court for..."
6,21-00527-2023-09-14,The U.S. Attorney's Office for the Southern Di...
7,15-00124-2023-05-16,The U.S. Attorney's Office for the Southern Di...
8,21-00005-2023-09-06,"On September 6, 2023, the U.S. District Court ..."
9,20-00110-2023-04-20,"On April 20, 2023, the U.S. District Court for..."


In [None]:
import pandas as pd

queries = []
case_numbers_list = []

# Flatten {question: {"cases": [...]}} into one (question, case) pair per
# retrieved case.
for query, result in test_question_results.items():
    # Deliberately NOT named `case_numbers`: that global holds the corpus-wide
    # list passed to find_most_similar_cases, and rebinding it here would break
    # any later re-run of the retrieval cell.
    retrieved_cases = result["cases"]

    for case_number in retrieved_cases:
        queries.append(query)
        case_numbers_list.append(case_number)

# Create initial dataframe from questions and case numbers
df = pd.DataFrame({
    "Query": queries,
    "Case Number": case_numbers_list
})

# Attach each case's summary. The left join keeps every (query, case) row even
# if a summary is missing; validate= makes pandas raise if a case number
# appears more than once in the summaries table, instead of duplicating rows.
final_df = df.merge(case_summaries_df, on="Case Number", how="left", validate="many_to_one")

# Save to Excel
final_df.to_excel("../data/test_results/test_questions_cases_with_summaries.xlsx", index=False)

print("Excel file 'test_questions_cases_with_summaries.xlsx' successfully created.")


Excel file 'test_questions_cases_with_summaries.xlsx' successfully created.
