# Save Case Embeddings to Pickle

In [1]:
# Folder holding the extracted case text files, relative to the working directory.
folder_path = './data/extracted_texts'

In [2]:
import os
import pickle
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Function to load documents from text files.
def load_documents(folder_path):
    """Read every ``.txt`` file directly inside *folder_path* into a dict.

    Args:
        folder_path: Directory to scan. Only its direct entries are
            considered; subdirectories are not searched.

    Returns:
        A dict mapping each case number (the file name without its
        extension) to the full text of that file, decoded as UTF-8.
        Entries are inserted in sorted file-name order.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # document order (and everything derived from it) reproducible.
    # Entries that merely end in ".txt" but are not regular files are
    # skipped so that open() below cannot fail on a directory.
    text_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith(".txt")
        and os.path.isfile(os.path.join(folder_path, name))
    )

    documents = {}
    for text_file in tqdm(text_files, desc="Loading documents"):
        # Use the file name (without extension) as the case number.
        case_number = os.path.splitext(text_file)[0]
        with open(os.path.join(folder_path, text_file), "r", encoding="utf-8") as f:
            documents[case_number] = f.read()
    return documents

# Location of the extracted case texts, relative to the working directory.
folder_path = os.path.join("./data", "extracted_texts")

# Read all case texts into memory, keyed by case number.
documents = load_documents(folder_path)
print(f"Loaded {len(documents)} documents.")

# Instantiate the sentence-embedding model.
embedding_model = SentenceTransformer("Stern5497/sbert-legal-xlm-roberta-base")

# Case numbers in the dict's insertion order; the texts passed to the model
# below are taken in the same order, so index i of `case_numbers` and row i
# of the encoder output refer to the same document.
case_numbers = [*documents]

# Encode every document text in a single call.
print("Computing case embeddings...")
case_embeddings = embedding_model.encode(
    [*documents.values()],
    show_progress_bar=True,
)

# Bundle identifiers and embeddings together so they are stored as one object.
embeddings_data = dict(case_numbers=case_numbers, embeddings=case_embeddings)

# Output file, written to the current working directory.
pickle_filename = "case_embeddings.pkl"

# Serialize the bundle to disk.
with open(pickle_filename, "wb") as f:
    pickle.dump(embeddings_data, f)

print(f"Embeddings have been saved to {pickle_filename}")


  from .autonotebook import tqdm as notebook_tqdm
Loading documents: 100%|██████████| 248/248 [00:00<00:00, 1428.74it/s]


Loaded 248 documents.
Computing case embeddings...


Batches: 100%|██████████| 8/8 [02:02<00:00, 15.37s/it]

Embeddings have been saved to case_embeddings.pkl



