In [19]:
pip install openai==0.28.0





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
import openai

# Set your OpenAI API key
openai.api_key = "ENTER_YOUR_API_KEY"


In [21]:
import os

In [22]:
def fetch_json_directories(base_dir):
    """
    Recursively fetch directories containing .json files in the given directory.

    Args:
        base_dir (str): The base directory to search within.

    Returns:
        list: A list of unique directories containing .json files.
    """
    json_directories = set()  # Use a set to ensure uniqueness
    
    # Walk through all directories and subdirectories
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith(".json"):  # Check for .json extension
                json_directories.add(root)  # Add the directory to the set
                break  # No need to check other files in this directory
    
    return list(json_directories)

In [23]:
legal_case_paths = fetch_json_directories("C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project/static.case.law/")

In [24]:
len(legal_case_paths)

4

In [25]:
legal_case_paths

['C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project/static.case.law/ariz-app\\1.zip_extracted\\metadata',
 'C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project/static.case.law/ariz\\1.zip_extracted\\json',
 'C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project/static.case.law/ariz-app\\1.zip_extracted\\json',
 'C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project/static.case.law/ariz\\1.zip_extracted\\metadata']

In [26]:
import faiss
import numpy as np
import openai
import json
import pickle
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

openai.api_key = "ENTER_YOUR_API_KEY"

def split_into_chunks(text, chunk_size=1000, chunk_overlap=200):
    """
    Splits the text into chunks of specified size with overlap.

    Parameters:
    - text: The input text to be split.
    - chunk_size: The maximum size of each chunk (default: 1000).
    - chunk_overlap: The overlap size between consecutive chunks (default: 200).

    Returns:
    - List of text chunks.
    """
    chunks = []
    for i in range(0, len(text), chunk_size - chunk_overlap):
        chunks.append(text[i:i + chunk_size])
    return chunks

def load_json_files(directory):
    """
    Loads and extracts content from JSON files in the specified directory.

    Parameters:
    - directory: Path to the directory containing JSON files.

    Returns:
    - List of dictionaries with 'id' and 'content' for each JSON.
    """
    data = []
    directory = Path(directory)
    for file in tqdm(directory.rglob("*.json"), desc="Loading JSON files"):
        try:
            with open(file, "r", encoding="utf-8") as f:
                json_data = json.load(f)
                if isinstance(json_data, dict):
                    content = json.dumps(json_data)  # Convert dict to string
                elif isinstance(json_data, list):
                    content = "\n".join([json.dumps(item) for item in json_data])
                else:
                    content = str(json_data)
            data.append({
                "id": f"{os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(file))))}_{os.path.basename(os.path.dirname(os.path.dirname(file)))}_{os.path.basename(os.path.dirname(file))}_{file.name}",
                "content": content
            })
        except Exception as e:
            print(f"Error loading file {file.name}: {e}")
            continue
    return data

def process_json_files(json_files):
    """
    Processes JSON files to extract content and split it into chunks.

    Parameters:
    - json_files: List of dictionaries with 'id' and 'content' for each JSON.

    Returns:
    - List of dictionaries containing 'id' and 'text_chunk'.
    """
    all_chunks = []
    for json_file in tqdm(json_files, desc="Processing JSON files"):
        # Split JSON content into chunks
        text_chunks = split_into_chunks(json_file["content"])
        # Append text chunks with their corresponding IDs
        for i, chunk in enumerate(text_chunks):
            all_chunks.append({
                "id": f"{json_file['id']}_chunk_{i}",
                "text_chunk": chunk
            })
    return all_chunks

def generate_embeddings_from_chunks(chunks, batch_size=2):
    """
    Generates embeddings for the given text chunks using OpenAI's Embedding API.

    Parameters:
    - chunks: List of dictionaries with 'id' and 'text_chunk'.
    - batch_size: Number of chunks to process in a single API call (default: 2).

    Returns:
    - List of embeddings with their corresponding 'id'.
    """
    embeddings = []
    for i in tqdm(range(0, len(chunks), batch_size), desc="Generating embeddings"):
        batch = chunks[i:i + batch_size]
        try:
            # Extract text content for embedding generation
            batch_texts = [item["text_chunk"] for item in batch]
            response = openai.Embedding.create(
                model="text-embedding-ada-002",
                input=batch_texts
            )
            # Add embeddings with corresponding IDs
            for j, embedding in enumerate(response["data"]):
                embeddings.append({
                    "id": batch[j]["id"],
                    "embedding": embedding["embedding"]
                })
        except Exception as e:
            print(f"Error generating embedding for batch {i}: {e}")
            continue
    return embeddings

def save_embeddings(embeddings, file_name="embeddings.pkl"):
    """
    Saves embeddings to a pickle file.

    Parameters:
    - embeddings: List of embeddings to save.
    - file_name: Name of the file to save the embeddings (default: 'embeddings.pkl').
    """
    try:
        with open(file_name, "wb") as f:
            pickle.dump(embeddings, f)
        print(f"Embeddings saved to {file_name}")
    except Exception as e:
        print(f"Error saving embeddings: {e}")

def store_embeddings_in_faiss(embeddings, faiss_index_file="faiss_index.index", d=1536):
    """
    Stores embeddings in a FAISS vector database.

    Parameters:
    - embeddings: List of dictionaries containing 'id' and 'embedding'.
    - faiss_index_file: File name to save the FAISS index (default: 'faiss_index.index').
    - d: Dimension of the embeddings (default: 1536 for OpenAI embeddings).
    """
    # Initialize a FAISS index (L2 distance for default)
    index = faiss.IndexFlatL2(d)

    # Metadata mapping for IDs
    id_mapping = []

    # Convert embeddings into a NumPy array
    vectors = []
    for item in embeddings:
        id_mapping.append(item["id"])
        vectors.append(np.array(item["embedding"], dtype=np.float32))

    vectors = np.vstack(vectors)  # Stack vectors into a single NumPy array

    # Add vectors to the FAISS index
    index.add(vectors)

    # Save the FAISS index
    faiss.write_index(index, faiss_index_file)

    # Save the ID mapping
    with open("id_mapping.pkl", "wb") as f:
        pickle.dump(id_mapping, f)

    print(f"Stored {len(embeddings)} embeddings into the FAISS index.")

if __name__ == "__main__":
    # Directory containing JSON files
    json_dir = "C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project/static.case.law/"  # Replace with your directory path

    # Load JSON files
    json_files=[]
    for folder in legal_case_paths:
        folder_files = load_json_files(folder)
        json_files.extend(folder_files)
    print(len(json_files))
    # Process JSON files to generate text chunks (content_dict)
    try:
        content_dict = process_json_files(json_files)
        print(f"Processed {len(content_dict)} text chunks.")
    except Exception as e:
        print(f"Error during processing: {e}")

    # Save content_dict to a file for later use
    try:
        with open("content_dict.pkl", "wb") as f:
            pickle.dump(content_dict, f)
        print("Saved content_dict with text chunks.")
    except Exception as e:
        print(f"Error saving content_dict: {e}")

    # Generate embeddings from text chunks
    try:
        embeddings = generate_embeddings_from_chunks(content_dict, batch_size=2)
        print(f"Generated embeddings for {len(embeddings)} chunks.")
    except Exception as e:
        print(f"Error generating embeddings: {e}")

    # Save embeddings to a file
    save_embeddings(embeddings, file_name="output_embeddings.pkl")

    # Store embeddings in FAISS
    try:
        store_embeddings_in_faiss(embeddings, faiss_index_file="faiss_index.index", d=1536)
        print("Embeddings successfully stored in FAISS.")
    except Exception as e:
        print(f"Error storing embeddings in FAISS: {e}")


Loading JSON files: 2it [00:00, 77.72it/s]
Loading JSON files: 40it [00:00, 607.55it/s]
Loading JSON files: 50it [00:00, 1839.16it/s]
Loading JSON files: 2it [00:00, 305.91it/s]


94


Processing JSON files: 100%|██████████| 94/94 [00:00<00:00, 19487.18it/s]


Processed 2875 text chunks.
Saved content_dict with text chunks.


Generating embeddings: 100%|██████████| 1438/1438 [07:01<00:00,  3.41it/s]


Generated embeddings for 2875 chunks.
Embeddings saved to output_embeddings.pkl
Stored 2875 embeddings into the FAISS index.
Embeddings successfully stored in FAISS.


In [27]:
content_dict

[{'id': 'ariz-app_1.zip_extracted_metadata_CasesMetadata.json_chunk_0',
  'text_chunk': '{"id": 1234473, "name": "Iva L. ROREBECK, Sebe L. Broyles and Clarice L. Broyles, his wife, Appellants, v. Helen CRISTE, the wife of John Criste, Appellee", "name_abbreviation": "Rorebeck v. Criste", "decision_date": "1965-02-02", "docket_number": "No. 1 CA-CIV 3", "first_page": "1", "last_page": "6", "citations": [{"type": "official", "cite": "1 Ariz. App. 1"}, {"type": "parallel", "cite": "398 P.2d 678"}], "court": {"name_abbreviation": "Ariz. Ct. App.", "id": 8807, "name": "Arizona Court of Appeals"}, "jurisdiction": {"id": 21, "name_long": "Arizona", "name": "Ariz."}, "cites_to": [{"cite": "80 A.L.R.2d 1161", "category": "reporters:specialty", "reporter": "A.L.R. 2d", "year": 1960, "opinion_index": 0}, {"cite": "220 Or. 297", "category": "reporters:state", "reporter": "Or.", "case_ids": [5778884], "weight": 2, "year": 1960, "opinion_index": 0, "case_paths": ["/or/220/0297-01"]}, {"cite": "9 P.2