In [1]:
import openai
import pickle
import faiss
import numpy as np
import os 

# Set your OpenAI API key
# Read the key from the OPENAI_API_KEY environment variable so the secret is never
# stored in the notebook; the placeholder is kept only as a fallback value.
openai.api_key = os.environ.get("OPENAI_API_KEY", "ENTER_YOUR_API_KEY")

# File paths
# The project directory can be overridden with the VECTOR_DB_PROJECT_DIR environment
# variable; when it is unset the paths below are identical to the previous hard-coded ones.
PROJECT_DIR = os.environ.get(
    "VECTOR_DB_PROJECT_DIR",
    "C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project",
)
FAISS_INDEX_PATH = f"{PROJECT_DIR}/faiss_index.index"
ID_MAPPING_PATH = f"{PROJECT_DIR}/id_mapping.pkl"
CONTENT_DICT_PATH = f"{PROJECT_DIR}/content_dict.pkl"

def load_faiss_index(faiss_index_path):
    """Read a serialized FAISS index from disk.

    Args:
        faiss_index_path: Location of the index file, passed unchanged to
            ``faiss.read_index``.

    Returns:
        Whatever ``faiss.read_index`` returns for that file.
    """
    index = faiss.read_index(faiss_index_path)
    return index

def load_id_mapping(id_mapping_path):
    """Unpickle and return the ID mapping stored at ``id_mapping_path``.

    Note: unpickling can execute arbitrary code, so only point this at files
    you created yourself or otherwise trust.
    """
    with open(id_mapping_path, mode="rb") as handle:
        mapping = pickle.load(handle)
    return mapping

def load_content_dict(content_dict_path):
    """Unpickle and return the content records stored at ``content_dict_path``.

    Note: unpickling can execute arbitrary code, so only point this at files
    you created yourself or otherwise trust.
    """
    with open(content_dict_path, mode="rb") as handle:
        records = pickle.load(handle)
    return records

def generate_embedding_for_query(query, model="text-embedding-ada-002"):
    """Generate an embedding for the user query using the OpenAI API.

    Args:
        query: Text to embed. A string that is empty or only whitespace is
            rejected locally, without contacting the API.
        model: Name of the OpenAI embedding model. It should be the same model
            that produced the vectors stored in the index being searched.

    Returns:
        numpy.ndarray | None: The first embedding in the response as a float32
        array, or ``None`` if the query is blank or the request fails for any
        reason (the error is printed rather than raised).
    """
    # A blank query can only produce an API error, so skip the network round trip.
    if isinstance(query, str) and not query.strip():
        print("Error generating embedding for query: query is empty")
        return None
    try:
        response = openai.Embedding.create(
            model=model,
            input=query
        )
        return np.array(response["data"][0]["embedding"], dtype=np.float32)
    except Exception as e:
        # Best-effort by design: callers check for None instead of catching.
        print(f"Error generating embedding for query: {e}")
        return None

def perform_semantic_search(query_embedding, faiss_index, content_dict, id_mapping, top_k=5):
    """Retrieve the stored text for the top-k nearest neighbours of a query embedding.

    Args:
        query_embedding: 1-D query embedding (array-like). It is converted to a
            contiguous float32 row vector before being handed to the index.
        faiss_index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``, such as a FAISS index.
        content_dict: Iterable of dict records. Records that have an ``"id"``
            key are matched against the ids produced by ``id_mapping``.
        id_mapping: Translates a row position returned by the index into a
            record id. Either a mapping exposing ``.get`` or a sequence indexed
            by position.
        top_k: Number of neighbours to request from the index.

    Returns:
        list[dict]: One entry per neighbour that resolves to a record, in the
        order returned by the index. Each entry has the keys
        ``"file_name_chunk_no"`` (the record's ``"text_chunk"`` value, or the
        placeholder ``"No file_name_chunk_no available"``) and ``"distance"``.
    """
    # Search for the nearest neighbors
    query_matrix = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
    distances, indices = faiss_index.search(query_matrix, top_k)

    # Index the records by id once instead of rescanning content_dict per hit.
    # setdefault keeps the first record for a duplicated id (first-match semantics).
    records_by_id = {}
    for item in content_dict:
        if "id" in item:
            records_by_id.setdefault(item["id"], item)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        idx = int(idx)
        if idx < 0:
            # FAISS pads the result with -1 when fewer than top_k neighbours exist;
            # a negative value must not be used as a Python index (it would wrap around).
            continue

        if hasattr(id_mapping, "get"):
            # Mapping keys need not be dense, so no length-based bounds check here.
            result_id = id_mapping.get(idx)
        elif idx < len(id_mapping):
            result_id = id_mapping[idx]
        else:
            continue

        # Compare against None explicitly so falsy ids such as 0 remain valid.
        if result_id is None:
            continue

        item = records_by_id.get(result_id)
        if item is not None:
            results.append({
                "file_name_chunk_no": item.get("text_chunk", "No file_name_chunk_no available"),
                "distance": dist
            })
    return results

def validate_content_dict(content_dict):
    """Validate that the textual fields of every content record are strings.

    Both ``"file_name_chunk_no"`` and ``"text_chunk"`` are checked: the latter is
    the field ``perform_semantic_search`` reads and that is later joined into the
    prompt context, so a non-string value there would fail at join time.
    A missing key is treated as valid.

    Args:
        content_dict: Iterable of dict records.

    Raises:
        AssertionError: If a record holds a non-string value under either key.
            Raised explicitly so the check is not stripped under ``python -O``.
    """
    for item in content_dict:
        if not isinstance(item.get("file_name_chunk_no", ""), str):
            raise AssertionError(f"Non-string file_name_chunk_no found: {item}")
        if not isinstance(item.get("text_chunk", ""), str):
            raise AssertionError(f"Non-string text_chunk found: {item}")
    print("Content dictionary validation passed.")

def generate_response_with_rag(query, context, model="gpt-4"):
    """Generate an answer to ``query`` grounded in ``context`` via OpenAI chat completion.

    Args:
        query: The user's question.
        context: Retrieved text placed ahead of the question in the user message.
        model: Name of the chat model to call; defaults to ``"gpt-4"``.

    Returns:
        str: The content of the first choice in the response, or the fallback
        string ``"Unable to generate response."`` if the request fails for any
        reason (the error is printed rather than raised).
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions using the provided context."},
                {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Best-effort by design: the caller always receives a printable string.
        print(f"Error generating response: {e}")
        return "Unable to generate response."

if __name__ == "__main__":
    # Bring the persisted artifacts into memory: vector index, position-to-id
    # mapping, and the content records.
    faiss_index = load_faiss_index(FAISS_INDEX_PATH)
    id_mapping = load_id_mapping(ID_MAPPING_PATH)
    content_dict = load_content_dict(CONTENT_DICT_PATH)

    # Sanity-check the records before using them.
    validate_content_dict(content_dict)

    # Ask the user for a question and embed it.
    user_query = input("Enter your question or query: ")
    query_embedding = generate_embedding_for_query(user_query)

    if query_embedding is None:
        # Embedding failed (the helper already printed the reason); nothing to search with.
        print("Failed to generate query embedding.")
    else:
        # Retrieve the nearest stored chunks for the query.
        results = perform_semantic_search(query_embedding, faiss_index, content_dict, id_mapping, top_k=5)

        # Build the prompt context from the non-empty retrieved texts,
        # separated by blank lines.
        context = "\n\n".join(
            filter(None, (hit["file_name_chunk_no"] for hit in results))
        )

        # Ask the chat model to answer using that context, then show the answer.
        response = generate_response_with_rag(user_query, context)
        print("\nGenerated Response:\n")
        print(response)


Content dictionary validation passed.

Generated Response:

"Grounds v. Ralph" is a legal case that took place in 1875. The case was presented in the Arizona Supreme Court. The dispute involved money demand, where Ralph obtained a judgment for one hundred and fifty dollars in a lower court. Grounds appealed to the district court, where the judgment came in favor of Ralph again, but for sixty-two dollars and fifty cents only, along with costs. Grounds appealed again to the Arizona Supreme Court. The court discussed several legal principles and rules during the case, including grounds of objection, why they should be stated, and the appellate jurisdiction. The court concluded that the facts found by the court below are sufficient to sustain the judgment.



In [10]:
# Peek at the keys of the first content_dict record to see which fields it carries.
content_dict[0].keys()

dict_keys(['id', 'embedding'])

list

In [7]:
import os
# Define file paths
# The project directory can be overridden with the VECTOR_DB_PROJECT_DIR environment
# variable; when it is unset the paths below are identical to the previous hard-coded ones.
PROJECT_DIR = os.environ.get(
    "VECTOR_DB_PROJECT_DIR",
    "C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project",
)
FAISS_INDEX_PATH = f"{PROJECT_DIR}/faiss_index.index"
ID_MAPPING_PATH = f"{PROJECT_DIR}/id_mapping.pkl"
CONTENT_DICT_PATH = f"{PROJECT_DIR}/output_embeddings.pkl"

# Check if files exist
def check_file_exists(file_path, file_description):
    """Print whether ``file_path`` points at an existing regular file.

    ``os.path.isfile`` is used rather than ``os.path.exists`` so that a
    directory sitting at the expected location is not reported as a found file.

    Args:
        file_path: Path to test.
        file_description: Human-readable label used in the printed message.

    Returns:
        None. The outcome is reported only through the printed message.
    """
    if os.path.isfile(file_path):
        print(f"{file_description} found: {file_path}")
    else:
        print(f"Error: {file_description} not found at: {file_path}")

# Report on every artifact the pipeline needs, in a fixed order
for artifact_path, artifact_label in (
    (FAISS_INDEX_PATH, "FAISS index file"),
    (ID_MAPPING_PATH, "ID mapping file"),
    (CONTENT_DICT_PATH, "Content dictionary file"),
):
    check_file_exists(artifact_path, artifact_label)

FAISS index file found: C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project/faiss_index.index
ID mapping file found: C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project/id_mapping.pkl
Content dictionary file found: C:/Users/chiranjit usa/OneDrive - Northeastern University/Desktop/vector_database_project/output_embeddings.pkl
