## **Install Necessary Packages**

In [16]:
!pip install llama-index-vector-stores-moorcheh
!pip install llama-index-embeddings-openai
!pip install pandas
!pip install llama-index-readers-file
!pip install llama-index llama-index-llms-openai



In [17]:
# --- Import all necessary packages ---

import logging
import sys
import os
import csv
import pandas as pd
import time
from google.colab import userdata

from llama_index.vector_stores.moorcheh import MoorchehVectorStore
from IPython.display import Markdown, display
from typing import Any, Callable, Dict, List, Optional, cast
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Settings,
)
from llama_index.core.base.embeddings.base_sparse import BaseSparseEmbedding
from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.schema import BaseNode, MetadataMode, TextNode
from llama_index.core.vector_stores.types import (
    BasePydanticVectorStore,
    MetadataFilters,
    VectorStoreQuery,
    VectorStoreQueryMode,
    VectorStoreQueryResult,
)
from llama_index.core.vector_stores.utils import (
    DEFAULT_TEXT_KEY,
    legacy_metadata_dict_to_node,
    metadata_dict_to_node,
    node_to_metadata_dict,
)
from llama_index.core.vector_stores.types import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition,
)
from llama_index.llms.openai import OpenAI


## **Logging Setup**

In [18]:
# --- Logging Setup ---
# Route INFO-and-above records to stdout through exactly one root handler.
# `force=True` removes any handlers already attached to the root logger
# (from the hosting runtime or an earlier run of this cell) before
# configuring, so the level actually takes effect and re-running the cell
# never stacks handlers that would print each record more than once.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

## **Initialize the MoorchehClient**

In [19]:
# Credentials are read from the Colab `userdata` secrets; none are hard-coded.
MOORCHEH_API_KEY = userdata.get("MOORCHEH_API_KEY")
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")

# The OpenAI key is also exported to the process environment under the same name.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# OpenAI chat model handle.
llm = OpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)

# Run settings shared by the cells below.
namespace_name = "llamaindex_moorcheh"
documents_folder = "./documents"
namespace_type = "text"  # or vector
query_csv_path = "queries.csv"   # Path to your CSV file with queries
output_csv_path = "answers.csv"  # Where to save the results
top_k = 5

## **Prepare and Chunk the Documents**

In [20]:
# Read every file found under `documents_folder` into LlamaIndex documents.
documents = SimpleDirectoryReader(input_dir=documents_folder).load_data()

# --- Set chunk size and overlap ---
# Global settings; presumably applied when the documents are split into
# chunks during indexing (confirm against the LlamaIndex Settings docs).
Settings.chunk_size = 1024
Settings.chunk_overlap = 20

## **Upload the Document Chunks**

In [21]:
# --- Initialize the Moorcheh Vector Store ---
# Connects to the namespace named in the configuration cell, using the
# Moorcheh API key loaded there.
vector_store = MoorchehVectorStore(
    api_key=MOORCHEH_API_KEY,
    namespace=namespace_name,
    namespace_type=namespace_type,  # "text" or "vector", as set in the configuration cell
    vector_dimension=None,  # no dimension is supplied here; presumably only needed for "vector" namespaces
    add_sparse_vector=False,
    batch_size=100,
)

# --- Create a Vector Store Index using the Vector Store and given Documents ---
# The storage context makes the index use `vector_store` as its backing store
# for the loaded `documents`.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

[DEBUG] Initializing MoorchehClient
[DEBUG] Listing namespaces...
[DEBUG] Found namespaces: {'namespaces': [{'namespace_name': 'llamaindex_moorcheh1225', 'type': 'text', 'vector_dimension': None, 'createdAt': '2025-07-14T16:09:08.813Z', 'itemCount': 424}, {'namespace_name': 'local-benchmark-ns-1376', 'type': 'vector', 'vector_dimension': 1024, 'createdAt': '2025-06-23T19:22:45.867Z', 'itemCount': 20881}, {'namespace_name': 'local-benchmark-ns-2677', 'type': 'vector', 'vector_dimension': 1024, 'createdAt': '2025-06-30T16:29:48.517Z', 'itemCount': 31384}, {'namespace_name': 'local-benchmark-ns-3614', 'type': 'vector', 'vector_dimension': 1024, 'createdAt': '2025-06-23T19:34:57.338Z', 'itemCount': 10627}, {'namespace_name': 'local-benchmark-ns-3871', 'type': 'vector', 'vector_dimension': 1024, 'createdAt': '2025-06-23T18:44:37.030Z', 'itemCount': 6419}, {'namespace_name': 'local-benchmark-ns-4346', 'type': 'vector', 'vector_dimension': 1024, 'createdAt': '2025-06-23T20:55:25.526Z', 'itemC

## **Generate Answer**

In [22]:
# --- Answer every query from the input CSV and save the results ---
# Reads the "query" column of `query_csv_path`, asks the vector store for a
# generated answer to each query, and writes one row per successful query to
# `output_csv_path`.

queries_df = pd.read_csv(query_csv_path)

# The encoding is set explicitly so answers containing non-ASCII characters are
# written correctly whatever the platform's default encoding is; newline="" is
# what the csv module expects for files it writes.
with open(output_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["passage_id", "query", "generated_answers"])
    writer.writeheader()

    # passage_id is the query's position in the input column, so the id of a
    # failed query is simply absent from the output file.
    for idx, q in enumerate(queries_df["query"]):
        print(f"Processing: {q}")
        try:
            response = vector_store.get_generative_answer(query = q, ai_model = "anthropic.claude-3-7-sonnet-20250219-v1:0", llm=None)
            time.sleep(0.5)  # brief pause between consecutive requests
            writer.writerow({
                "passage_id": idx,
                "query": q,
                "generated_answers": response
            })
            f.flush()  # keep completed rows on disk even if the kernel dies mid-run
        except Exception as e:
            # Best effort: report the failure and move on to the next query;
            # no row is written for this one.
            print(f"Error for query '{q}':", e)

Processing: What is Tuberculosis?
Processing: What is the Infection Rate of Tuberculosis?
Processing: What are some of the main causes of Tuberculosis?
Processing: In which region of the world is Tuberculosis the most prevalent?
Processing: What are some of the main demographics affected by Tuberculosis?
Processing: What are the latest advancements in KRAS-targeted therapies for pancreatic cancer?
Processing: Summarize outcomes from clinical trials comparing mFOLFIRINOX vs. gemcitabine-based regimens.
Processing: What role does tumor microenvironment play in pancreatic cancer treatment resistance?
Processing: What is the TNM stage of a 2.3 cm tumor in the head of the pancreas with involvement of 2 regional lymph nodes and no metastasis?
Processing: Which biomarkers are commonly elevated in early-stage pancreatic adenocarcinoma?
Processing: What are the imaging features that distinguish pancreatic cancer from autoimmune pancreatitis?
