In [None]:
#install oracle ads 
!pip install oracle-ads

In [None]:
import ads
from ads.dataset.factory import DatasetFactory
from ads.evaluations.evaluator import ADSEvaluator  
from ads.common.data import ADSData
from ads.common.model_artifact import ModelArtifact

In [None]:
!pip install oracle-automlx

In [None]:
import automlx as automl
from automlx import Pipeline
# Initialise the automlx package, selecting its 'local' engine.
automl.init(engine='local')

In [None]:
import os

# Echo the OCI identifiers exposed through the environment, one per line.
# A missing variable raises KeyError, which surfaces a misconfigured session early.
for env_name in (
    "NB_SESSION_COMPARTMENT_OCID",
    "PROJECT_OCID",
    "USER_OCID",
    "TENANCY_OCID",
    "NB_REGION",
):
    print(os.environ[env_name])

In [None]:
# Imported here so this configuration cell runs on its own: it needs `os` for
# the environment lookups and `oci` for the resource-principal signer.
import os

import oci

# Compartment id: defaults to the compartment of the running notebook session.
compartment_id = os.environ["NB_SESSION_COMPARTMENT_OCID"]
# OCI Generative AI inference endpoint
genai_endpoint = "https://inference.generativeai.us-ashburn-1.oci.oraclecloud.com"
# model_id for embedding
genai_embedding_model = "cohere.embed-english-v3.0"
# model_id for generation
oci_model = "cohere.command-r-plus"

# Pre-provisioned OpenSearch URL (do not change)
opensearch_url = "https://amaaaaaaedf3kyya4llgnrumbw6newbj5ihvy4cg64swrbiq3rgpufkuao4q.opensearch.us-ashburn-1.oci.oraclecloud.com:9200"
# OpenSearch username & password. The defaults are only valid during the
# live-lab; outside the lab supply real credentials through the
# OPENSEARCH_USERNAME / OPENSEARCH_PASSWORD environment variables instead of
# editing them into the notebook.
username = os.environ.get("OPENSEARCH_USERNAME", "adminos")
password = os.environ.get("OPENSEARCH_PASSWORD", "Asp12345$")
# Will change later with a different dataset
index_name = "train-inx"
auth = (username, password)
BULK_LIMIT = 10
AUTH_TYPE = "RESOURCE_PRINCIPAL"
# Will change later with a different dataset
file_path = 'data/medical-reasoning'

# Upper bound on the number of documents read during ingestion
# (the loaders treat a value <= 0 as "no limit").
MAX_DOCUMENTS = 1000

# Set up Resource Principal for authentication
auth_provider = oci.auth.signers.get_resource_principals_signer()
# Set up Oracle ADS for authentication
ads.set_auth("resource_principal")

In [None]:
# Use a Hugging Face sentence-transformers embedding model
# (there were issues accessing the OCI embedding model).
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

In [None]:
# Import from langchain_community, the package this notebook already uses for
# its embeddings; the `langchain.vectorstores` path is a deprecated alias.
from langchain_community.vectorstores import OpenSearchVectorSearch

# Initialize OpenSearch as the vector database. Documents are embedded with
# `embeddings`; the remaining keyword arguments carry the connection and
# authentication settings from the configuration cell.
vector_db = OpenSearchVectorSearch(
    opensearch_url=opensearch_url,
    index_name=index_name,
    embedding_function=embeddings,
    signer=auth_provider,
    auth_type=AUTH_TYPE,
    http_auth=auth,
)

In [None]:
# Updated function to process medical reasoning dataset
def process_medical_reasoning_data(file_path):
    """
    Load medical reasoning records from a JSONL file or a directory of JSONL files.

    Each JSONL line is expected to carry Question, Complex_CoT and Response
    fields; the per-file parsing is delegated to ``process_jsonl_file``.

    Args:
        file_path: Path to a single ``.jsonl`` file, or to a directory whose
            ``.jsonl`` files are read in sorted filename order.

    Returns:
        list: The documents accumulated across all files read. Empty when the
        path is neither a directory nor a ``.jsonl`` file, or when nothing
        could be read.
    """
    import os

    documents = []
    cnt = 0

    if os.path.isdir(file_path):
        # os.listdir returns entries in arbitrary order; sort so the subset
        # kept under MAX_DOCUMENTS is the same on every run.
        jsonl_names = sorted(
            name for name in os.listdir(file_path) if name.endswith('.jsonl')
        )
        if not jsonl_names:
            print(f"No .jsonl files found in directory: {file_path}")
        for filename in jsonl_names:
            print(f"Processing file: {filename}")
            cnt = process_jsonl_file(os.path.join(file_path, filename), documents, cnt)
            # Stop opening further files once the global cap has been reached.
            if MAX_DOCUMENTS > 0 and cnt >= MAX_DOCUMENTS:
                break
    elif file_path.endswith('.jsonl'):
        print(f"Processing single file: {file_path}")
        cnt = process_jsonl_file(file_path, documents, cnt)
    else:
        # Previously this case fell through silently and looked like an empty dataset.
        print(f"Skipping {file_path}: not a directory or a .jsonl file")

    print(f"Total documents processed: {cnt}")
    return documents

def process_jsonl_file(file_path, documents, cnt):
    """
    Append one Document per usable record of a JSONL file to ``documents``.

    For every line that parses to a JSON object, the Question, Complex_CoT and
    Response fields that are present are joined into a single text
    ("Question: ...", "Reasoning: ...", "Answer: ...") and stored as the
    document's page content. Lines that are blank, fail to parse, are not JSON
    objects, or yield no text are skipped.

    Args:
        file_path: Path of the JSONL file to read (UTF-8).
        documents: List that receives the created documents; mutated in place.
        cnt: Number of documents collected before this call.

    Returns:
        int: The updated document count. Reading stops early once the count
        reaches MAX_DOCUMENTS (when MAX_DOCUMENTS is positive).
    """
    # Imported locally: these names are not defined at module level, and the
    # imports inside process_medical_reasoning_data are local to that function.
    # Without them every call failed with a NameError that the broad
    # `except Exception` below swallowed, so no document was ever produced.
    import json
    from langchain.schema import Document

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_num, line in enumerate(file, 1):
                stripped = line.strip()
                if not stripped:
                    # Blank lines (e.g. a trailing newline) are not parse errors.
                    continue

                try:
                    # Parse each line as JSON
                    data = json.loads(stripped)
                except json.JSONDecodeError as e:
                    print(f"Error parsing line {line_num} in {file_path}: {e}")
                    continue

                if not isinstance(data, dict):
                    # A bare list/number/string has no fields to extract; skip
                    # it instead of aborting the rest of the file.
                    print(f"Skipping line {line_num} in {file_path}: expected a JSON object")
                    continue

                # Extract question, reasoning, and response
                question = data.get('Question', '')
                complex_cot = data.get('Complex_CoT', '')
                response = data.get('Response', '')

                # Keep only the sections that are present, in a fixed order.
                content_parts = []
                if question:
                    content_parts.append(f"Question: {question}")
                if complex_cot:
                    content_parts.append(f"Reasoning: {complex_cot}")
                if response:
                    content_parts.append(f"Answer: {response}")

                full_content = "\n\n".join(content_parts)
                if not full_content.strip():
                    continue  # Only add non-empty documents

                documents.append(Document(
                    page_content=full_content,
                    metadata={
                        "source": file_path,
                        "line_number": line_num,
                        # Store at most 100 characters of the question as a preview.
                        "question": question[:100] + "..." if len(question) > 100 else question
                    }
                ))
                cnt += 1

                if MAX_DOCUMENTS > 0 and cnt >= MAX_DOCUMENTS:
                    return cnt

    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        # Best effort: report the failure and keep whatever was collected so far.
        print(f"Error processing file {file_path}: {e}")

    return cnt

In [None]:
# Demo override: ingest only the first 100 documents. Comment this line out to
# fall back to the value from the configuration cell, or set it to zero or a
# negative number to ingest all documents.
MAX_DOCUMENTS = 100

# Process medical reasoning documents
documents = process_medical_reasoning_data(file_path)

# Spot-check the 1st, 51st and 100th documents when they exist,
# showing at most 500 characters of each.
print("\nValidate Processed documents by printing a few random documents:")
for position in (0, 50, 99):
    if len(documents) > position:
        print(f"\nDOCUMENT {position + 1}:\n{documents[position].page_content[:500]}...")

print(f"\nTotal Number of Documents to ingest: {len(documents)}")
print("Document structure - each contains Question, Reasoning, and Answer sections from medical cases")

In [None]:
# Check the index mapping: fetch the mapping of `index_name` through the
# vector store's OpenSearch client and print the raw response.
response = vector_db.client.indices.get_mapping(index=index_name)
print("Index Mapping:", response)

In [None]:
import numpy as np

# Semantic search with a raw k-NN query against the OpenSearch index
def retrieve_documents_with_embeddings(query, top_k=5, vector_field="vector_field", text_field="text"):
    """
    Embed ``query`` and return the nearest documents together with their stored vectors.

    Args:
        query: Natural-language query text.
        top_k: Number of neighbours requested, used both as the k-NN ``k`` and
            as the result ``size``.
        vector_field: Name of the index field holding the embeddings.
        text_field: Name of the index field holding the document text.

    Returns:
        list[tuple]: One ``(text, embedding)`` pair per hit, in the order the
        hits were returned. ``embedding`` is ``None`` when the hit's source
        does not include ``vector_field``.

    Raises:
        KeyError: If a hit's source has no ``text_field`` entry.
    """
    # Generate the embedding for the query using the vector store's embedding function
    query_embedding = vector_db.embedding_function.embed_query(query)

    # Round-trip through NumPy so the request body holds a plain Python list
    query_embedding = np.array(query_embedding).tolist()

    # Perform a k-NN search in OpenSearch
    search_results = vector_db.client.search(
        index=vector_db.index_name,
        body={
            "size": top_k,
            "query": {
                "knn": {
                    vector_field: {
                        "vector": query_embedding,
                        "k": top_k,
                    }
                }
            },
        },
    )

    return [
        (hit['_source'][text_field], hit['_source'].get(vector_field))
        for hit in search_results['hits']['hits']
    ]

In [None]:
# Example usage: retrieve the two nearest documents for a sample clinical question
query = "Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?"
documents_with_embeddings = retrieve_documents_with_embeddings(query, 2)

# Print each document's text; the embeddings are returned but not displayed
print(f"Top {len(documents_with_embeddings)} documents and their embeddings for the query: \"{query}\"")
for rank, (content, _embedding) in enumerate(documents_with_embeddings, start=1):
    print(f"\nDocument {rank}:")
    print(f"Content: {content}\n")

In [None]:
# Semantic search smoke test
def semantic_search_test(query, top_k=5):
    """
    Run a similarity search on the vector store and print every hit.

    Args:
        query: Natural-language query text.
        top_k: Number of results requested from the vector store.

    Returns:
        None. The header line and each hit's page content go to stdout.
    """
    hits = vector_db.similarity_search(query, k=top_k)

    # The header echoes the requested k, then one numbered entry per returned hit
    print(f"Top {top_k} results for the query: \"{query}\"")
    for rank, hit in enumerate(hits, start=1):
        print(f"\nResult {rank}:")
        print(f"Document: {hit.page_content}\n")

# Run a semantic search test: request the 5 nearest documents for a sample
# clinical question and print each one returned.
semantic_search_test("Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?", top_k=5)