In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [59]:
import numpy as np
import minsearch
import json
from tqdm.auto import tqdm

import pandas as pd
from langchain_core.prompts import PromptTemplate,ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List

from langchain.output_parsers import PydanticOutputParser,RetryOutputParser
from langchain_core.exceptions import OutputParserException

from langchain_community.llms import Ollama

from datasets import load_dataset
from collections import defaultdict
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import re

import os
from dotenv import load_dotenv
import openai

load_dotenv()

True

In [2]:
QUERY_GENERATED_FILE = '../data/queries_generated.parquet'
GEMMA2_QUERY_FILE = "../data/gemma2_queries.parquet"
LLAMA3_QUERY_FILE = "../data/llama3_queries.parquet"
QUERY_EVALUATED_FILE = '../data/queries_evaluated.parquet'
RAG_EVALUDATION_RESULT_FILE = '../data/Evaluation_results_2.csv'

## Ingestion

In [3]:
# Read the DataFrame from a Parquet file
df_org = pd.read_parquet(QUERY_EVALUATED_FILE)

# Create a new column 'true_answer' by concatenating the first two answers from the 'answers' list
df_org['true_answer'] = df_org['answers'].apply(
    lambda answer_list: ' '.join([answer['answer'] for answer in answer_list[:2] if 'answer' in answer]))

def prepare_df(df_org):
    # Create a DataFrame for ground truth queries for Gemma
    # Explode the 'gemma_queries' column to create a separate row for each query
    ground_truth_gemma_df = df_org[['question_id', 'gemma_queries']]
    ground_truth_gemma_df = ground_truth_gemma_df.explode(['gemma_queries'])
    
    # Create a DataFrame for ground truth queries for Llama
    ground_truth_llama_df = df_org[['question_id', 'llama3_queries']]
    ground_truth_llama_df = ground_truth_llama_df.explode(['llama3_queries'])
    
    # Create a DataFrame with relevant question details and the true answer
    quest_ans_df = df_org[['question_id', 'question_title', 'question', 'true_answer']]
    
    # Drop unnecessary columns from the original DataFrame
    df = df_org.drop(['gemma_queries', 'gemma_queries_llama3_feedback', 'gemma_queries_llama3_score',
                      'llama3_queries', 'llama3_queries_gemma_score', 'llama3_queries_gemma_feedback',
                      'true_answer'], axis=1)

    # Assign the 'answers' column to a new DataFrame and explode it to create separate rows for each answer
    df_exploded = df.assign(answers=df['answers']).explode('answers')

    # Create an 'answer_index' column to track the position of each answer for filtering
    df_exploded['answer_index'] = df_exploded.groupby('question_id').cumcount()
    # Filter to keep only the first 2 answers for each question and drop the 'answer_index' column
    df_filtered = df_exploded[df_exploded['answer_index'] < 2].drop('answer_index', axis=1)

    # Expand the answers dictionary into separate columns, creating new columns for each key in the answer dictionary
    df_expanded = df_filtered['answers'].apply(pd.Series)

    # Concatenate the original DataFrame (without 'answers') with the expanded answers
    raw_df = pd.concat([df_filtered.drop('answers', axis=1), df_expanded], axis=1)
    
    return raw_df, quest_ans_df, ground_truth_gemma_df, ground_truth_llama_df

df,quest_ans_df, ground_truth_gemma_df, ground_truth_llama_df, = prepare_df(df_org)
df.to_csv("../services/app/Mental_wellness_data.csv",index=False)
documents = df.to_dict(orient='records')
documents[0]

{'question_id': 0,
 'question_title': 'Do I have too many issues for counseling',
 'question': 'I have so many issues to address. I have a history of sexual abuse, I‚Äôm a breast cancer survivor and I am a lifetime insomniac. I have a long history of depression and I‚Äôm beginning to have anxiety. I have low self esteem but I‚Äôve been happily married for almost 35 years. I‚Äôve never had counseling about any of this. Do I have too many issues to address in counseling',
 'question_link': 'https://counselchat.com/questions/do-i-have-too-many-issues-for-counseling',
 'topic': 'depression',
 'answer': 'It is very common for people to have multiple issues that they want to (and need to) address in counseling. I have had clients ask that same question and through more exploration, there is often an underlying fear that they "can\'t be helped" or that they will "be too much for their therapist." I don\'t know if any of this rings true for you. But, most people have more than one problem in t

## Azure Content Safety

In [4]:
#https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/contentsafety/azure-ai-contentsafety/samples/sample_analyze_text.py
from azure.ai.contentsafety import ContentSafetyClient
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from azure.ai.contentsafety.models import TextCategory
from azure.ai.contentsafety.models import AnalyzeTextOptions

class ContentSafetyManager:
    def __init__(self):
        self.client = ContentSafetyClient(
            endpoint=os.getenv("AZURE_CONTENT_SAFETY_ENDPOINT"),
            credential=AzureKeyCredential(os.getenv("AZURE_CONTENT_SAFETY_KEY"))
        )
    
    def safety_check(self, text, context="unknown"):
        """Check text for safety violations"""
        try:
            request = AnalyzeTextOptions(text=text)
            response = self.client.analyze_text(request)
            
            hate_result = next(item for item in response.categories_analysis if item.category == TextCategory.HATE)
            self_harm_result = next(item for item in response.categories_analysis if item.category == TextCategory.SELF_HARM)
            sexual_result = next(item for item in response.categories_analysis if item.category == TextCategory.SEXUAL)
            violence_result = next(item for item in response.categories_analysis if item.category == TextCategory.VIOLENCE)

            if hate_result:
                print(f"Hate severity: {hate_result.severity}")
            if self_harm_result:
                print(f"SelfHarm severity: {self_harm_result.severity}")
            if sexual_result:
                print(f"Sexual severity: {sexual_result.severity}")
            if violence_result:
                print(f"Violence severity: {violence_result.severity}")
        
            # Check for any harmful content
            for category in response.categories_analysis:
                if category.severity > 0:
                    print(f"!!! Safety violation in {context}: {category.category} (severity {category.severity})")
                    return False, category.category
            
            return True, "safe"
            
        except Exception as e:
            print(f"Content safety check failed for {context}: {e}")
            return False, "check_failed"

In [49]:
safety_manager = ContentSafetyManager()
user_query = "Tell me something harmful"
is_safe, reason = safety_manager.safety_check(user_query, "user_input")
print(is_safe, reason)

print()
safety_manager = ContentSafetyManager()
user_query = "Tell me how to cope with depression"
is_safe, reason = safety_manager.safety_check(user_query, "user_input")
print(is_safe, reason)

Hate severity: 0
SelfHarm severity: 2
Sexual severity: 0
Violence severity: 0
!!! Safety violation in user_input: SelfHarm (severity 2)
False SelfHarm

Hate severity: 0
SelfHarm severity: 0
Sexual severity: 0
Violence severity: 0
True safe


In [12]:
!pip install azureml-core azureml azureml-mlflow

Collecting azureml-mlflow
  Downloading azureml_mlflow-1.60.0.post1-py3-none-any.whl.metadata (2.8 kB)
Collecting mlflow-skinny<3.0.0 (from azureml-mlflow)
  Downloading mlflow_skinny-2.22.2-py3-none-any.whl.metadata (31 kB)
Collecting azure-storage-blob<=12.19.0,>=12.5.0 (from azureml-mlflow)
  Downloading azure_storage_blob-12.19.0-py3-none-any.whl.metadata (26 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny<3.0.0->azureml-mlflow)
  Using cached cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Downloading azureml_mlflow-1.60.0.post1-py3-none-any.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   -----

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
azure-storage-file-datalake 12.22.0 requires azure-storage-blob>=12.27.0, but you have azure-storage-blob 12.19.0 which is incompatible.
mlflow 3.6.0 requires mlflow-skinny==3.6.0, but you have mlflow-skinny 2.22.2 which is incompatible.


In [5]:
### Setup MLFlow using Azure MLWorkspace for experiment tracking
import mlflow
import os
from azure.identity import ClientSecretCredential
from azure.ai.ml import MLClient
from azureml.core import Workspace

def setup_azure_ml_mlflow():
    """Setup MLflow with Azure ML workspace using Service Principal"""
    
    # Set Service Principal credentials
    tenant_id = os.getenv("AZURE_TENANT_ID")
    client_id = os.getenv("AZURE_CLIENT_ID") 
    client_secret = os.getenv("AZURE_CLIENT_SECRET")
    subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")
    resource_group = os.getenv("AZURE_RESOURCE_GROUP")
    workspace_name = os.getenv("AZURE_ML_WORKSPACE_NAME")
    
    # Create credential
    credential = ClientSecretCredential(
        tenant_id=tenant_id,
        client_id=client_id,
        client_secret=client_secret
    )
    
    # Create ML Client
    ml_client = MLClient(
        credential=credential,
        subscription_id=subscription_id,
        resource_group_name=resource_group,
        workspace_name=workspace_name
    )
    
    # Get Azure ML workspace
    mlflow_tracking_uri = ml_client.workspaces.get(workspace_name).mlflow_tracking_uri
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    
    print("Azure ML + MLflow Setup Complete:")
    print(f"   Workspace: {workspace_name}")
    print(f"   Subscription: {subscription_id}")
    print(f"   Resource Group: {resource_group}")
    print(f"   MLflow Tracking URI: {mlflow_tracking_uri}")
    
    return ml_client

def setup_mlflow_experiment():
    """Setup MLflow for RAG evaluation experiments with Azure ML"""
    
    # Setup Azure ML connection
    ml_client = setup_azure_ml_mlflow()
    
    # Create or set experiment
    experiment_name = "RAG-Chatbot-Evaluation"
    mlflow.set_experiment(experiment_name)
    
    print(f"MLflow experiment setup complete: {experiment_name}")
    
setup_mlflow_experiment()

  mlflow.mismatch._check_version_mismatch()
Class DeploymentTemplateOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Azure ML + MLflow Setup Complete:
   Workspace: RAG_demo
   Subscription: 81773e48-99ce-48da-a59b-605785d14817
   Resource Group: ai-grp
   MLflow Tracking URI: azureml://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo
MLflow experiment setup complete: RAG-Chatbot-Evaluation


## Create embeddings using pretrained models

In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
q = "How can i get emotional support?"
v = model.encode(q)
len(v)

384

## Ingest data to AzureSearch (vector search)

In [34]:
#https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search
import os
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SearchableField
)

from tqdm import tqdm
import time

def connect_to_azure_search():
    """
    Connect to Azure AI Search using either API key or Azure AD authentication
    """
    for _ in range(10):  # Retry up to 10 times
        try:
            # Option 1: Using API Key (get from Azure portal)
            service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT") 
            index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
            api_key = os.getenv("AZURE_SEARCH_API_KEY")
            
            index_client = SearchIndexClient(service_endpoint, AzureKeyCredential(api_key))
                
            # Create search client
            search_client = SearchClient(
                endpoint=service_endpoint,
                index_name=index_name,
                credential=AzureKeyCredential(api_key)
            )
            
            # Option 2: Using Azure AD Authentication (recommended for production)
            # credential = DefaultAzureCredential()
            # search_client = SearchClient(service_endpoint, index_name, credential)
            
            return search_client,index_client, index_name
            
        except Exception as e:
            print(f"Connection failed, retrying... ({e})")
            time.sleep(10)
    raise Exception("Failed to connect to Azure AI Search after several retries")
    

In [35]:
def create_azure_search_index(index_client, index_name):
    """
    Create Azure AI Search index with proper configuration
    """
    try:
        # Define fields - CRITICAL: 'id' field is required as key
        fields = [
            SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True),
            SimpleField(name="question_id", type=SearchFieldDataType.String, filterable=True, searchable=True),
            SearchableField(name="answer", type=SearchFieldDataType.String, searchable=True, analyzer_name="en.microsoft"),
            SearchableField(name="question", type=SearchFieldDataType.String, searchable=True, analyzer_name="en.microsoft"),
            SearchableField(name="question_title", type=SearchFieldDataType.String, searchable=True, analyzer_name="en.microsoft"),
            SearchableField(name="therapist_info", type=SearchFieldDataType.String, searchable=True, analyzer_name="en.microsoft"),
            SearchField(
                name="question_title_vector",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=384,
                vector_search_profile_name="my-vector-profile"
            ),
            SearchField(
                name="question_vector", 
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=384,
                vector_search_profile_name="my-vector-profile"
            ),
            SearchField(
                name="answer_vector",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=384,
                vector_search_profile_name="my-vector-profile"
            ),
            SearchField(
                name="question_answer_vector",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=384,
                vector_search_profile_name="my-vector-profile"
            ),
        ]

        # Configure vector search
        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="my-hnsw-config",
                    kind="hnsw",
                    parameters={
                        "m": 4,
                        "efConstruction": 400,
                        "efSearch": 500,
                        "metric": "cosine"
                    }
                )
            ],
            profiles=[
                VectorSearchProfile(
                    name="my-vector-profile",
                    algorithm_configuration_name="my-hnsw-config",
                )
            ]
        )

        # Create the index
        index = SearchIndex(
            name=index_name,
            fields=fields,
            vector_search=vector_search
        )
        
        # Create or update index
        result = index_client.create_or_update_index(index)
        print(f"Index '{result.name}' created/updated successfully")
        return result
        
    except Exception as e:
        print(f"Error creating index: {e}")
        raise
        
def create_embeddings_index_azure(search_client, documents):
    """
    Azure indexing - Convert ALL fields that are defined as strings in Azure
    """
    processed_docs = []
    
    print("Azure Indexing (converting ALL string fields)...")
    
    for doc in tqdm(documents, desc="Processing documents"):
        try:
            # CRITICAL: Convert ALL fields that are defined as strings in Azure schema
            question_id = str(doc.get('question_id', ''))  # Convert to string
            question_title = str(doc.get('question_title', ''))
            question = str(doc.get('question', ''))
            answer = str(doc.get('answer', ''))
            therapist_info = str(doc.get('therapist_info', ''))
            
            # Skip if missing critical fields
            if not question_id:
                print(f"Skipping document with no question_id: {doc}")
                continue
            
            # Create combined text for embeddings
            qa = f"{question_title} {question} {answer}"
            
            # Create embeddings
            question_title_vector = model.encode(question_title).tolist()
            question_vector = model.encode(question).tolist()
            answer_vector = model.encode(answer).tolist()
            question_answer_vector = model.encode(qa).tolist()
            
            # ALL non-vector fields must be strings (based on your Azure schema)
            processed_doc = {
                "id": question_id,  # Already string
                "question_id": question_id,  # MUST be string (not int)
                "answer": answer,
                "question": question, 
                "question_title": question_title,
                "therapist_info": therapist_info,
                "question_title_vector": question_title_vector,
                "question_vector": question_vector,
                "answer_vector": answer_vector,
                "question_answer_vector": question_answer_vector
            }
            
            processed_docs.append(processed_doc)
            
        except Exception as e:
            print(f"Error processing document: {e}")
            continue

    print(f"Processed {len(processed_docs)} documents")
    
    if not processed_docs:
        print("No documents to upload!")
        return 0
    
    # Upload in batches
    batch_size = 50
    successful_uploads = 0
    
    for i in range(0, len(processed_docs), batch_size):
        batch = processed_docs[i:i + batch_size]
        batch_num = (i // batch_size) + 1
        
        try:
            print(f"Uploading batch {batch_num} ({len(batch)} documents)...")
            
            result = search_client.upload_documents(documents=batch)
            
            batch_success = sum(1 for r in result if r.succeeded)
            successful_uploads += batch_success
            
            if batch_success == len(batch):
                print(f"Batch {batch_num}: All {len(batch)} uploaded")
            else:
                print(f"Batch {batch_num}: {batch_success}/{len(batch)} uploaded")
                
        except Exception as e:
            print(f"Batch {batch_num} upload failed: {e}")
            # Show sample of what we're trying to upload
            if batch:
                sample = batch[0]
                print(f"   Sample document types:")
                for key, value in sample.items():
                    if 'vector' not in key:
                        print(f"     {key}: {type(value)} = {str(value)[:50]}...")

    print(f"Upload completed: {successful_uploads}/{len(processed_docs)} documents")
    
    if successful_uploads > 0:
        verify_azure_data(search_client, processed_docs[:3])
    
    return successful_uploads


def verify_azure_data(search_client, sample_docs):
    """Verify data was uploaded correctly"""
    print("\n VERIFYING UPLOADED DATA")
    print("=" * 30)
    
    for doc in sample_docs:
        doc_id = doc['id']
        try:
            results = list(search_client.search(
                filter=f"id eq '{doc_id}'",
                top=1,
                select=["id", "question_id", "question"]
            ))
            
            if results:
                retrieved = results[0]
                print(f" Found: ID={retrieved['id']}, QID={retrieved['question_id']}")
                print(f"   Question: {retrieved.get('question', '')[:60]}...")
            else:
                print(f"NOT found: ID={doc_id}")
                
        except Exception as e:
            print(f"Error verifying {doc_id}: {e}")



### Vector Search AzureSearch

In [36]:
from azure.search.documents.models import VectorizedQuery

def azure_search(
    query_text, 
    search_type="vector",  # "vector", "hybrid", "text", OR "combined" (new!)
    vector_field="question_answer_vector", 
    k=5,
    search_fields=None,
    select_fields=None
):
    """
    Enhanced Azure search with Elasticsearch-style COMBINED vector search
    """
    search_client, _, _ = connect_to_azure_search()
    
    # Default fields
    if select_fields is None:
        select_fields = ["question_id", "question_title", "question", "answer", "therapist_info"]
    
    if search_fields is None:
        search_fields = ["question_title", "question", "answer"]
    
    # Initialize parameters
    search_text = ""
    vector_queries = []
    
    # NEW: Elasticsearch-style COMBINED vector search
    if search_type == "combined":
        query_vector = model.encode(query_text).tolist()
        
        # Create multiple vector queries like Elasticsearch's combined scoring
        vector_queries = [
            VectorizedQuery(
                vector=query_vector,
                k_nearest_neighbors=k,
                fields="question_title_vector",
                kind="vector"
            ),
            VectorizedQuery(
                vector=query_vector,
                k_nearest_neighbors=k,
                fields="question_vector", 
                kind="vector"
            ),
            VectorizedQuery(
                vector=query_vector,
                k_nearest_neighbors=k,
                fields="answer_vector",
                kind="vector"
            ),
            VectorizedQuery(
                vector=query_vector,
                k_nearest_neighbors=k, 
                fields="question_answer_vector",
                kind="vector"
            )
        ]
        search_text = ""  # Pure vector search
    
    # Handle regular vector search
    elif search_type in ["vector", "hybrid"]:
        query_vector = model.encode(query_text).tolist()
        vector_query = VectorizedQuery(
            vector=query_vector,
            k_nearest_neighbors=k,
            fields=vector_field,
            kind="vector"
        )
        vector_queries = [vector_query]
    
    # Handle text search  
    if search_type in ["hybrid", "text"]:
        search_text = query_text
    
    # Perform search
    try:
        results = search_client.search(
            search_text=search_text,
            vector_queries=vector_queries,
            search_fields=search_fields if search_type not in ["vector", "combined"] else None,
            select=select_fields,
            top=k
        )
        
        result_docs = list(results)
        return result_docs
        
    except Exception as e:
        print(f"Search error: {e}")
        return []

# Quick wrappers for common use cases
def azure_vector_search(query, k=5):
    return azure_search(query, "vector", k=k)

def azure_hybrid_search( query, k=5):
    return azure_search(query, "hybrid", k=k)

def azure_text_search(query, k=5):
    return azure_search(query, "text", k=k)

def azure_combined_search(query, k=5):
    """Elasticsearch-style combined vector search"""
    return azure_search(query, "combined", k=k)

In [37]:
# Main execution
import os
from dotenv import load_dotenv
import openai

load_dotenv()

def connect_azure_store_embeddings():
    
    # Create index client for management operations
    search_client,index_client, index_name = connect_to_azure_search()
    # Create index (run once)
    create_azure_search_index(index_client, index_name)
    
    # Load your documents (using your existing prepare_df function)
    df, quest_ans_df, ground_truth_gemma_df, ground_truth_llama_df = prepare_df(df_org)
    documents = df.to_dict(orient='records')
    
    # Ingest data with embeddings
    create_embeddings_index_azure(search_client, documents)


connect_azure_store_embeddings()

Index 'rag_search_db_demo' created/updated successfully
Azure Indexing (converting ALL string fields)...


Processing documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 99/99 [00:07<00:00, 12.52it/s]


Processed 99 documents
Uploading batch 1 (50 documents)...
Batch 1: All 50 uploaded
Uploading batch 2 (49 documents)...
Batch 2: All 49 uploaded
Upload completed: 99/99 documents

 VERIFYING UPLOADED DATA
 Found: ID=0, QID=0
   Question: I have so many issues to address. I have a history of sexual...
 Found: ID=0, QID=0
   Question: I have so many issues to address. I have a history of sexual...
 Found: ID=1, QID=1
   Question: I have been diagnosed with general anxiety and depression by...


In [38]:
# Searching examples
def search_azure():
    query = "How can i come out of depression?"
    
    print(f" Searching for: '{query}'\n")
    
    # 1. Pure Vector Search (semantic similarity)
    vector_results = azure_vector_search(query)
    print(f"Vector search found: {len(vector_results)} results")
    
    # Extract question IDs from vector results
    vector_question_ids = [result['question_id'] for result in vector_results]
    print(f"Question IDs: {vector_question_ids}")
    print()
    
    # 2. Hybrid Search (best of both worlds)  
    hybrid_results = azure_hybrid_search(query)
    print(f"Hybrid search found: {len(hybrid_results)} results")
    
    # Extract question IDs from hybrid results
    hybrid_question_ids = [result['question_id'] for result in hybrid_results]
    print(f"Question IDs: {hybrid_question_ids}")
    question = [result['question'] for result in hybrid_results]
    print(f" Question : {question}")
    print()
    
    # 3. Pure Text Search (traditional keyword)
    text_results = azure_text_search(query)
    print(f"Text search found: {len(text_results)} results")
    
    # Extract question IDs from text results
    text_question_ids = [result['question_id'] for result in text_results]
    question = [result['question'] for result in text_results]
    print(f"   Question IDs: {text_question_ids}")
    print(f"   Question : {question}")
    
# Run the search
results = search_azure()

 Searching for: 'How can i come out of depression?'

Vector search found: 5 results
Question IDs: ['20', '44', '30', '17', '52']

Hybrid search found: 5 results
Question IDs: ['20', '44', '17', '30', '52']
 Question : ["I'm in my late teens and live with my dad. The only time I go out is for my college classes. Sometimes when I see my friends I want to talk with them, but sometimes I won't want to talk to them for days or even weeks. Sometimes I feel i'm not worth knowing or i'm never going to do anything right. Are they right, am I depressed", "I have terrible anxiety and depression. I've tried various therapists and pills, but nothing's helped", 'I struggle with depression as well as pretty intense mood swings throughout the month. I experience highs where I feel amazing and energetic and then lows where I lack focus, energy, and generally have a more dark outlook on my life. How can I live a more balanced life', "In the last ten months, I've been kicked out, moved around three times

## Retrieval evaluation for different searches

In [39]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)
    
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function, col_name, boost=None):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['question_id']
        if(search_function == minsearch_search):
            results = search_function(q[col_name], boost)
        else:
            results = search_function(q[col_name])
            
        # FIX: Convert Azure string IDs to int for comparison with ground truth
        relevance = [int(d['question_id']) == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [42]:
def evaluate_with_mlflow_tracking(ground_truth, search_function, col_name, search_type, model_type, boost=None):
    """
    Enhanced evaluation with Azure ML MLflow tracking - FIXED VERSION
    """
    relevance_total = []
    
    run_name = f"{search_type}_{model_type}"
    
    with mlflow.start_run(run_name=run_name, nested=True):
        try:
            # Log parameters to Azure ML
            mlflow.log_params({
                'search_type': search_type,
                'model_type': model_type,
                'col_name': col_name,
                'ground_truth_size': len(ground_truth),
                'search_function': search_function.__name__,
                'embedding_model': 'multi-qa-MiniLM-L6-cos-v1',
                'vector_dimensions': 384,
                'retrieval_backend': 'Azure Cognitive Search',
                'authentication_method': 'service_principal'
            })
            
            # FIXED: Convert Azure string IDs to int for comparison with ground truth
            for q in tqdm(ground_truth, desc=f"Evaluating {search_type}"):
                doc_id = q['question_id']  # This is int from ground truth
                query_text = q[col_name]
                
                if search_function == minsearch_search:
                    results = search_function(query_text, boost)
                else:
                    results = search_function(query_text)
                
                # FIX: Convert Azure string IDs to int for comparison
                relevance = [int(d['question_id']) == doc_id for d in results]
                relevance_total.append(relevance)

            # Calculate metrics
            hit_rate_val = hit_rate(relevance_total)
            mrr_val = mrr(relevance_total)
            
            # Log metrics to Azure ML
            mlflow.log_metrics({
                'hit_rate': hit_rate_val,
                'mrr': mrr_val,
                'total_queries_evaluated': len(ground_truth)
            })
            
            # Log additional Azure ML specific tags
            mlflow.set_tags({
                'project': 'medical-rag-chatbot',
                'team': 'ai-engineering',
                'environment': 'poc',
                'azure_ml_tracked': 'true'
            })
            
            print(f" Azure ML logged: {search_type} on {model_type}")
            print(f"   Hit Rate: {hit_rate_val:.4f}, MRR: {mrr_val:.4f}")
            
            return {
                'hit_rate': hit_rate_val,
                'mrr': mrr_val,
                'search_type': search_type,
                'model_type': model_type,
                'run_id': mlflow.active_run().info.run_id
            }
            
        except Exception as e:
            mlflow.log_param('error', str(e))
            print(f"Evaluation failed: {e}")
            raise e
            
def run_retrieval_comparison_experiment():
    """
    Run comprehensive RAG evaluation with Azure ML tracking
    """
    # Parent run for the entire experiment
    with mlflow.start_run(run_name="RAG_Strategy_Comparison"):
        mlflow.log_params({
            "experiment_type": "search_strategy_comparison",
            "evaluation_framework": "custom_rag_metrics", 
            "azure_ml_integration": "true",
            "authentication": "service_principal"
        })
        
        all_results = []
        
        print("Starting Comprehensive RAG Evaluation with Azure ML...")
        
        # Test on Gemma-generated queries
        ground_truth_gemma = ground_truth_gemma_df.to_dict(orient='records')
        print(f"Evaluating on {len(ground_truth_gemma)} Gemma queries")
        
        for search_func, search_type in [
            (azure_text_search, "text_search"),
            (azure_vector_search, "vector_search"), 
            (azure_hybrid_search, "hybrid_search"),
            (azure_combined_search, "combined_search")
        ]:
            print(f"  Testing {search_type}...")
            result = evaluate_with_mlflow_tracking(
                ground_truth_gemma, 
                search_func, 
                'gemma_queries', 
                search_type, 
                'gemma'
            )
            all_results.append(result)
        
        # Test on Llama-generated queries  
        ground_truth_llama = ground_truth_llama_df.to_dict(orient='records')
        print(f"Evaluating on {len(ground_truth_llama)} Llama queries")
        
        for search_func, search_type in [
            (azure_text_search, "text_search"),
            (azure_vector_search, "vector_search"),
            (azure_hybrid_search, "hybrid_search"),
            (azure_combined_search, "combined_search")
        ]:
            print(f"  Testing {search_type}...")
            result = evaluate_with_mlflow_tracking(
                ground_truth_llama, 
                search_func, 
                'llama3_queries', 
                search_type, 
                'llama'
            )
            all_results.append(result)
        
        # Log final comparison metrics
        best_by_hit_rate = max(all_results, key=lambda x: x['hit_rate'])
        best_by_mrr = max(all_results, key=lambda x: x['mrr'])
        
        mlflow.log_metrics({
            'best_hit_rate': best_by_hit_rate['hit_rate'],
            'best_mrr': best_by_mrr['mrr']
        })
        
        mlflow.log_params({
            'best_hit_rate_config': f"{best_by_hit_rate['search_type']}_{best_by_hit_rate['model_type']}",
            'best_mrr_config': f"{best_by_mrr['search_type']}_{best_by_mrr['model_type']}"
        })
        
        print("\n AZURE ML EXPERIMENT RESULTS:")
        print("=" * 50)
        for result in all_results:
            print(f"{result['search_type']:15} | {result['model_type']:6} | "
                  f"HR: {result['hit_rate']:.4f} | MRR: {result['mrr']:.4f}")
        
        return all_results
    
run_retrieval_comparison_experiment()

Starting Comprehensive RAG Evaluation with Azure ML...
Evaluating on 214 Gemma queries
  Testing text_search...


Evaluating text_search: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 214/214 [00:20<00:00, 10.49it/s]


 Azure ML logged: text_search on gemma
   Hit Rate: 0.7290, MRR: 0.5565
üèÉ View run text_search_gemma at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/2682519b-ed6e-4e91-a308-97c48c269f9a
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525
  Testing vector_search...


Evaluating vector_search: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 214/214 [00:22<00:00,  9.32it/s]


 Azure ML logged: vector_search on gemma
   Hit Rate: 0.8505, MRR: 0.7113
üèÉ View run vector_search_gemma at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/01e0d6f7-e71e-4982-b58d-5f38c279d58d
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525
  Testing hybrid_search...


Evaluating hybrid_search: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 214/214 [00:25<00:00,  8.46it/s]


 Azure ML logged: hybrid_search on gemma
   Hit Rate: 0.8598, MRR: 0.7022
üèÉ View run hybrid_search_gemma at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/9cbd1cb5-1ae2-42ca-9179-091185444d47
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525
  Testing combined_search...


Evaluating combined_search: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 214/214 [00:30<00:00,  7.02it/s]


 Azure ML logged: combined_search on gemma
   Hit Rate: 0.8551, MRR: 0.6822
üèÉ View run combined_search_gemma at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/43a1785a-f7e6-43e1-a8b8-d7c1b70a7dbf
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525
Evaluating on 260 Llama queries
  Testing text_search...


Evaluating text_search: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 260/260 [00:25<00:00, 10.04it/s]


 Azure ML logged: text_search on llama
   Hit Rate: 0.6192, MRR: 0.4719
üèÉ View run text_search_llama at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/a2a19a05-71e8-4902-b824-5380831a7407
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525
  Testing vector_search...


Evaluating vector_search: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 260/260 [00:27<00:00,  9.35it/s]


 Azure ML logged: vector_search on llama
   Hit Rate: 0.8077, MRR: 0.6324
üèÉ View run vector_search_llama at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/efde7c7e-d71c-40b2-8cc7-07cb5985e76f
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525
  Testing hybrid_search...


Evaluating hybrid_search: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 260/260 [00:32<00:00,  7.98it/s]


 Azure ML logged: hybrid_search on llama
   Hit Rate: 0.8077, MRR: 0.6028
üèÉ View run hybrid_search_llama at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/528f728b-7449-402e-a30c-f1ccdb73e3a0
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525
  Testing combined_search...


Evaluating combined_search: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 260/260 [00:37<00:00,  6.90it/s]


 Azure ML logged: combined_search on llama
   Hit Rate: 0.8269, MRR: 0.6211
üèÉ View run combined_search_llama at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/97243d3c-c89e-4c4c-a466-c9684233d44c
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525

 AZURE ML EXPERIMENT RESULTS:
text_search     | gemma  | HR: 0.7290 | MRR: 0.5565
vector_search   | gemma  | HR: 0.8505 | MRR: 0.7113
hybrid_search   | gemma  | HR: 0.8598 | MRR: 0.7022
combined_search | gemma  | HR: 0.8551 | MRR: 0.6822
text_search     | llama  | HR: 0.6192 | MRR: 0.4719
vector_search   | llama  | HR: 0.8077 | MRR: 0.6324


[{'hit_rate': 0.7289719626168224,
  'mrr': 0.5564641744548285,
  'search_type': 'text_search',
  'model_type': 'gemma',
  'run_id': '2682519b-ed6e-4e91-a308-97c48c269f9a'},
 {'hit_rate': 0.8504672897196262,
  'mrr': 0.7112928348909656,
  'search_type': 'vector_search',
  'model_type': 'gemma',
  'run_id': '01e0d6f7-e71e-4982-b58d-5f38c279d58d'},
 {'hit_rate': 0.8598130841121495,
  'mrr': 0.7021806853582555,
  'search_type': 'hybrid_search',
  'model_type': 'gemma',
  'run_id': '9cbd1cb5-1ae2-42ca-9179-091185444d47'},
 {'hit_rate': 0.8551401869158879,
  'mrr': 0.6822429906542055,
  'search_type': 'combined_search',
  'model_type': 'gemma',
  'run_id': '43a1785a-f7e6-43e1-a8b8-d7c1b70a7dbf'},
 {'hit_rate': 0.6192307692307693,
  'mrr': 0.47185897435897445,
  'search_type': 'text_search',
  'model_type': 'llama',
  'run_id': 'a2a19a05-71e8-4902-b824-5380831a7407'},
 {'hit_rate': 0.8076923076923077,
  'mrr': 0.6324358974358973,
  'search_type': 'vector_search',
  'model_type': 'llama',
  'r

In [None]:
ground_truth = ground_truth_gemma_df.to_dict(orient='records')

print(evaluate(ground_truth, azure_text_search,'gemma_queries'))
print(evaluate(ground_truth, azure_vector_search,'gemma_queries'))
print(evaluate(ground_truth, azure_hybrid_search,'gemma_queries'))
print(evaluate(ground_truth, azure_combined_search,'gemma_queries'))


ground_truth = ground_truth_llama_df.to_dict(orient='records')
print(evaluate(ground_truth, azure_text_search,'llama3_queries'))
print(evaluate(ground_truth, azure_vector_search,'llama3_queries'))
print(evaluate(ground_truth, azure_hybrid_search,'llama3_queries'))
print(evaluate(ground_truth, azure_combined_search,'llama3_queries'))

## RAG flow

### LLM Config / Embedding setup

In [60]:
import os
from dotenv import load_dotenv
import openai

load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
AZURE_API_KEY_gpt_4_1_mini= os.getenv('AZURE_GPT_4_1_MINI_KEY')
AZURE_O3_MINI_KEY = os.getenv('AZURE_O3_MINI_KEY')
AZURE_GPT_5_MINI_KEY = os.getenv('AZURE_GPT_5_MINI_KEY')

In [104]:
#https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/create-resource?pivots=web-portal
from openai import AzureOpenAI
from IPython.display import display, Markdown

def build_prompt(query, search_results):
    prompt_template = """
    You're a therapist AI assistant focusing on responding to depression related user queries.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT:
    {context}
    """.strip()

    context = ""
    #print(search_results)
    for doc in search_results:
        context += f"""
        Question Title: {doc['question_title']}
        Question: {doc['question']}
        Answer: {doc['answer']}
        Therapist : {doc['therapist_info']}
        """ 

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def get_openai_response_content(response):
    """
    Safely extract content from Azure OpenAI response
    """
    try:
        content = response.choices[0].message.content
        return content
    except AttributeError as e:
        print(f"Error extracting content: {e}")
        return None
    
def openai_4_1_mini_temp_0_8(prompt):
    """
    Send the prompt to Azure OpenAI and get the model's response. This uses detailed context
    to improve the quality of the AI's answer.
    """
    endpoint = "https://padma-mhrfm98m-eastus2.cognitiveservices.azure.com/"
    model_name = "gpt-4.1-mini"
    deployment = "gpt-4.1-mini"

    api_version = "2024-12-01-preview"

    subscription_key = AZURE_API_KEY_gpt_4_1_mini
    client = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=subscription_key,
    )

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        max_completion_tokens=256,
        temperature=0.8,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        model=deployment
    )

    return get_openai_response_content(response)

def openai_4_1_mini_temp_0_5(prompt):
    """
    Send the prompt to Azure OpenAI and get the model's response. This uses detailed context
    to improve the quality of the AI's answer.
    """
    endpoint = "https://padma-mhrfm98m-eastus2.cognitiveservices.azure.com/"
    model_name = "gpt-4.1-mini"
    deployment = "gpt-4.1-mini"

    api_version = "2024-12-01-preview"

    subscription_key = AZURE_API_KEY_gpt_4_1_mini
    client = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=subscription_key,
    )

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        max_completion_tokens=256,
        temperature=0.5,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        model=deployment
    )

    return get_openai_response_content(response)


def openai_gpt_5_mini(prompt):
    """
    Send the prompt to Azure OpenAI and get the model's response. This uses detailed context
    to improve the quality of the AI's answer.
    """
    endpoint = "https://padma-mhrfm98m-eastus2.cognitiveservices.azure.com/"
    model_name = "gpt-5-mini"
    deployment = "gpt-5-mini"

    api_version = "2024-12-01-preview"

    subscription_key = AZURE_GPT_5_MINI_KEY
    client = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=subscription_key,
    )

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        max_completion_tokens=16384,
        model=deployment
    )

    return get_openai_response_content(response)
    
    

def azure_o3_mini(prompt):
    """
    Send the prompt to Azure o3-mini and get the model's response. This uses detailed context
    to improve the quality of the AI's answer.
    """
    endpoint = "https://padma-mhrfm98m-eastus2.cognitiveservices.azure.com/"
    model_name = "o3-mini"
    deployment = "o3-mini"

    subscription_key = AZURE_O3_MINI_KEY 
    api_version = "2024-12-01-preview"

    client = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=subscription_key,
    )

    response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    max_completion_tokens=1024,
    model=deployment
       
    )

    return response.choices[0].message.content

def llama3(prompt):
    """
    Send the prompt to llama3 from ollama and get the model's response. This uses detailed context
    to improve the quality of the AI's answer.
    """
    model = Ollama(model="llama3", temperature=0.7)

    # Generate response
    response = model.invoke(prompt, max_tokens=512) 
    print(response)
    print("*****")
    answer = re.sub(r'.*answer:', '', response).strip()
    answer = answer.replace('*','')
    
    return answer

def gemma2(prompt):
    """
    Send the prompt to gemma2 from ollama and get the model's response. This uses detailed context
    to improve the quality of the AI's answer.
    """
    model = Ollama(model="gemma2:2b", temperature=0.7)

    # Generate response
    response = model.invoke(prompt, max_tokens=512)
    response = response.replace('*','')
    return response


def rag(query, retrieval_search_function_name=minsearch_search, llm_name=llama3,boost = {}):
    if(retrieval_search_function_name==minsearch_search):
        search_results = retrieval_search_function_name(query,boost)
    else:
        search_results = retrieval_search_function_name(query)
    prompt = build_prompt(query, search_results)
    answer = llm_name(prompt)
    return answer

### Check the RAG Response

In [45]:
## Use azuresearch for retrival and Generate answer with openai_4_1_mini_temp_0_8
query = "How can i come out of depression?"
answer = rag(query,azure_hybrid_search, openai_4_1_mini_temp_0_8)
display(Markdown(answer))  # To handle * and \n 

It sounds like you're going through a really tough time, and wanting to come out of depression is an important first step. From the information provided, here are some ways that might help you begin to lift your mood:

- Take care of your body through exercise, eating healthy foods, and getting enough sleep. These basics can have a big impact on how you feel.
- Share your feelings with someone you trust, whether it's a family member, friend, or counselor. Talking about what you‚Äôre experiencing can be very helpful.
- Try to reduce social media exposure if you notice it worsens your mood.
- Consider seeking professional help by talking to a counselor or doctor who knows your history. They can help determine if you are experiencing depression and discuss options like counseling or therapy.
- Cognitive behavioral therapy (CBT) is a helpful approach that teaches tools to challenge and change negative thoughts like "I'm not worth knowing" or "I can't do anything right."
- Building a new routine incorporating activities like meditation, getting sunlight, fresh air, and hydration can also support your mental health.

Remember, depression can make motivation low, so starting small and being patient with yourself is important. If you feel stuck despite trying these things, reaching out for professional support is a strong and hopeful step. You're

In [46]:
## Use azuresearch for retrival and Generate answer with openai_4_1_mini_temp_0_5
query = "How can i come out of depression?"
answer = rag(query,azure_hybrid_search, openai_4_1_mini_temp_0_5)
display(Markdown(answer))  # To handle * and \n 

Coming out of depression can be challenging, but there are several steps and strategies that might help you start feeling better:

1. **Seek Professional Help:** If you find that sadness or low mood is persistent and you cannot find things to get excited about or look forward to, it may be time to reach out to a counselor, therapist, or doctor who knows your medical history. They can help determine if you are experiencing depression and guide you toward appropriate treatment options.

2. **Healthy Lifestyle Habits:** Engaging in regular exercise, eating healthy foods, getting adequate sleep, and drinking enough water can help improve your mood. These basic self-care activities support your overall well-being.

3. **Share Your Feelings:** Talk with someone you trust about how you are feeling. Sharing your thoughts and emotions can relieve some of the burden and help you feel less isolated.

4. **Limit Social Media Exposure:** Sometimes reducing time spent on social media can decrease negative feelings or comparisons that worsen depression.

5. **Consider Cognitive Behavioral Therapy (CBT):** CBT can help you recognize and challenge negative thoughts like "I am not worth knowing" or "I cannot do anything right." Learning tools to fight these thoughts can gradually silence them.

6. **Be Patient and Keep Trying:** Depression can

In [47]:
## Use azuresearch for retrival and Generate answer with azure_o3_mini
query = "How can i come out of depression?"
answer = rag(query,azure_hybrid_search, azure_o3_mini)
display(Markdown(answer))  # To handle * and \n 

It can be really tough feeling stuck, and I‚Äôm sorry you‚Äôre going through this. While every person‚Äôs journey out of depression is unique, there are several approaches that others in similar situations have found helpful:

‚Ä¢ Sometimes, when negative moods or thoughts persist, one helpful step is to reach out for professional support. Talking with a counselor or doctor who knows your history can help you better understand your feelings and work on strategies to change them. Therapy‚Äîespecially cognitive behavior therapy‚Äîcan provide tools to quiet the negative thoughts that tell you you‚Äôre not worth it.

‚Ä¢ Many find that making small, manageable changes can start shifting feelings over time. Focusing on aspects of your life that you can control‚Äîlike getting into a regular routine, paying attention to your nutrition, exercise, sleep, and even reducing time on social media‚Äîcan slowly begin to lift your mood.

‚Ä¢ Sometimes depression feels like a part of who you are because you‚Äôve been in that ‚Äúmode‚Äù for a long time. In these cases, gently exploring what it might be like to embrace new, more positive experiences, even if they feel unfamiliar at first, can be a step toward change.

‚Ä¢ Lastly, know that it might take time to find the right mix of support, whether that‚Äôs a specific kind of therapy, medication, or lifestyle adjustments. It‚Äôs completely okay to try different options until you find what works best for you

## RAG Answer Generation

### Generate response with AzureSearch retrival, OpenAI, Azure o3 mini

In [52]:
def evaluate_llms_with_mlflow():
    """
    LLM evaluation using basic RAG function
    """
    
    with mlflow.start_run(run_name="LLM_Comparison_Simple"):
        mlflow.log_param("experiment_type", "llm_comparison_simple")
        
        llm_configs = [
            (openai_4_1_mini_temp_0_8, "openai_4_1_mini_temp_0_8"),
            (openai_4_1_mini_temp_0_5, "openai_4_1_mini_temp_0_5"), 
            (azure_o3_mini, "azure_o3_mini")
        ]
        
        all_results = {}
        
        for llm_func, llm_name in llm_configs:
            print(f"\nTesting {llm_name}...")
            
            try:
                sample_questions = quest_ans_df.head(5)  # Smaller sample for testing
                response_times = []
                response_lengths = []
                successful = 0
                
                for idx, row in sample_questions.iterrows():
                    start_time = time.time()
                    
                    try:
                        # Use existing RAG function
                        response = rag(row['question'], azure_hybrid_search, llm_func)
                        end_time = time.time()
                        
                        if response and len(response.strip()) > 10: 
                            response_times.append(end_time - start_time)
                            response_lengths.append(len(response))
                            successful += 1
                        else:
                            response_times.append(end_time - start_time)
                            response_lengths.append(0)
                            
                    except Exception as e:
                        print(f"Error with {llm_name} on question {idx}: {e}")
                        response_times.append(0)
                        response_lengths.append(0)
                
                # Calculate metrics
                success_rate = successful / len(sample_questions)
                avg_time = np.mean(response_times) if response_times else 0
                avg_length = np.mean(response_lengths) if response_lengths else 0
                
                # Log to MLflow
                with mlflow.start_run(run_name=f"{llm_name}_simple", nested=True):
                    mlflow.log_metrics({
                        'success_rate': success_rate,
                        'avg_processing_time': avg_time,
                        'avg_response_length': avg_length
                    })
                    mlflow.log_param('llm_model', llm_name)
                
                all_results[llm_name] = {
                    'success_rate': success_rate,
                    'avg_processing_time': avg_time,
                    'avg_response_length': avg_length
                }
                
                print(f"{llm_name}: Success rate: {success_rate:.2%}")
                
            except Exception as e:
                print(f"{llm_name} failed: {e}")
                all_results[llm_name] = {'error': str(e)}
        
        return all_results

# Run the evaluation
print(" Starting LLM Evaluation...")
results = evaluate_llms_with_mlflow()
results

 Starting LLM Evaluation...

Testing openai_4_1_mini_temp_0_8...
üèÉ View run openai_4_1_mini_temp_0_8_simple at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/264a4e87-d985-4954-920f-8a697fb39056
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525
openai_4_1_mini_temp_0_8: Success rate: 100.00%

Testing openai_4_1_mini_temp_0_5...
üèÉ View run openai_4_1_mini_temp_0_5_simple at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-

{'openai_4_1_mini_temp_0_8': {'success_rate': 1.0,
  'avg_processing_time': 9.245586109161376,
  'avg_response_length': 922.8},
 'openai_4_1_mini_temp_0_5': {'success_rate': 1.0,
  'avg_processing_time': 9.590905046463012,
  'avg_response_length': 950.4},
 'azure_o3_mini': {'success_rate': 1.0,
  'avg_processing_time': 7.807274246215821,
  'avg_response_length': 993.6}}

In [55]:
#We take first 40 question and get response from llm
quest_ans_df = quest_ans_df[0:40]
quest_ans_df

Unnamed: 0,question_id,question_title,question,true_answer
0,0,Do I have too many issues for counseling,I have so many issues to address. I have a his...,It is very common for people to have multiple ...
1,1,My apartment manager won't let me keep an emot...,I have been diagnosed with general anxiety and...,"This can be a difficult situation. Typically, ..."
2,2,I feel like my mother doesn't support me,My mother is combative with me when I say I do...,Do you live with your mom and have constant in...
3,3,Why do I feel like I don't belong anywhere,There are many people willing to lovingly prov...,I truly understand what you are saying. I want...
4,4,How can I help my girlfriend,My girlfriend just quit drinking and she becam...,You're probably not going to like my answer.Yo...
5,5,Can i learn to be happy alone,I'm dealing with an illness that will never go...,The power of acceptance is key! Changing your ...
6,6,How can I deal with my posttraumatic stress di...,"I feel angry, anxious, and depressed. The PTSD...",I second the suggestion to find a therapist wh...
7,7,How do I know if I have depression,I had a very troubled up bringing and I'm curr...,Thanks for reaching out with this important qu...
8,8,How do I make new friends,"In the past year, two of my best and only clos...",I am sending your loving vibes as you weather ...
9,10,How do I stop feeling empty,I don't know how else to explain it. All I can...,Why do I feel empty?Feelings of emptiness‚Äîa la...


In [56]:
'''
Use azuresearch for retrival on each row and generate the answer with openai_4_1_mini_temp_0_8,
openai_4_1_mini_temp_0_5 and azure_o3_mini
'''
tqdm.pandas()  # To use progress bar with pandas

quest_ans_df['openai_4_1_mini_temp_0_8'] = quest_ans_df.progress_apply(
    lambda row: rag(row['question'], azure_hybrid_search, openai_4_1_mini_temp_0_8),  axis=1)

quest_ans_df['openai_4_1_mini_temp_0_5'] = quest_ans_df.progress_apply(
    lambda row: rag(row['question'], azure_hybrid_search, openai_4_1_mini_temp_0_5),  axis=1)


quest_ans_df['azure_o3_mini'] = quest_ans_df.progress_apply(
    lambda row: rag(row['question'], azure_hybrid_search, azure_o3_mini),  axis=1)

quest_ans_df_doc = quest_ans_df.to_dict(orient = 'records')

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [05:50<00:00,  8.76s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quest_ans_df['openai_4_1_mini_temp_0_8'] = quest_ans_df.progress_apply(
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [06:22<00:00,  9.57s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,c

## RAG Evaluation

### LLM as judge

In [97]:
!pip install evaluate rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting absl-py (from rouge_score)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting nltk (from rouge_score)
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 20.2 MB/s eta 0:00:00
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py): started
  Building wheel for rouge_score (setup.py): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24972 sha256=74a320109f26ea556654fc51041b662a8b60b436650acda76fccde47d7956d3d
  Stored in directory: c:\users\padma\appdata\l

In [108]:
import evaluate
from rouge_score import rouge_scorer

def comprehensive_evaluation(input_doc, answer_col, true_answer_col='true_answer'):
    """
    Comprehensive evaluation using 4 methods:
    1. Semantic Similarity
    2. Keyword Overlap  
    3. ROUGE Score
    4. LLM as Judge
    """
    semantic_scores = []
    keyword_scores = []
    rouge_scores = []
    llm_judgments = []
    
    # Initialize ROUGE scorer
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    for rec in tqdm(input_doc, desc=f"Evaluating {answer_col}"):
        generated_answer = str(rec.get(answer_col, ''))
        true_answer = str(rec.get(true_answer_col, ''))
        question = str(rec.get('question', ''))
        
        # 1. Semantic Similarity
        semantic_score = calculate_semantic_similarity(generated_answer, true_answer)
        semantic_scores.append(semantic_score)
        
        # 2. Keyword Overlap
        keyword_score = calculate_keyword_overlap(generated_answer, true_answer)
        keyword_scores.append(keyword_score)
        
        # 3. ROUGE Score
        rouge_result = rouge.score(true_answer, generated_answer)
        rouge1 = rouge_result['rouge1'].fmeasure
        rouge_scores.append(rouge1)
        
        # 4. LLM as Judge (sampled to avoid rate limits)
        if len(llm_judgments) < 50:  # Sample first 20 for LLM evaluation
            llm_judgment = llm_judge_relevance(question, generated_answer, true_answer)
            llm_judgments.append(llm_judgment)
        else:
            llm_judgments.append("NOT_EVALUATED")
    
    return {
        'semantic_scores': semantic_scores,
        'keyword_scores': keyword_scores, 
        'rouge_scores': rouge_scores,
        'llm_judgments': llm_judgments
    }

def calculate_semantic_similarity(text1, text2):
    """Calculate semantic similarity using embeddings"""
    if not text1 or not text2:
        return 0.0
    try:
        emb1 = model.encode(text1)
        emb2 = model.encode(text2)
        similarity = cosine_similarity([emb1], [emb2])[0][0]
        return float(similarity)
    except:
        return 0.0

def calculate_keyword_overlap(text1, text2):
    """Calculate keyword overlap between texts"""
    if not text1 or not text2:
        return 0.0
    
    # Simple word overlap
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    
    if not words1 or not words2:
        return 0.0
    
    overlap = len(words1.intersection(words2))
    return overlap / max(len(words1), len(words2))

def llm_judge_relevance(question, generated_answer, true_answer):
    """
    Use LLM as judge to evaluate relevance
    """
    try:
        prompt = f"""
        Evaluate if the generated answer properly addresses the question and matches the reference answer.
        
        QUESTION: {question}
        REFERENCE ANSWER: {true_answer}
        GENERATED ANSWER: {generated_answer}
        
        Is the generated answer relevant? Answer only: RELEVANT, PARTLY_RELEVANT, or NON_RELEVANT
        """
        
        response = openai_gpt_5_mini(prompt)
        response = response.strip().upper()
        
        if "RELEVANT" in response:
            return "RELEVANT"
        elif "PARTLY" in response:
            return "PARTLY_RELEVANT"
        else:
            return "NON_RELEVANT"
            
    except Exception as e:
        return f"ERROR: {str(e)}"

def evaluate_all_llms_comprehensive():
    """
    Comprehensive evaluation of all 3 LLMs using 4 methods
    """
    setup_mlflow_experiment()
    
    llm_columns = ['openai_4_1_mini_temp_0_8', 'openai_4_1_mini_temp_0_5', 'azure_o3_mini']
    all_results = {}
    
    with mlflow.start_run(run_name="Comprehensive_LLM_Evaluation"):
        mlflow.log_param("evaluation_methods", "semantic,keyword,rouge,llm_judge")
        mlflow.log_param("llm_configurations", llm_columns)
        
        print(" COMPREHENSIVE LLM EVALUATION")
        print("=" * 50)
        
        for col in llm_columns:
            print(f"\nEvaluating {col}...")
            
            # Run comprehensive evaluation
            results = comprehensive_evaluation(quest_ans_df_doc, col)
            all_results[col] = results
            
            # Calculate aggregate metrics
            avg_semantic = np.mean(results['semantic_scores'])
            avg_keyword = np.mean(results['keyword_scores'])
            avg_rouge = np.mean(results['rouge_scores'])
            
            # LLM judgment counts
            llm_judgments = [j for j in results['llm_judgments'] if j in ['RELEVANT', 'PARTLY_RELEVANT', 'NON_RELEVANT']]
            if llm_judgments:
                llm_relevant_rate = llm_judgments.count('RELEVANT') / len(llm_judgments)
            else:
                llm_relevant_rate = 0
            
            # Log to MLflow
            mlflow.log_metrics({
                f"{col}_avg_semantic": avg_semantic,
                f"{col}_avg_keyword": avg_keyword,
                f"{col}_avg_rouge": avg_rouge,
                f"{col}_llm_relevant_rate": llm_relevant_rate
            })
            
            # Add to dataframe
            quest_ans_df[f'{col}_semantic'] = results['semantic_scores']
            quest_ans_df[f'{col}_keyword'] = results['keyword_scores']
            quest_ans_df[f'{col}_rouge'] = results['rouge_scores']
            quest_ans_df[f'{col}_llm_judge'] = results['llm_judgments']
            
            print(f" {col} Results:")
            print(f"   Semantic Similarity: {avg_semantic:.3f}")
            print(f"   Keyword Overlap: {avg_keyword:.3f}")
            print(f"   ROUGE-1 Score: {avg_rouge:.3f}")
            print(f"   LLM Judge Relevant: {llm_relevant_rate:.1%}")
        
        # Generate comparison summary
        comparison_summary(all_results)
        
        return all_results

def comparison_summary(all_results):
    """Generate comparison summary of all LLMs"""
    print(f"\n COMPARISON SUMMARY")
    print("=" * 60)
    
    summary_data = []
    for col, results in all_results.items():
        avg_semantic = np.mean(results['semantic_scores'])
        avg_keyword = np.mean(results['keyword_scores']) 
        avg_rouge = np.mean(results['rouge_scores'])
        
        llm_judgments = [j for j in results['llm_judgments'] if j in ['RELEVANT', 'PARTLY_RELEVANT', 'NON_RELEVANT']]
        llm_relevant = llm_judgments.count('RELEVANT') / len(llm_judgments) if llm_judgments else 0
        
        summary_data.append({
            'LLM': col,
            'Semantic': avg_semantic,
            'Keyword': avg_keyword,
            'ROUGE': avg_rouge,
            'LLM_Judge': llm_relevant
        })
        
        print(f"{col:25} | Semantic: {avg_semantic:.3f} | Keyword: {avg_keyword:.3f} | ROUGE: {avg_rouge:.3f} | LLM: {llm_relevant:.1%}")
    
    # Find best by each metric
    df_summary = pd.DataFrame(summary_data)
    
    best_semantic = df_summary.loc[df_summary['Semantic'].idxmax(), 'LLM']
    best_rouge = df_summary.loc[df_summary['ROUGE'].idxmax(), 'LLM']
    best_llm_judge = df_summary.loc[df_summary['LLM_Judge'].idxmax(), 'LLM']
    
    print(f"\n  BEST BY METRIC:")
    print(f"   Semantic Similarity: {best_semantic}")
    print(f"   ROUGE Score: {best_rouge}") 
    print(f"   LLM Judge: {best_llm_judge}")
    
    return df_summary

In [110]:
# Run comprehensive evaluation
import warnings
warnings.filterwarnings('ignore')

# Suppress specific warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)


print("Starting Comprehensive Evaluation...")
all_results = evaluate_all_llms_comprehensive()

# Show detailed results for each LLM
for col in ['openai_4_1_mini_temp_0_8', 'openai_4_1_mini_temp_0_5', 'azure_o3_mini']:
    print(f"\n {col} Detailed Results:")
    print(f"  Semantic Similarity Range: {np.min(quest_ans_df[f'{col}_semantic']):.3f} - {np.max(quest_ans_df[f'{col}_semantic']):.3f}")
    print(f"  Keyword Overlap Range: {np.min(quest_ans_df[f'{col}_keyword']):.3f} - {np.max(quest_ans_df[f'{col}_keyword']):.3f}")
    print(f"  ROUGE Score Range: {np.min(quest_ans_df[f'{col}_rouge']):.3f} - {np.max(quest_ans_df[f'{col}_rouge']):.3f}")
    
    llm_judges = quest_ans_df[f'{col}_llm_judge'].value_counts()
    print(f"  LLM Judgments: {llm_judges.to_dict()}")

Starting Comprehensive Evaluation...




Azure ML + MLflow Setup Complete:
   Workspace: RAG_demo
   Subscription: 81773e48-99ce-48da-a59b-605785d14817
   Resource Group: ai-grp
   MLflow Tracking URI: azureml://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo
MLflow experiment setup complete: RAG-Chatbot-Evaluation
 COMPREHENSIVE LLM EVALUATION

Evaluating openai_4_1_mini_temp_0_8...


Evaluating openai_4_1_mini_temp_0_8:   0%|          | 0/40 [00:00<?, ?it/s]

 openai_4_1_mini_temp_0_8 Results:
   Semantic Similarity: 0.787
   Keyword Overlap: 0.309
   ROUGE-1 Score: 0.453
   LLM Judge Relevant: 100.0%

Evaluating openai_4_1_mini_temp_0_5...


Evaluating openai_4_1_mini_temp_0_5:   0%|          | 0/40 [00:00<?, ?it/s]

 openai_4_1_mini_temp_0_5 Results:
   Semantic Similarity: 0.789
   Keyword Overlap: 0.311
   ROUGE-1 Score: 0.463
   LLM Judge Relevant: 100.0%

Evaluating azure_o3_mini...


Evaluating azure_o3_mini:   0%|          | 0/40 [00:00<?, ?it/s]

 azure_o3_mini Results:
   Semantic Similarity: 0.729
   Keyword Overlap: 0.270
   ROUGE-1 Score: 0.414
   LLM Judge Relevant: 100.0%

 COMPARISON SUMMARY
openai_4_1_mini_temp_0_8  | Semantic: 0.787 | Keyword: 0.309 | ROUGE: 0.453 | LLM: 100.0%
openai_4_1_mini_temp_0_5  | Semantic: 0.789 | Keyword: 0.311 | ROUGE: 0.463 | LLM: 100.0%
azure_o3_mini             | Semantic: 0.729 | Keyword: 0.270 | ROUGE: 0.414 | LLM: 100.0%

  BEST BY METRIC:
   Semantic Similarity: openai_4_1_mini_temp_0_5
   ROUGE Score: openai_4_1_mini_temp_0_5
   LLM Judge: openai_4_1_mini_temp_0_8
üèÉ View run Comprehensive_LLM_Evaluation at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce-48da-a59b-605785d14817/resourceGroups/ai-grp/providers/Microsoft.MachineLearningServices/workspaces/RAG_demo/#/experiments/bad663e6-8388-4137-ba00-3f6a362c7525/runs/72e6ddd5-a50a-470f-bde2-5365eaf3e4ac
üß™ View experiment at: https://southeastasia.api.azureml.ms/mlflow/v2.0/subscriptions/81773e48-99ce

In [111]:
def log_complete_evaluation_summary():
    """
    Log complete evaluation summary including all metrics and comparisons
    """
    with mlflow.start_run(run_name="Complete_RAG_Evaluation"):
        # Log basic parameters
        mlflow.log_params({
            "evaluation_timestamp": datetime.now().isoformat(),
            "total_questions": len(quest_ans_df),
            "llm_configurations": "openai_4_1_mini_temp_0_8, openai_4_1_mini_temp_0_5, azure_o3_mini",
            "evaluation_framework": "multi_method_rag_evaluation"
        })
        
        summary_data = []
        
        for col in ['openai_4_1_mini_temp_0_8', 'openai_4_1_mini_temp_0_5', 'azure_o3_mini']:
            # Calculate all metrics
            avg_semantic = quest_ans_df[f'{col}_semantic'].mean()
            avg_keyword = quest_ans_df[f'{col}_keyword'].mean()
            avg_rouge = quest_ans_df[f'{col}_rouge'].mean()
            
            # Response length metrics
            response_lengths = quest_ans_df[col].str.len()
            avg_response_length = response_lengths.mean()
            
            # LLM Judge results
            llm_judges = quest_ans_df[f'{col}_llm_judge'].value_counts()
            llm_relevant = llm_judges.get('RELEVANT', 0)
            llm_partly = llm_judges.get('PARTLY_RELEVANT', 0)
            llm_non = llm_judges.get('NON_RELEVANT', 0)
            total_llm_judged = llm_relevant + llm_partly + llm_non
            llm_relevant_rate = llm_relevant / total_llm_judged if total_llm_judged > 0 else 0
            
            # Create comprehensive row
            row = {
                'LLM': col,
                'Avg_Semantic': avg_semantic,
                'Avg_Keyword': avg_keyword,
                'Avg_ROUGE': avg_rouge,
                'Avg_Response_Length': avg_response_length,
                'LLM_Relevant_Count': llm_relevant,
                'LLM_Relevant_Rate': llm_relevant_rate,
                'LLM_Partly_Count': llm_partly,
                'LLM_Non_Count': llm_non
            }
            summary_data.append(row)
            
            # Log detailed metrics for each LLM
            mlflow.log_metrics({
                f"{col}_semantic_score": avg_semantic,
                f"{col}_keyword_score": avg_keyword,
                f"{col}_rouge_score": avg_rouge,
                f"{col}_response_length": avg_response_length,
                f"{col}_llm_relevant_rate": llm_relevant_rate,
                f"{col}_llm_relevant_count": llm_relevant
            })
        
        # Create and log summary table
        summary_df = pd.DataFrame(summary_data)
        mlflow.log_table(summary_df, "complete_evaluation_summary.json")
        
        # Calculate and log comparison metrics
        best_overall = summary_df.loc[summary_df['LLM_Relevant_Rate'].idxmax()]
        worst_overall = summary_df.loc[summary_df['LLM_Relevant_Rate'].idxmin()]
        
        mlflow.log_metrics({
            "best_overall_score": best_overall['LLM_Relevant_Rate'],
            "worst_overall_score": worst_overall['LLM_Relevant_Rate'],
            "avg_semantic_across_llms": summary_df['Avg_Semantic'].mean(),
            "avg_rouge_across_llms": summary_df['Avg_ROUGE'].mean()
        })
        
        mlflow.log_params({
            "best_overall_llm": best_overall['LLM'],
            "worst_overall_llm": worst_overall['LLM'],
            "best_semantic_llm": summary_df.loc[summary_df['Avg_Semantic'].idxmax(), 'LLM'],
            "best_rouge_llm": summary_df.loc[summary_df['Avg_ROUGE'].idxmax(), 'LLM']
        })
        
        # Print comprehensive summary
        print("\n" + "="*80)
        print(" COMPLETE RAG EVALUATION SUMMARY (Logged to MLflow)")
        print("="*80)
        print(summary_df.round(3))
        
        print(f"\n PERFORMANCE HIGHLIGHTS:")
        print(f"   Best Overall: {best_overall['LLM']} ({best_overall['LLM_Relevant_Rate']:.1%} relevant)")
        print(f"   Best Semantic: {summary_df.loc[summary_df['Avg_Semantic'].idxmax(), 'LLM']} ({summary_df['Avg_Semantic'].max():.3f})")
        print(f"   Best ROUGE: {summary_df.loc[summary_df['Avg_ROUGE'].idxmax(), 'LLM']} ({summary_df['Avg_ROUGE'].max():.3f})")
        print(f"   Average Response Length: {summary_df['Avg_Response_Length'].mean():.0f} chars")
        
        return summary_df

# Run complete evaluation summary
complete_summary = log_complete_evaluation_summary()


 COMPLETE RAG EVALUATION SUMMARY (Logged to MLflow)
                        LLM  Avg_Semantic  Avg_Keyword  Avg_ROUGE  \
0  openai_4_1_mini_temp_0_8         0.787        0.309      0.453   
1  openai_4_1_mini_temp_0_5         0.789        0.311      0.463   
2             azure_o3_mini         0.729        0.270      0.414   

   Avg_Response_Length  LLM_Relevant_Count  LLM_Relevant_Rate  \
0              1071.95                  40                1.0   
1              1108.55                  40                1.0   
2              1204.55                  40                1.0   

   LLM_Partly_Count  LLM_Non_Count  
0                 0              0  
1                 0              0  
2                 0              0  

 PERFORMANCE HIGHLIGHTS:
   Best Overall: openai_4_1_mini_temp_0_8 (100.0% relevant)
   Best Semantic: openai_4_1_mini_temp_0_5 (0.789)
   Best ROUGE: openai_4_1_mini_temp_0_5 (0.463)
   Average Response Length: 1128 chars
üèÉ View run Complete_RAG_Evaluation

In [112]:
complete_summary

Unnamed: 0,LLM,Avg_Semantic,Avg_Keyword,Avg_ROUGE,Avg_Response_Length,LLM_Relevant_Count,LLM_Relevant_Rate,LLM_Partly_Count,LLM_Non_Count
0,openai_4_1_mini_temp_0_8,0.787033,0.308943,0.452635,1071.95,40,1.0,0,0
1,openai_4_1_mini_temp_0_5,0.789088,0.310722,0.4628,1108.55,40,1.0,0,0
2,azure_o3_mini,0.728575,0.269763,0.41377,1204.55,40,1.0,0,0
