# LangChain Prompt Engineering Experiment

## Overview

This notebook demonstrates the LangChain-based symptom diagnosis classification approach:

- **Framework**: LangChain with LCEL (LangChain Expression Language)
- **Approach**: Zero-shot prompt engineering (no training required)
- **Model**: Uses prompt templates with structured output

## Key Differences from DSPy

Unlike DSPy, which requires training/optimization:
- LangChain uses hardcoded prompts (prompt engineering)
- No optimizer selection needed
- "Tuning" validates the prompt on train/val sets
- Evaluation recreates the chain from hardcoded prompts

## Experiment Configuration

- **Project**: 9-langchain
- **Experiment**: initial-pipeline
- **LLM Model**: ollama/qwen3:0.6b
- **Dataset**: 15 train examples, 20 validation examples, 10 test examples

## MLflow Tracking

Experiments are logged to MLflow with:
- Metrics (train/validation accuracy)
- Parameters (model config, prompt details)
- Artifacts (prediction samples)
- Model registration in MLflow registry

In [None]:
"""Import required modules for LangChain tuning experiments."""
# NOTE(review): `pd` is not referenced in any of the cells visible in this
# notebook export - confirm whether the import is still needed.
import pandas as pd

# Command/request pair used to run the tuning step.
from symptom_diagnosis_explorer.commands.classify.tune import (
    TuneCommand,
    TuneRequest,
)

# Command/request pair used to score a registered model on a data split.
from symptom_diagnosis_explorer.commands.classify.evaluate import (
    EvaluateCommand,
    EvaluateRequest,
)

# Enum used to select the framework (FrameworkType.LANGCHAIN in this notebook).
from symptom_diagnosis_explorer.models.model_development import FrameworkType

# Printed only after every import above has resolved.
print("Imports successful")

In [None]:
"""Configuration for LangChain experiment."""

# Experiment configuration.
# FULL_EXPERIMENT_NAME is derived from PROJECT and EXPERIMENT_NAME, so changing
# either one updates the experiment path used by the later cells.
PROJECT = "9-langchain"
EXPERIMENT_NAME = "initial-pipeline"
FULL_EXPERIMENT_NAME = f"/symptom-diagnosis-explorer/{PROJECT}/{EXPERIMENT_NAME}"

# MLflow configuration: address of the tracking server passed to both the
# tune and evaluate requests.
MLFLOW_TRACKING_URI = "http://localhost:5001"

# Model configuration: LM_MODEL is the LLM identifier handed to the tune
# request; MODEL_NAME is the name used for both tuning and evaluation.
LM_MODEL = "ollama/qwen3:0.6b"
MODEL_NAME = "symptom-classifier-langchain"

# Dataset configuration: number of examples requested per split.
TRAIN_SIZE = 15
VAL_SIZE = 20
TEST_SIZE = 10

# Echo the effective settings so they are recorded in the cell output.
print("Configuration set")
print(f"Experiment: {FULL_EXPERIMENT_NAME}")
print(f"MLflow URI: {MLFLOW_TRACKING_URI}")
print(f"LLM: {LM_MODEL}")
print(f"Dataset: {TRAIN_SIZE} train, {VAL_SIZE} validation, {TEST_SIZE} test")
# Plain string: this line has no placeholders, so it needs no f-prefix.
print("Framework: LangChain (prompt engineering, no training required)")

In [None]:
"""Run LangChain prompt engineering experiment."""

# Banner framing the output of this experiment run.
rule = "=" * 80
print(rule, "EXPERIMENT: LangChain Prompt Engineering", rule, sep="\n")

# Bundle the notebook-level settings into a single LangChain tune request.
tune_request = TuneRequest(
    framework=FrameworkType.LANGCHAIN,
    lm_model=LM_MODEL,
    model_name=MODEL_NAME,
    train_size=TRAIN_SIZE,
    val_size=VAL_SIZE,
    experiment_project=PROJECT,
    experiment_name=FULL_EXPERIMENT_NAME,
    mlflow_tracking_uri=MLFLOW_TRACKING_URI,
)

# Run the tune command; per the notebook overview this validates the
# hardcoded prompt on the train/val sets rather than training anything.
print(
    "\nStarting LangChain prompt validation...",
    "Note: LangChain doesn't require training - this validates the prompt.",
    sep="\n",
)
tune_command = TuneCommand(tune_request)
tune_response = tune_command.execute()

print("\nLangChain prompt validation complete!")

In [None]:
"""Display LangChain tuning results."""

print("\nLANGCHAIN PROMPT ENGINEERING RESULTS")
print("-" * 80)

# Accuracy figures and example counts carried on the tune response.
print("\nMetrics:")
metrics = tune_response.metrics
print(f"  Train Accuracy:      {metrics.train_accuracy:.4f}")
print(f"  Validation Accuracy: {metrics.validation_accuracy:.4f}")
print(f"  Train Examples:      {metrics.num_train_examples}")
print(f"  Validation Examples: {metrics.num_val_examples}")

# Registry name/version of the model plus the run id of this tuning run.
print("\nModel Registry:")
model_info = tune_response.model_info
print(f"  Name:    {model_info.name}")
print(f"  Version: {model_info.version}")
print(f"  Run ID:  {tune_response.run_id}")

print(
    "\nNote: LangChain models use hardcoded prompts, not learned parameters.",
    "The model registration stores metadata and allows version tracking.",
    sep="\n",
)

In [None]:
"""Evaluate LangChain model on test set."""

heavy_rule = "=" * 80
light_rule = "-" * 80

print(f"\n{heavy_rule}", "TEST SET EVALUATION", heavy_rule, sep="\n")

# Recap which model is being scored and how it did on validation.
print(f"\nEvaluating LangChain model: {MODEL_NAME}")
print(f"  Validation Accuracy: {tune_response.metrics.validation_accuracy:.4f}")

print(f"\nRunning evaluation on first {TEST_SIZE} test examples...")

# Describe the evaluation: same model name and experiment as the tuning run,
# scored on the "test" split and limited to TEST_SIZE examples.
eval_request = EvaluateRequest(
    framework=FrameworkType.LANGCHAIN,
    model_name=MODEL_NAME,
    model_version=None,  # Use latest version
    split="test",
    eval_size=TEST_SIZE,
    experiment_project=PROJECT,
    experiment_name=FULL_EXPERIMENT_NAME,
    mlflow_tracking_uri=MLFLOW_TRACKING_URI,
)

# NOTE(review): the request is passed to execute() here, whereas the tuning
# cell passes its request to the command constructor - confirm both call
# shapes match the command classes' actual signatures.
eval_command = EvaluateCommand()
eval_response = eval_command.execute(eval_request)

# Summarise the evaluation response.
print(f"\n{light_rule}", "TEST SET RESULTS", light_rule, sep="\n")
print("\nMetrics:")
print(f"  Test Accuracy:  {eval_response.accuracy:.4f}")
print(f"  Test Examples:  {eval_response.num_examples}")
print(f"  Split:          {eval_response.split}")
print(f"  Run ID:         {eval_response.run_id}")

print(
    f"\n{heavy_rule}",
    "Evaluation complete! Check MLflow for detailed prediction artifacts.",
    heavy_rule,
    sep="\n",
)