In [None]:
!pip install git+https://github.com/ibm-granite-community/utils \
    "langchain_community<0.3.0" \
    replicate

In [None]:
!pip install mlflow
!pip install replicate
!pip install diskcache
!pip install langchain-community ibm-granite-community
!pip install textstat
!pip install --upgrade pandas numpy

In [3]:
import mlflow
import pandas as pd
from ibm_granite_community.notebook_utils import get_env_var
from langchain_community.llms import Replicate
from typing import Dict, List, Any, Optional
import json
import re

# Initialize Product inventory database
PRODUCTS = [
    {"id": "P010", "name": "Wireless Headphones", "category": "Electronics", "description": "High-quality wireless headphones with noise cancellation", "price": 149.99, "stock": 75, "delivery": {"shipping_cost": 4.99, "free_shipping_threshold": 100, "estimated_days": "2-3"}},
    {"id": "P022", "name": "Running Shoes", "category": "Sports", "description": "Lightweight running shoes with cushioned soles", "price": 89.99, "stock": 120, "delivery": {"shipping_cost": 5.99, "free_shipping_threshold": 100, "estimated_days": "3-5"}},
    {"id": "P035", "name": "Coffee Maker", "category": "Kitchen", "description": "Programmable coffee maker with 12-cup capacity", "price": 79.99, "stock": 45, "delivery": {"shipping_cost": 7.99, "free_shipping_threshold": 100, "estimated_days": "2-4"}},
    {"id": "P012", "name": "Laptop Stand", "category": "Electronics", "description": "Adjustable aluminum laptop stand with cooling", "price": 29.99, "stock": 150, "delivery": {"shipping_cost": 3.99, "free_shipping_threshold": 50, "estimated_days": "1-2"}},
    {"id": "P007", "name": "Yoga Mat", "category": "Sports", "description": "Non-slip yoga mat with carrying strap", "price": 24.99, "stock": 95, "delivery": {"shipping_cost": 4.99, "free_shipping_threshold": 50, "estimated_days": "2-3"}},
    {"id": "P045", "name": "Wireless Mouse", "category": "Electronics", "description": "Ergonomic Wireless Mouse with Rechargable Battery", "price": 49.99, "stock": 18, "delivery": {"shipping_cost": 4.99, "free_shipping_threshold": 50, "estimated_days": "2-3"}},
    {"id": "P050", "name": "Smart Watch", "category": "Electronics", "description": "Fitness tracking smart watch with heart rate monitor", "price": 199.99, "stock": 60, "delivery": {"shipping_cost": 6.99, "free_shipping_threshold": 150, "estimated_days": "2-4"}},
    {"id": "P060", "name": "Bluetooth Speaker", "category": "Electronics", "description": "Portable Bluetooth speaker with deep bass", "price": 59.99, "stock": 80, "delivery": {"shipping_cost": 4.99, "free_shipping_threshold": 100, "estimated_days": "2-3"}},
    {"id": "P070", "name": "Electric Kettle", "category": "Kitchen", "description": "1.7L electric kettle with auto shut-off", "price": 39.99, "stock": 110, "delivery": {"shipping_cost": 5.99, "free_shipping_threshold": 80, "estimated_days": "2-3"}},
    {"id": "P080", "name": "Backpack", "category": "Accessories", "description": "Water-resistant backpack with laptop compartment", "price": 54.99, "stock": 130, "delivery": {"shipping_cost": 3.99, "free_shipping_threshold": 75, "estimated_days": "2-4"}},
    {"id": "P090", "name": "Desk Lamp", "category": "Home", "description": "LED desk lamp with adjustable brightness", "price": 34.99, "stock": 90, "delivery": {"shipping_cost": 2.99, "free_shipping_threshold": 50, "estimated_days": "1-2"}},
    {"id": "P100", "name": "Gaming Keyboard", "category": "Electronics", "description": "Mechanical gaming keyboard with RGB lighting", "price": 89.99, "stock": 40, "delivery": {"shipping_cost": 5.99, "free_shipping_threshold": 100, "estimated_days": "2-3"}},
    {"id": "P110", "name": "Water Bottle", "category": "Sports", "description": "Insulated stainless steel water bottle", "price": 19.99, "stock": 200, "delivery": {"shipping_cost": 2.99, "free_shipping_threshold": 40, "estimated_days": "1-2"}},
    {"id": "P120", "name": "Sunglasses", "category": "Accessories", "description": "UV-protection polarized sunglasses", "price": 29.99, "stock": 85, "delivery": {"shipping_cost": 3.99, "free_shipping_threshold": 60, "estimated_days": "2-3"}},
    {"id": "P130", "name": "Fitness Tracker", "category": "Electronics", "description": "Activity and sleep tracking wristband", "price": 69.99, "stock": 55, "delivery": {"shipping_cost": 4.99, "free_shipping_threshold": 100, "estimated_days": "2-3"}},
    {"id": "P140", "name": "Portable Charger", "category": "Electronics", "description": "10000mAh portable power bank", "price": 39.99, "stock": 140, "delivery": {"shipping_cost": 3.99, "free_shipping_threshold": 60, "estimated_days": "2-3"}},
]

In [18]:
# Tool registry and functions
TOOL_REGISTRY = {}

def tool(func):
    """Decorator to register a function as a tool."""
    TOOL_REGISTRY[func.__name__] = func
    return func

def _find_product(product_id: str) -> Optional[Dict]:
    for product in PRODUCTS:
        if product['id'] == product_id:
            return product
    return None

def _search_products(query: str) -> List[Dict]:
    """Search products by name, description, or category with improved matching"""
    query_lower = query.lower()
    matches = []

    # Split query into individual words for better matching
    query_words = [word.strip() for word in query_lower.split() if len(word.strip()) > 2]

    for product in PRODUCTS:
        # Create searchable text from product
        searchable_text = (
            f"{product['name']} {product['description']} {product['category']}"
        ).lower()

        # Check if any query words match
        match_score = 0
        for word in query_words:
            if word in searchable_text:
                match_score += 1

        # If at least half the words match, consider it a match
        if match_score >= max(1, len(query_words) // 2):
            matches.append((product, match_score))

    # Sort by match score (highest first) and return products
    matches.sort(key=lambda x: x[1], reverse=True)
    return [match[0] for match in matches]

@tool
def get_product_info(product_id: str) -> str:
    """Get product description and category."""
    product = _find_product(product_id)
    if not product:
        return "Product not found."
    return f"Product: {product['name']}\nCategory: {product['category']}\nDescription: {product['description']}"

@tool
def get_price(product_id: str) -> str:
    """Get product price information."""
    product = _find_product(product_id)
    if not product:
        return "Product not found."
    return f"Price: ${product['price']:.2f}"

@tool
def get_delivery_info(product_id: str) -> str:
    """Get delivery details."""
    product = _find_product(product_id)
    if not product:
        return "Product not found."
    delivery = product['delivery']
    free_shipping_msg = f"\nFree shipping on orders over ${delivery['free_shipping_threshold']:.2f}"
    return (f"Shipping Cost: ${delivery['shipping_cost']:.2f}\n"
            f"Estimated Delivery: {delivery['estimated_days']} business days"
            f"{free_shipping_msg}")

@tool
def check_stock(product_id: str) -> str:
    """Check stock availability."""
    product = _find_product(product_id)
    if not product:
        return "Product not found."
    stock = product['stock']
    if stock > 100:
        status = "High Stock"
    elif stock > 50:
        status = "Good Stock"
    elif stock > 10:
        status = "Limited Stock"
    else:
        status = "Low Stock"
    return f"Stock Status: {status} ({stock} units available)"

@tool
def search_products(query: str) -> str:
    """Search for products by name, description, or category."""
    matches = _search_products(query)
    if not matches:
        return "No products found matching your search."

    result = "Found the following products:\n"
    for product in matches[:3]:  # Limit to top 3 results
        result += f"- {product['name']} (ID: {product['id']}): {product['description']}\n"
    return result

In [None]:
# Initialize the agent with tools
tools = list(TOOL_REGISTRY.values())
tool_names = [tool.__name__ for tool in TOOL_REGISTRY.values()]
agent = tools

print(f"Agent initialized successfully with following tool calling capabilities: \n {tool_names}")

In [None]:
def evaluate_tool_calling_step1(user_query: str) -> Dict:
    """
    STEP 1: Evaluate tool calling for product search query
    """
    print("="*80)
    print("STEP 1: EVALUATING TOOL CALLING FOR PRODUCT SEARCH")
    print("="*80)
    print(f"Query: {user_query}")

    # Process the query
    user_query_lc = user_query.lower()
    tools_used = []

    # Check if search is triggered
    if any(keyword in user_query_lc for keyword in ["looking for", "ergonomic", "wireless", "mouse"]):
        # Execute search
        search_result = search_products(user_query)
        tools_used.append({
            "tool_name": "search_products",
            "input": user_query,
            "output": search_result
        })

        # If product found, get product info
        if "P045" in search_result:
            product_info = get_product_info("P045")
            tools_used.append({
                "tool_name": "get_product_info",
                "input": "P045",
                "output": product_info
            })

    # Display results
    print("\nTool Calling Results:")
    for i, tool in enumerate(tools_used, 1):
        print(f"    Tool Call {i}:")
        print(f"    Tool Name   : {tool['tool_name']}")
        print(f"    Input       : {tool['input']}")
        print(f"    Output      : {tool['output']}")
        print()

    # Evaluation
    success = len(tools_used) > 0 and any("P045" in tool['output'] for tool in tools_used)
    print("Tool Calling Evaluation:")
    print(f"✅ Search tool called: {'Yes' if any(t['tool_name'] == 'search_products' for t in tools_used) else 'No'}")
    print(f"✅ Product found: {'Yes' if success else 'No'}")
    print(f"✅ Correct product (P045): {'Yes' if any('P045' in t['output'] for t in tools_used) else 'No'}")

    return {"tools_used": tools_used, "success": success}

def evaluate_tool_calling_step2(followup_query: str, context: str = "P045") -> Dict:
    """
    STEP 2: Evaluate tool calling for stock availability query
    """
    print("="*80)
    print("STEP 2: EVALUATING TOOL CALLING FOR STOCK QUERY")
    print("="*80)
    print(f"Follow-up Query: {followup_query}")
    print(f"Product Context: {context}")

    # Process the follow-up query
    followup_lc = followup_query.lower()
    tools_used = []

    # Check if stock query
    if "stock" in followup_lc or "available" in followup_lc:
        # Execute stock check
        stock_result = check_stock(context)
        tools_used.append({
            "tool_name": "check_stock",
            "input": context,
            "output": stock_result
        })

        # Also get product info for context
        product_info = get_product_info(context)
        tools_used.append({
            "tool_name": "get_product_info",
            "input": context,
            "output": product_info
        })

    # Display results
    print("\nTool Calling Results:")
    for i, tool in enumerate(tools_used, 1):
        print(f"    Tool Call {i}:")
        print(f"    Tool Name   : {tool['tool_name']}")
        print(f"    Input       : {tool['input']}")
        print(f"    Output      : {tool['output']}")
        print()

    # Evaluation
    success = len(tools_used) > 0 and any("Stock Status" in tool['output'] for tool in tools_used)
    print("Stock Query Evaluation:")
    print(f"✅ Stock tool called: {'Yes' if any(t['tool_name'] == 'check_stock' for t in tools_used) else 'No'}")
    print(f"✅ Stock information provided: {'Yes' if success else 'No'}")
    print(f"✅ Correct product context: {'Yes' if any(context in t['input'] for t in tools_used) else 'No'}")

    return {"tools_used": tools_used, "success": success}

# MLflow Model for Final Evaluation
class ReplicateToolCallingModel(mlflow.pyfunc.PythonModel):
    def __init__(self, model_name, api_token, model_kwargs):
        self.model_name = model_name
        self.api_token = api_token
        self.model_kwargs = model_kwargs
        self.model = None

    def load_context(self, context):
        """Load the model when needed"""
        from langchain_community.llms import Replicate
        self.model = Replicate(
            model=self.model_name,
            replicate_api_token=self.api_token,
            model_kwargs=self.model_kwargs,
        )

    def predict(self, context, model_input):
        """Predict method that combines model response with tool calling"""
        if isinstance(model_input, pd.DataFrame):
            if 'inputs' in model_input.columns:
                prompts = model_input['inputs'].tolist()
            else:
                prompts = model_input.iloc[:, 0].tolist()
        elif isinstance(model_input, list):
            prompts = model_input
        else:
            prompts = [str(model_input)]

        predictions = []
        for prompt in prompts:
            try:
                # Use tool calling for product-related queries
                if any(keyword in prompt.lower() for keyword in ["ergonomic", "wireless", "mouse", "stock", "available"]):
                    # Get product info
                    if "stock" in prompt.lower() or "available" in prompt.lower():
                        result = check_stock("P045") + ". " + get_product_info("P045")
                    else:
                        search_result = search_products(prompt)
                        if "P045" in search_result:
                            result = get_product_info("P045")
                        else:
                            result = search_result
                    predictions.append(result)
                else:
                    # Fallback to original model response
                    prediction = self.model.invoke(prompt)
                    predictions.append(prediction)

            except Exception as e:
                print(f"Error generating response for prompt '{prompt}': {e}")
                predictions.append(f"Error: {str(e)}")

        return predictions

class FinalEvaluationFormatter:
    """Format final MLflow evaluation results"""

    def __init__(self, results, evaluation_data):
        self.results = results
        self.evaluation_data = evaluation_data

    def evaluate_final_responses(self) -> str:
        """
        STEP 3: Final Agent Response Evaluation
        """
        print("="*80)
        print("STEP 3: FINAL AGENT RESPONSE EVALUATION")
        print("="*80)

        # Extract predictions
        try:
            predictions_df = self.results.tables['eval_results_table']
            if hasattr(predictions_df, 'to_pandas'):
                predictions_df = predictions_df.to_pandas()

            if 'outputs' in predictions_df.columns:
                predictions_list = predictions_df['outputs'].tolist()
            else:
                predictions_list = predictions_df.iloc[:, 0].tolist()

        except Exception as e:
            print(f"Warning: Could not extract predictions: {e}")
            return "Could not evaluate responses"

        formatted_output = []
        total_score = 0

        for idx, row in self.evaluation_data.iterrows():
            query = row['inputs']
            target = row['targets']
            prediction = str(predictions_list[idx]) if idx < len(predictions_list) else "No prediction"

            # Calculate score
            score = self._calculate_score(prediction, target)
            total_score += score

            # Format result
            result_text = f"Query {idx + 1}: {query}\n"
            result_text += f"Expected : {target}\n"
            result_text += f"Predicted: {prediction}\n"
            result_text += f"Score    : {score}/5\n"
            result_text += f"Feedback : {self._get_feedback(prediction, target)}\n"
            result_text += "-" * 60

            formatted_output.append(result_text)

        # Summary
        avg_score = total_score / len(self.evaluation_data)
        summary = f"\nFINAL EVALUATION SUMMARY:\n"
        summary += f"Average Score: {avg_score:.2f}/5\n"
        summary += f"Total Queries: {len(self.evaluation_data)}\n"

        # MLflow metrics
        metrics = {k: v for k, v in self.results.metrics.items()
                  if v is not None and not (isinstance(v, float) and pd.isna(v))}
        summary += f"MLflow Metrics: {json.dumps(metrics, indent=2)}\n"

        return "\n".join(formatted_output) + summary

    def _calculate_score(self, predicted: str, target: str) -> int:
        """Calculate evaluation score"""
        predicted_lower = predicted.lower()
        target_lower = target.lower()

        score = 0

        # Relevance check
        if any(word in predicted_lower for word in target_lower.split()):
            score += 2

        # Completeness check
        if len(predicted) >= len(target) * 0.5:
            score += 1

        # Accuracy check
        target_keywords = set(target_lower.split())
        predicted_keywords = set(predicted_lower.split())
        overlap = len(target_keywords.intersection(predicted_keywords))

        if overlap >= len(target_keywords) * 0.4:
            score += 2

        return min(5, max(1, score))

    def _get_feedback(self, predicted: str, target: str) -> str:
        """Generate feedback"""
        score = self._calculate_score(predicted, target)
        if score >= 4:
            return "Excellent response with accurate product information"
        elif score >= 3:
            return "Good response, mostly accurate"
        else:
            return "Needs improvement in accuracy and completeness"

In [None]:
"""Run the complete lab exercise in steps"""

print("MLflow Agent Evaluation Lab Exercise")
print("="*80)

# Step 1: Tool calling evaluation for product search
query1 = "I'm looking for an ergonomic wireless mouse"
step1_result = evaluate_tool_calling_step1(query1)

In [None]:
# Step 2: Tool calling evaluation for stock query
query2 = "Is the product available in stock?"
step2_result = evaluate_tool_calling_step2(query2, "P045")

In [None]:
# Step 3: Full MLflow evaluation
print("="*80)
print("STEP 3: RUNNING FULL MLFLOW EVALUATION")
print("="*80)

# Set up MLflow
mlflow.set_experiment("agent_evaluation_lab")

# Create evaluation data
response_eval = pd.DataFrame({
    "inputs": [
        "I'm looking for an ergonomic wireless mouse",
        "Is the product available in stock?",
        "Do you have ergonomic wireless mice in stock?"
    ],
    "targets": [
        "Product P045 is a wireless mouse with ergonomic support and programmable buttons",
        "Product P045 is currently available in your selected region",
        "Yes, Product P045 is available. It's an ergonomic wireless mouse with programmable buttons."
    ]
})

# Initialize and log the model
pyfunc_model = ReplicateToolCallingModel(
    model_name="ibm-granite/granite-3.3-8b-instruct",
    api_token=get_env_var('REPLICATE_API_TOKEN'),
    model_kwargs={"max_tokens": 1024, "temperature": 0.2}
)

# Log the model to MLflow
with mlflow.start_run() as run:
    mlflow.pyfunc.log_model(
        artifact_path="agent_model",
        python_model=pyfunc_model,
        pip_requirements=[
            "langchain-community",
            "replicate"
        ]
    )
    model_uri = f"runs:/{run.info.run_id}/agent_model"

# Run evaluation
print("Running MLflow evaluation...")
results = mlflow.evaluate(
    model_uri,
    response_eval,
    targets="targets",
    model_type="question-answering",
    evaluators=["default"],
)

# Format and display final results
formatter = FinalEvaluationFormatter(results, response_eval)
final_results = formatter.evaluate_final_responses()
print(final_results)

print("\n" + "="*80)
print("Evaluation COMPLETE!")
print("="*80)