In [1]:
!pip install smolagents
!pip install duckduckgo-search
!pip install evidently
!pip install huggingface_hub
!pip install litellm

Collecting smolagents
  Downloading smolagents-1.20.0-py3-none-any.whl.metadata (16 kB)
Collecting python-dotenv (from smolagents)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading smolagents-1.20.0-py3-none-any.whl (145 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.4/145.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, smolagents
Successfully installed python-dotenv-1.1.1 smolagents-1.20.0
Collecting duckduckgo-search
  Downloading duckduckgo_search-8.1.1-py3-none-any.whl.metadata (16 kB)
Collecting primp>=0.15.0 (from duckduckgo-search)
  Downloading primp-0.15.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading duckduckgo_search-8.1.1-py3-none-any.whl (18 kB)
Downloading primp-0.15.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [34]:
!pip install litellm

Collecting litellm
  Downloading litellm-1.74.9.post1-py3-none-any.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading litellm-1.74.9.post1-py3-none-any.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: litellm
Successfully installed litellm-1.74.9.post1


In [23]:
# @title All the imports
from smolagents import Tool,  DuckDuckGoSearchTool, InferenceClientModel, ToolCallingAgent
from duckduckgo_search import DDGS  # For web search
import json
import pandas as pd
from evidently.descriptors import TextLength, Sentiment, SemanticSimilarity, SentenceCount
from evidently import Dataset, DataDefinition
import os
import pandas as pd
from evidently.llm.templates import MulticlassClassificationPromptTemplate
from evidently import Dataset, DataDefinition
from evidently.descriptors import LLMEval


from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# @title fact checker tool



# Define the custom tool
class FactCheckTool(Tool):
    """Spot-check factual claims against DuckDuckGo search snippets.

    The verdict is a purely lexical heuristic over the snippet text, not a
    semantic check:

    * "Likely True"    - the claim text appears verbatim in the snippets.
    * "Partially True" - at least one leading content word of the claim appears.
    * "Likely False"   - neither of the above.
    * "Unverifiable"   - the claim is not a usable string, the search failed,
      or the search returned no snippet text.
    """

    name = "web_fact_checker"
    description = "Use this tool to verify a list of factual claims using web search and return an evaluation matrix."
    inputs = {
        "input": {
            "type": "string",
            "description": "A JSON list of factual claims.",
        }
    }
    output_type = "string"

    def _search_web(self, query):
        """Return up to five DuckDuckGo text results for ``query`` as a list."""
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=5))
        return results

    def _evaluate_fact(self, fact):
        """Classify a single claim and return ``{"fact", "status", "evidence"}``.

        Never raises for a bad claim or a failed search; those cases are
        reported as "Unverifiable" so one bad entry cannot abort the batch.
        """
        if not isinstance(fact, str) or not fact.strip():
            return {"fact": fact, "status": "Unverifiable", "evidence": "Claim must be a non-empty string"}

        try:
            search_results = self._search_web(fact)
        except Exception as exc:  # e.g. network errors or rate limiting
            return {"fact": fact, "status": "Unverifiable", "evidence": f"Search failed: {exc}"}

        context_snippets = [res["body"] for res in search_results if "body" in res]

        if not context_snippets:
            return {"fact": fact, "status": "Unverifiable", "evidence": "No relevant results"}

        combined_context = " ".join(context_snippets).lower()

        # Strip surrounding punctuation so a trailing "." does not prevent a
        # verbatim match against the snippet text.
        edge_punctuation = " \t\n.,;:!?\"'()"
        claim_text = fact.lower().strip(edge_punctuation)
        words = [w for w in (tok.strip(edge_punctuation) for tok in claim_text.split()) if w]
        # Very short words ("the", "is", "in") occur in almost any snippet, so
        # prefer longer words; fall back to the raw leading words if none exist.
        keywords = [w for w in words if len(w) > 3][:3] or words[:3]

        if claim_text and claim_text in combined_context:
            status = "Likely True"
        elif any(keyword in combined_context for keyword in keywords):
            status = "Partially True"
        else:
            status = "Likely False"
        return {"fact": fact, "status": status, "evidence": context_snippets[:2]}

    def forward(self, input):
        """Evaluate every claim in a JSON list.

        Args:
            input: JSON-encoded list of claims (the name matches the ``inputs`` key).

        Returns:
            A JSON string holding one evaluation dict per claim, or an error
            message string when ``input`` is not a JSON list.
        """
        invalid_input_message = "❌ Invalid input. Please provide a JSON list of factual claims."
        try:
            facts = json.loads(input)
        except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
            return invalid_input_message
        # Explicit check instead of ``assert``, which is removed under ``python -O``.
        if not isinstance(facts, list):
            return invalid_input_message

        evaluation_matrix = [self._evaluate_fact(fact) for fact in facts]
        return json.dumps(evaluation_matrix, indent=2)



In [9]:
# @title Evidently Statistical Response Evaluator Tool


class EvidentlyResponseEvaluatorTool(Tool):
    """
    SmolAgent-compatible tool that scores a prompt/response pair with Evidently descriptors.

    .name: Tool identifier for agent discovery.
    .description: Short explanation of what the tool does.
    .forward(prompt, response): Returns a dict mapping each metric alias to its score.
    """
    name = "evidently_response_evaluator"
    description = (
        "Evaluates a prompt and response using multiple metrics: length, sentence count, "
        "sentiment, relevance, and hallucination (semantic similarity). "
        "Input: {'prompt': <prompt str>, 'response': <response str>}. "
        "Output: dict of scores for each metric."
    )
    inputs = {
        "prompt": {
            "type": "string",
            "description": "The prompt that was given to the model.",
        },
        "response": {
            "type": "string",
            "description": "The response from the model.",
        }
    }
    output_type = "object"

    def __init__(self):
        super().__init__()
        self.descriptors = [
            TextLength("answer", alias="Length"),
            SentenceCount("answer", alias="Sentence Count"),
            Sentiment("answer", alias="Sentiment Score"),
            SemanticSimilarity(columns=["answer", "question"], alias="Relevance Score"),
            # NOTE(review): same descriptor and columns as "Relevance Score", so the two
            # values are always identical. Kept so the output keys stay unchanged; a real
            # hallucination score would need a reference/context column to compare against.
            SemanticSimilarity(columns=["answer", "question"], alias="Hallucination Score")
        ]
        self.data_definition = DataDefinition(text_columns=["question", "answer"])

    @staticmethod
    def _to_builtin(value):
        """Convert a NumPy scalar to the matching Python builtin; return other values as-is."""
        return value.item() if hasattr(value, "item") else value

    def forward(self, prompt: str, response: str):
        """
        Score a single prompt/response pair.

        Arguments:
            prompt (str): The prompt that was given to the model.
            response (str): The model's response to that prompt.
        Returns:
            dict: Metric aliases mapped to their scores, as plain Python numbers.
        Raises:
            ValueError: If either argument is not a string.
        """
        if not (isinstance(prompt, str) and isinstance(response, str)):
            raise ValueError("Both 'prompt' and 'response' must be provided as strings.")
        eval_df = pd.DataFrame([{"question": prompt, "answer": response}])
        eval_dataset = Dataset.from_pandas(
            eval_df,
            data_definition=self.data_definition
        )
        eval_dataset.add_descriptors(self.descriptors)
        result_df = eval_dataset.as_dataframe()
        # Values pulled from a DataFrame may be NumPy scalars, which json.dumps rejects
        # and whose repr is noisy when the dict is shown to the agent.
        scores = {
            desc.alias: self._to_builtin(result_df[desc.alias].iloc[0])
            for desc in self.descriptors
        }
        return scores

In [47]:
# @title pedagogy evaluator judge panel tool
class LLMResponsePedagogyEvaluator(Tool):
    """LLM-as-judge tool that grades the answer text against four pedagogical rubrics.

    Each rubric is an Evidently ``MulticlassClassificationPromptTemplate`` with
    categories "1" to "5"; ``forward`` returns one label per rubric.

    NOTE(review): the Gemini key is a tool input, so when an agent calls this
    tool the key travels through the agent's own LLM prompt. Consider supplying
    it out-of-band if that is a concern.
    """

    name = "llm_pedagogy_evaluator"
    description = (
        "Evaluates an LLM response using pedagogical rubrics: "
        "progression from simple to complex, critical thinking, term clarity, and effectiveness of examples. "
        "Uses Gemini LLM as backend model. Input: {'question', 'answer', 'gemini_api_key'}. Output: scores."
    )
    inputs = {
        "question": {"type": "string", "description": "Prompt given to model."},
        "answer": {"type": "string", "description": "Response from model."},
        "gemini_api_key": {"type": "string", "description": "Google Gemini API Key."}
    }
    output_type = "object"

    # Judge backend, defined once so a subclass can swap it without touching forward().
    judge_model = "gemini-1.5-flash"
    judge_provider = "gemini"

    def __init__(self):
        super().__init__()
        self._setup_templates()

    @staticmethod
    def _rubric(system_prompt, criteria, category_criteria):
        """Build one rubric template with the settings shared by all four rubrics."""
        return MulticlassClassificationPromptTemplate(
            pre_messages=[("system", system_prompt)],
            criteria=criteria,
            category_criteria=category_criteria,
            uncertainty="unknown",
            include_reasoning=False,
            include_scores=False
        )

    def _setup_templates(self):
        """Create the four rubric templates and store them on the instance."""
        self.simple_to_complex_template = self._rubric(
            "You are an expert in pedagogy. Your task is to evaluate how well a response's structure progresses from simple to complex.",
            "Analyze if the response effectively teaches by progressing from foundational principles to complex topics, building upon previous points.",
            {
                "5": "The response perfectly scaffolds the explanation, starting with fundamental concepts and logically building to complex ideas.",
                "4": "The response generally moves from basic to complex but may have minor logical gaps.",
                "3": "The response lacks a clear pedagogical structure, introducing complex topics without a proper foundation.",
                "2": "The response begins with advanced concepts, failing to establish a basic foundation.",
                "1": "The response explains basic concepts well but fails to progress to more complex topics."
            },
        )
        self.critical_thinking_template = self._rubric(
            "You are an expert in educational psychology. Your task is to evaluate if a response encourages active learning and critical thinking.",
            "Analyze if the response prompts the user to question, analyze, and apply information, going beyond passive information delivery.",
            {
                "5": "The response actively stimulates critical thinking with thought-provoking, open-ended questions or problem-solving scenarios.",
                "4": "The response includes some questions or prompts for reflection, but they may be superficial.",
                "3": "The response is purely informational, providing information passively.",
                "2": "The response is overly simplistic or provides closed answers that inhibit further inquiry.",
                "1": "The response consists only of questions without providing foundational information."
            },
        )
        self.term_clarity_template = self._rubric(
            "You are an expert in technical writing. Your task is to evaluate if the response defines key terms before using them.",
            "Analyze if the response clearly defines key terminology before or as it is used, avoiding jargon to ensure the content is accessible.",
            {
                "5": "The response proactively identifies and clearly defines all key terms before they are used in a complex context.",
                "4": "The response defines most key terms but might miss a few or provide slightly unclear definitions.",
                "3": "The response uses specialized terminology extensively without providing any definitions.",
                "2": "The response attempts to define terms, but the definitions are confusing or inaccurate.",
                "1": "The response avoids using any specialized terms that require a definition."
            },
        )
        self.example_correctness_template = self._rubric(
            "You are an expert in instructional design. Your task is to evaluate the effectiveness of examples used in an explanation.",
            "Analyze the quality and relevance of examples used to help a user understand a complex topic. Effective examples should be clear, relatable, and illustrative.",
            {
                "5": "The response uses highly effective examples that are clear, relevant, and make abstract concepts easy to understand.",
                "4": "The response includes examples that are generally helpful but might be slightly unclear.",
                "3": "The examples provided are confusing, incorrect, or more complex than the topic they are supposed to clarify.",
                "2": "The response explains a complex topic but does not use any examples.",
                "1": "The response claims to provide an example but only restates the definition."
            },
        )

    # The forward() parameter names must match the keys of `inputs` exactly.
    def forward(self, question, answer, gemini_api_key):
        """Grade ``answer`` against the four pedagogy rubrics.

        Args:
            question: Prompt that was given to the model.
            answer: Model response to grade; this is the column the judges read.
            gemini_api_key: Gemini API key; exported as ``GEMINI_API_KEY`` for the
                rest of the process (side effect).

        Returns:
            dict mapping each rubric alias to the label in its result column.

        Raises:
            ValueError: If any argument is not a string or the key is blank.
        """
        if not (isinstance(question, str) and isinstance(answer, str) and isinstance(gemini_api_key, str) and gemini_api_key.strip()):
            raise ValueError("Inputs must include 'question', 'answer', and a valid 'gemini_api_key'.")

        # Store the stripped key: stray whitespace/newlines from copy-paste pass the
        # check above but would otherwise be exported as part of the key.
        os.environ["GEMINI_API_KEY"] = gemini_api_key.strip()

        eval_df = pd.DataFrame([[question, answer]], columns=["question", "answer"])
        data_definition = DataDefinition(text_columns=["question", "answer"])
        eval_dataset = Dataset.from_pandas(eval_df, data_definition=data_definition)

        # Single source of truth for alias -> rubric, so the descriptor aliases and
        # the keys of the returned dict cannot drift apart.
        rubric_by_alias = {
            "Progression: Simple to Complex": self.simple_to_complex_template,
            "Promotes Critical Thinking": self.critical_thinking_template,
            "Terms Clearly Defined": self.term_clarity_template,
            "Effectiveness of Examples": self.example_correctness_template,
        }
        descriptors = [
            LLMEval(
                column_name="answer",
                template=template,
                model=self.judge_model,
                provider=self.judge_provider,
                alias=alias
            )
            for alias, template in rubric_by_alias.items()
        ]
        eval_dataset.add_descriptors(descriptors)
        result_df = eval_dataset.as_dataframe()

        output = {alias: result_df[alias].iloc[0] for alias in rubric_by_alias}
        return output

In [50]:
# @title Clarity and Conciseness Evaluator Judge Panel Tool
class clarityAndConcesseness(Tool):
    """LLM-as-judge tool that grades the answer text for clarity, conciseness and completeness.

    Three Evidently ``MulticlassClassificationPromptTemplate`` rubrics are used:
    core-message clarity, sentence conciseness, and completeness of the answer.
    ``forward`` returns one label per rubric.

    NOTE(review): the Gemini key is a tool input, so when an agent calls this
    tool the key travels through the agent's own LLM prompt. Consider supplying
    it out-of-band if that is a concern.
    """

    # A distinct name and a matching description: the agent picks tools by these,
    # and the previous values were copied from the pedagogy evaluator.
    name = "llm_clarity_conciseness_evaluator"
    description = (
        "Evaluates an LLM response for clarity and conciseness: "
        "how easily the core message can be understood, sentence conciseness, and completeness of the answer. "
        "Uses Gemini LLM as backend model. Input: {'question', 'answer', 'gemini_api_key'}. Output: a label per rubric."
    )
    inputs = {
        "question": {"type": "string", "description": "Prompt given to model."},
        "answer": {"type": "string", "description": "Response from model."},
        "gemini_api_key": {"type": "string", "description": "Google Gemini API Key."}
    }
    output_type = "object"

    # Judge backend, defined once so a subclass can swap it without touching forward().
    judge_model = "gemini-1.5-flash"
    judge_provider = "gemini"

    def __init__(self):
        super().__init__()
        self._setup_templates()

    @staticmethod
    def _rubric(system_prompt, criteria, category_criteria):
        """Build one rubric template with the settings shared by all three rubrics."""
        return MulticlassClassificationPromptTemplate(
            pre_messages=[("system", system_prompt)],
            criteria=criteria,
            category_criteria=category_criteria,
            uncertainty="unknown"
        )

    def _setup_templates(self):
        """Create the three rubric templates and store them on the instance."""
        self.core_message_template = self._rubric(
            "You are an expert in communication. Your task is to evaluate how easily the core message of a response can be understood.",
            "Analyze how easily and quickly a user can understand the core message. It should be concise, clear, and not buried under jargon or rambling. ",
            {
                "immediately_clear": "The core message is stated upfront and is exceptionally clear and concise. The user can grasp the main point in seconds.",
                "clear_after_reading": "The core message is clear, but the user needs to read most of the response to understand it.",
                "buried_message": "The core message is present but is buried under irrelevant details, requiring significant effort to find.",
                "unclear_or_ambiguous": "The response is unfocused or convoluted, making it difficult to determine the core message.",
                "no_core_message": "The response fails to deliver a core message."
            },
        )

        self.sentence_length_template = self._rubric(
            "You are an expert editor focused on clarity. Your task is to evaluate sentence structure in a response.",
            "Analyze the sentence structure. Is it unnecessarily long or convoluted? The complexity should be appropriate for the topic. ",
            {
                "concise_and_clear": "Sentences are well-constructed, direct, and easy to parse, enhancing readability.",
                "slightly_wordy": "Generally clear, but some sentences are longer than necessary or contain filler phrases.",
                "overly_long_sentences": "Sentences are grammatically correct but too long, making them difficult to follow.",
                "convoluted_structure": "Sentences use overly complex syntax or jargon that obscures the meaning.",
                "long_and_convoluted": "Sentences are both excessively long and confusingly structured."
            },
        )

        self.completeness_template = self._rubric(
            "You are a detail-oriented analyst. Your task is to assess how thoroughly a response addresses all parts of a user's question.",
            "Evaluate if the response addresses all explicit and implicit parts of the user's question. ",
            {
                "excellent_coverage": "The response comprehensively addresses every single part of the user's question.",
                "good_coverage": "The response addresses the main parts of the question but overlooks a minor detail.",
                "partial_coverage": "The response addresses some significant parts but ignores other significant components.",
                "narrow_focus": "The response focuses on only one aspect of the query, ignoring other parts.",
                "no_coverage": "The response fails to address any specific parts of the user's question."
            },
        )

    # The forward() parameter names must match the keys of `inputs` exactly.
    def forward(self, question, answer, gemini_api_key):
        """Grade ``answer`` against the clarity, conciseness and completeness rubrics.

        Args:
            question: Prompt that was given to the model.
            answer: Model response to grade; this is the column the judges read.
            gemini_api_key: Gemini API key; exported as ``GEMINI_API_KEY`` for the
                rest of the process (side effect).

        Returns:
            dict mapping each rubric alias to the label in its result column.

        Raises:
            ValueError: If any argument is not a string or the key is blank.
        """
        if not (isinstance(question, str) and isinstance(answer, str) and isinstance(gemini_api_key, str) and gemini_api_key.strip()):
            raise ValueError("Inputs must include 'question', 'answer', and a valid 'gemini_api_key'.")

        # Store the stripped key: stray whitespace/newlines from copy-paste pass the
        # check above but would otherwise be exported as part of the key.
        os.environ["GEMINI_API_KEY"] = gemini_api_key.strip()

        eval_df = pd.DataFrame([[question, answer]], columns=["question", "answer"])
        data_definition = DataDefinition(text_columns=["question", "answer"])
        eval_dataset = Dataset.from_pandas(eval_df, data_definition=data_definition)

        # Single source of truth for alias -> rubric, so the descriptor aliases and
        # the keys of the returned dict cannot drift apart.
        rubric_by_alias = {
            "Core Message Clarity": self.core_message_template,
            "Sentence Conciseness": self.sentence_length_template,
            "Completeness of Answer": self.completeness_template,
        }
        descriptors = [
            LLMEval(column_name="answer", template=template, model=self.judge_model, provider=self.judge_provider, alias=alias)
            for alias, template in rubric_by_alias.items()
        ]
        eval_dataset.add_descriptors(descriptors)
        result_df = eval_dataset.as_dataframe()

        output = {alias: result_df[alias].iloc[0] for alias in rubric_by_alias}
        return output




Structured tool call result:

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.



InternalServerError: litellm.InternalServerError: litellm.InternalServerError: VertexAIException - {
  "error": {
    "code": 503,
    "message": "The model is overloaded. Please try again later.",
    "status": "UNAVAILABLE"
  }
}


In [None]:
from getpass import getpass  # imported here so this cell does not depend on a re-run of the imports cell

tool = clarityAndConcesseness()
clarityAndConcesseness_judges_panel = ToolCallingAgent(
    tools=[tool],
    model=InferenceClientModel(),
    name="Clarity_And_Concesseness_Judges_Panel_Agent"
)

# Inputs
input_question = """ You are an AI assistant designed to teach machine learning concepts. I am an engineering student with a basic understanding of mathematics..."""
model_response = """Excellent question! This is the perfect way to approach learning machine learning..."""
# Never hard-code the key in the notebook: read it from the environment,
# otherwise ask for it without echoing it into the cell output.
API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")

# Option 1: Structured tool call (bypasses the agent; can still fail on Gemini API errors such as 503)
inputs = {
    "question": input_question,
    "answer": model_response,
    "gemini_api_key": API_KEY
}
print("Structured tool call result:")
print(tool.forward(**inputs))

# Option 2: Natural language to agent (depends on agent+model capabilities)
# WARNING: the key is embedded in the prompt, so it is sent to the agent's model as well.
prompt = (
    f"Evaluate this response: '{model_response}' to this prompt: '{input_question}' "
    f"using this API key: '{API_KEY}'. Return the output of the tool as a JSON object."
)

print("\nAgent natural language routing result:")
# Run the agent instance, not the Tool class.
agent_result = clarityAndConcesseness_judges_panel.run(prompt)
print(agent_result)



In [46]:
from getpass import getpass  # imported here so this cell does not depend on a re-run of the imports cell

tool = LLMResponsePedagogyEvaluator()
pedagogy_judges_panel = ToolCallingAgent(
    tools=[tool],
    model=InferenceClientModel(),
    name="pedagogy_judges_panel_agent"
)

# Inputs
input_question = """ You are an AI assistant designed to teach machine learning concepts. I am an engineering student with a basic understanding of mathematics..."""
model_response = """Excellent question! This is the perfect way to approach learning machine learning..."""
# Never hard-code the key in the notebook: read it from the environment,
# otherwise ask for it without echoing it into the cell output.
API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")

# Option 1: Structured tool call (bypasses the agent; can still fail on Gemini API errors)
inputs = {
    "question": input_question,
    "answer": model_response,
    "gemini_api_key": API_KEY
}
print("Structured tool call result:")
print(tool.forward(**inputs))

# Option 2: Natural language to agent (depends on agent+model capabilities)
# WARNING: the key is embedded in the prompt, so it is sent to the agent's model as well.
prompt = (
    f"Evaluate this response: '{model_response}' to this prompt: '{input_question}' "
    f"using this API key: '{API_KEY}'. Return the output of the tool as a JSON object."
)

print("\nAgent natural language routing result:")
agent_result = pedagogy_judges_panel.run(prompt)
print(agent_result)



Structured tool call result:
{'Progression: Simple to Complex': 'UNKNOWN', 'Promotes Critical Thinking': '4', 'Terms Clearly Defined': 'UNKNOWN', 'Effectiveness of Examples': 'UNKNOWN'}

Agent natural language routing result:


{'Progression: Simple to Complex': 'UNKNOWN', 'Promotes Critical Thinking': '4', 'Terms Clearly Defined': '1', 'Effectiveness of Examples': 'UNKNOWN'}


In [22]:
# Agent whose only tool is the web fact checker.
fact_checker_agent = ToolCallingAgent(
    tools=[FactCheckTool()],
    model=InferenceClientModel(),
    name="FactCheckerAgent",
)

# Claims to verify. Serialising the list with json.dumps yields exactly the
# JSON array text the tool expects as its "input" argument.
claims = [
    "The Eiffel Tower is in Berlin.",
    "Elon Musk owns Tesla.",
    "Water boils at 90 degrees Celsius.",
    "India launched Chandrayaan-3 in 2023.",
]
input_prompt = "Check the following facts:\n" + json.dumps(claims, indent=2)

# Run the agent and show its final answer.
response = fact_checker_agent.run(input_prompt)
print(response)

  with DDGS() as ddgs:


  with DDGS() as ddgs:


  with DDGS() as ddgs:


  with DDGS() as ddgs:






  with DDGS() as ddgs:


  with DDGS() as ddgs:




  with DDGS() as ddgs:


  with DDGS() as ddgs:
  with DDGS() as ddgs:
  with DDGS() as ddgs:




  with DDGS() as ddgs:
  with DDGS() as ddgs:
  with DDGS() as ddgs:
  with DDGS() as ddgs:


  with DDGS() as ddgs:


  with DDGS() as ddgs:


  with DDGS() as ddgs:


  with DDGS() as ddgs:




  with DDGS() as ddgs:
  with DDGS() as ddgs:
  with DDGS() as ddgs:
  with DDGS() as ddgs:


  with DDGS() as ddgs:
  with DDGS() as ddgs:
  with DDGS() as ddgs:
  with DDGS() as ddgs:


[False, True, False, False]


In [21]:

# Agent whose only tool is the Evidently-based statistical evaluator.
Statictic_response_evalution = ToolCallingAgent(
    tools=[EvidentlyResponseEvaluatorTool()],
    model=InferenceClientModel(),
    name="Statictic_response_evalution",
)

# Prompt/response pair to be scored.
input_question = """ You are an AI assistant designed to teach machine learning concepts. I am an engineering student with a basic understanding of mathematics..."""
model_response = """Excellent question! This is the perfect way to approach learning machine learning..."""

# Natural-language request; the agent is expected to route it to the evaluator tool.
evaluation_request = (
    f"Evaluate this response: '{model_response}' to this prompt: '{input_question}'. "
    "Return the output of the tool as a JSON object."
)

response = Statictic_response_evalution.run(evaluation_request)
print(response)

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


{"Length": 84, "Sentence Count": 1, "Sentiment Score": 0. 8268, "Relevance Score": 0. 68883896, "Hallucination Score": 0. 68883896}
