# Chat Extraction

This benchmark combines classification, summarization, and extraction in one combined task. The model is
expected to output formatted JSON in the expected schema.

In [1]:
import getpass
import os
import uuid

uid = uuid.uuid4().hex[:4]  # Avoid conflicts in project names

# Get your API key from https://smith.langchain.com/settings
# Credentials must never be written into the notebook: any key that is not
# already present in the environment is requested interactively (input hidden),
# so it ends up neither in the source nor in the saved cell output.
api_keys = [
    "LANGCHAIN_API_KEY",
]
for key in api_keys:
    if key not in os.environ:
        os.environ[key] = getpass.getpass(f"Enter your {key}: ")

In [2]:
from langchain_benchmarks import clone_public_dataset, registry

# Look up the benchmark task definition by its registry name.
task = registry["Chat Extraction"]

# Clone the dataset to your tenant
clone_public_dataset(task.dataset_id, dataset_name=task.name)


# Bare expression: display the task as this cell's output.
task

Dataset Chat Extraction already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/11aec5e3-a2f8-4942-b52a-2727dd9c10a0/datasets/1f457916-d48b-4989-af10-33ea06a3c8de.


0,1
Name,Chat Extraction
Type,ExtractionTask
Dataset ID,00f4444c-9460-4a82-b87a-f50096f1cfef
Description,A dataset meant to test the ability of an LLM to extract and infer structured information from a dialogue. The dialogue is between a user and a support engineer. Outputs should be structured as a JSON object and test both the ability of the LLM to correctly structure the information and its ability to perform simple classification tasks.


#### Schema

Each extraction task has an expected output schema defined in a Pydantic BaseModel object, which we can use to
get a JSON schema object.

In [3]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
# from typing import Optional, Dict, Any, List
# import json
# from langchain.llms.base import LLM
# from langchain_core.callbacks.manager import CallbackManagerForLLMRun
# from pydantic import Field, BaseModel
# from nu_extract_run import NuExtract

# class LocalChatModel(LLM):
#     tokenizer: Any = Field(exclude=True)
#     model: Any = Field(exclude=True)
#     model_name: str = Field(default="numind/NuExtract-1.5")
#     temperature: float = Field(default=0.0)
#     device: str = Field(default_factory=lambda: "cuda" if torch.cuda.is_available() else "cpu")
#     bound_functions: List[Dict[str, Any]] = Field(default_factory=list)
#     function_call: Optional[str] = Field(default=None)
#     nuextract = NuExtract()
    
#     def __init__(self, model_name: str = "numind/NuExtract-1.5", temperature: float = 0.0, device: str = None, pydantic_example = None, **kwargs):
#         device = device or ("cuda" if torch.cuda.is_available() else "cpu")
#         tokenizer = AutoTokenizer.from_pretrained(model_name)
#         model: Any = Field(exclude=True)
#         # self.nuextract = NuExtract()
#         self.pydantic_example = pydantic_example
#         super().__init__(model_name=model_name, temperature=temperature, device=device, tokenizer=tokenizer, model=model, **kwargs)

#     def bind_functions(self, functions: List[Dict[str, Any]], function_call: Optional[str] = None) -> LLM:
#         new_instance = self.copy()
#         new_instance.bound_functions = functions
#         new_instance.function_call = function_call
#         return new_instance

#     def _call(self, prompt: str, stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None, **kwargs: Any) -> str:
#         prompt = f"{prompt}\nOutput only valid JSON"
#         # inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

#         with torch.no_grad():
#             output = self.nuextract.extract_json(self.pydantic_example, prompt)
        
#         print(output)

#         meow

#         # return "{}"

#     @property
#     def _llm_type(self) -> str:
#         return "nuextract"

#     @property
#     def _identifying_params(self) -> Dict[str, Any]:
#         return {"model_name": self.model_name, "temperature": self.temperature, "device": self.device}

In [4]:
# class CarModel(BaseModel):
#     Name: str
#     Manufacturer: str
#     Designers: list
#     Number_of_units_produced: int

# class DataModel(BaseModel):
#     Car: CarModel
# car_obj = DataModel(Car=CarModel(Name="", Manufacturer="", Designers=[], Number_of_units_produced=0))

# model = LocalChatModel()

# model("this si prompt")

## Define an extraction chain

Let's build the extraction chain that we can use to get structured information from the dialogues.

In [5]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from transformers import pipeline
from nu_extract_run import NuExtract

def format_run(dialogue_input: dict):
    """Render a question/answer pair as the tagged ``dialogue`` prompt variable.

    Args:
        dialogue_input: Mapping with ``"question"`` and ``"answer"`` entries.

    Returns:
        A dict with a single ``"dialogue"`` key holding the question wrapped in
        ``<question>`` tags followed by the answer wrapped in
        ``<assistant-response>`` tags.
    """
    question, answer = dialogue_input["question"], dialogue_input["answer"]
    question_part = f"<question>\n{question}\n</question>\n"
    answer_part = f"<assistant-response>\n{answer}\n</assistant-response>"
    return {"dialogue": question_part + answer_part}

# Local NuExtract wrapper imported from the project module `nu_extract_run` (not shown here).
llm = NuExtract()

# NOTE(review): the recorded run further down fails on examples with
# "ValidationError ... Generation / text / str type expected", so NuExtract
# presumably does not hand back a plain string — confirm the wrapper's return
# type and whether an OpenAI-functions output parser suits a local model.
output_parser = JsonOutputFunctionsParser()
# Pipeline: format the dialogue -> task prompt -> model -> parse -> wrap under "output".
extraction_chain = (
    format_run
    | task.instructions
    | llm
    | output_parser
    | (lambda x: {"output": x})
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Now it's time to measure our chain's effectiveness!

## Evaluate

Let's evaluate the chain now.

In [6]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
from langsmith.client import Client
from nu_extract_run import NuExtract

from langchain_benchmarks.extraction.tasks.chat_extraction import get_eval_config

# LangSmith client; constructed with no arguments, so configuration comes from outside this cell.
client = Client()

# Evaluator configuration shipped with the Chat Extraction benchmark.
eval_config = get_eval_config()

# Run the NuExtract chain over every dataset example and score it with the
# benchmark evaluators; results are stored under a project name made unique by `uid`.
test_run = client.run_on_dataset(
    dataset_name=task.name,
    llm_or_chain_factory=extraction_chain,
    # llm_or_chain_factory=llm,
    evaluation=eval_config,
    verbose=True,
    project_name=f"numind-NuExtract-1.5-{uid}",
    # NOTE(review): "openai-functions" is recorded as the architecture although
    # the model is a local NuExtract — confirm this label is intended.
    project_metadata={
        "arch": "openai-functions",
        "model": "local-NuExtract-1.5",
    },
)

View the evaluation results for project 'numind-NuExtract-1.5-bc96' at:
https://smith.langchain.com/o/11aec5e3-a2f8-4942-b52a-2727dd9c10a0/datasets/1f457916-d48b-4989-af10-33ea06a3c8de/compare?selectedSessions=b891375a-1546-4710-b72f-42b71046487b

View all tests for Dataset Chat Extraction at:
https://smith.langchain.com/o/11aec5e3-a2f8-4942-b52a-2727dd9c10a0/datasets/1f457916-d48b-4989-af10-33ea06a3c8de
[>                                                 ] 0/27

Chain failed for example 82a97a89-3480-4143-aaea-a158e2c1a642 with inputs {'answer': 'To use cache in ChatOpenAI in TypeScript, you can follow these steps:\n\n1. Import the necessary dependencies:\n```typescript\nimport { ChatOpenAI, RedisCache } from \'langchain\';\n```\n\n2. Create an instance of the RedisCache class and pass it as the value for the `cache` parameter in the ChatOpenAI constructor:\n```typescript\nconst cache = new RedisCache(upstashRedisCache); // Replace `upstashRedisCache` with your Redis cache configuration\nconst model = new ChatOpenAI({\n  modelName: "gpt-4-1106-preview",\n  streaming: true,\n  cache: cache,\n  callbacks: [\n    {\n      handleLLMNewToken(token) {\n        doesToken = true;\n        res.write(token);\n      },\n    },\n  ],\n});\n```\n\nMake sure to replace `upstashRedisCache` with your Redis cache configuration.\n\nBy using the RedisCache class from the `langchain` library, you can enable caching in ChatOpenAI and improve the performance of you

meow
System: You are a helpdesk assistant responsible with extracting information and generating tickets. Dialogues are between a user and a support engineer.
Human: Generate a ticket for the following question-response pair:
<Dialogue>
<question>
const model = new ChatOpenAI({
      modelName: "gpt-4-1106-preview",
      streaming: true,
      cache: upstashRedisCache,
      callbacks: [
        {
          handleLLMNewToken(token) {
            doesToken = true
            res.write(token)
          },
        },
      ],
    })
how to use cache in chatopenai in typescript?
</question>
<assistant-response>
To use cache in ChatOpenAI in TypeScript, you can follow these steps:

1. Import the necessary dependencies:
```typescript
import { ChatOpenAI, RedisCache } from 'langchain';
```

2. Create an instance of the RedisCache class and pass it as the value for the `cache` parameter in the ChatOpenAI constructor:
```typescript
const cache = new RedisCache(upstashRedisCache); // Replace `u

Chain failed for example 4e631a5f-d21f-4508-bacf-24ca555f11fa with inputs {'answer': 'Yes, it is possible to use function calls with Llama models. Llama provides different wrappers and APIs that allow you to interact with the models using function calls. Here are a few examples:\n\n1. LlamaCpp: You can use the LlamaCpp wrapper to make function calls to Llama models. This wrapper provides a simple way to use Llama models by passing in a string and getting a string completion as the output [2].\n\n2. LlamaAPI: Llama also provides an API called LlamaAPI, which is a hosted version of Llama2 that supports function calling. You can use the LlamaAPI Python package to interact with Llama models by creating an instance of the LlamaAPI class and making function calls using the `invoke` method [1].\n\nThese wrappers and APIs allow you to easily integrate Llama models into your code and make function calls to generate responses or perform other tasks.', 'question': 'Is it possible to use function 

meow
System: You are a helpdesk assistant responsible with extracting information and generating tickets. Dialogues are between a user and a support engineer.
Human: Generate a ticket for the following question-response pair:
<Dialogue>
<question>
Is it possible to use function call with llama models
</question>
<assistant-response>
Yes, it is possible to use function calls with Llama models. Llama provides different wrappers and APIs that allow you to interact with the models using function calls. Here are a few examples:

1. LlamaCpp: You can use the LlamaCpp wrapper to make function calls to Llama models. This wrapper provides a simple way to use Llama models by passing in a string and getting a string completion as the output [2].

2. LlamaAPI: Llama also provides an API called LlamaAPI, which is a hosted version of Llama2 that supports function calling. You can use the LlamaAPI Python package to interact with Llama models by creating an instance of the LlamaAPI class and making fu

Chain failed for example e276069a-8ce7-43c5-93ba-e2c4801f66ae with inputs {'answer': 'To import the `langchain.dataframes` module, you can use the following code:\n\n```python\nfrom langchain.dataframes import *\n```\n\nThis will import all the classes and functions from the `langchain.dataframes` module, allowing you to use them in your code.', 'question': 'How do I import the module langchain.dataframes?'}
Error Type: ValidationError, Message: 1 validation error for Generation
text
  str type expected (type=type_error.str)


meow
System: You are a helpdesk assistant responsible with extracting information and generating tickets. Dialogues are between a user and a support engineer.
Human: Generate a ticket for the following question-response pair:
<Dialogue>
<question>
How do I import the module langchain.dataframes?
</question>
<assistant-response>
To import the `langchain.dataframes` module, you can use the following code:

```python
from langchain.dataframes import *
```

This will import all the classes and functions from the `langchain.dataframes` module, allowing you to use them in your code.
</assistant-response>
</Dialogue>
meow
[----->                                            ] 3/27

Chain failed for example 2ffc455c-fdfa-4f13-9438-b1cb0849f548 with inputs {'answer': 'Para crear un agente que pueda utilizar una base de datos vectorial existente y un documento en un directorio como fuente de información, puedes seguir estos pasos:\n\n1. Crear la base de datos vectorial:\n   - Utiliza la clase `Chroma` del módulo `langchain.vectorstores` para crear la base de datos vectorial.\n   - Carga los documentos en la base de datos utilizando un cargador de documentos, como `TextLoader` o `WebBaseLoader`.\n   - Divide los documentos en fragmentos más pequeños utilizando `CharacterTextSplitter`.\n   - Crea los vectores de los fragmentos utilizando un modelo de embeddings, como `OpenAIEmbeddings`.\n   - Crea la base de datos vectorial utilizando el método `from_documents` de la clase `Chroma`.\n\n2. Crear el agente:\n   - Importa las clases y funciones necesarias, como `initialize_agent`, `Tool`, `AgentType`, `LLMMathChain`, etc.\n   - Define las herramientas del agente utilizan

meow
System: You are a helpdesk assistant responsible with extracting information and generating tickets. Dialogues are between a user and a support engineer.
Human: Generate a ticket for the following question-response pair:
<Dialogue>
<question>
quiero crear un agente que sea capaz de usar una base de datos vectorial ya creada para buscar informacion, ademas que pueda usar un documento en un directorio como fuente de informacion para solucionar su proposito
</question>
<assistant-response>
Para crear un agente que pueda utilizar una base de datos vectorial existente y un documento en un directorio como fuente de información, puedes seguir estos pasos:

1. Crear la base de datos vectorial:
   - Utiliza la clase `Chroma` del módulo `langchain.vectorstores` para crear la base de datos vectorial.
   - Carga los documentos en la base de datos utilizando un cargador de documentos, como `TextLoader` o `WebBaseLoader`.
   - Divide los documentos en fragmentos más pequeños utilizando `Charact

Chain failed for example 21975f6c-c828-4c5c-83b1-af1e2b52d52e with inputs {'answer': 'To extract the logical argument from a text using OpenAI LLM (Large Language Model), there are several ways you can approach it. Here are some techniques you can use:\n\n1. **Causal Modeling**: Causal modeling involves analyzing the cause-and-effect relationships within the text to identify the logical argument. By understanding the logical flow of the text and identifying the premises and conclusions, you can extract the logical argument.\n\n2. **Robustness Testing**: Robustness testing involves subjecting the text to various scenarios and evaluating the consistency and validity of the logical argument. This can help identify any weaknesses or fallacies in the argument.\n\n3. **Bias Mitigation**: Bias mitigation techniques can be used to ensure that the logical argument is not influenced by any biased or unfair reasoning. By addressing any biases in the text, you can extract a more objective and logi

meow
System: You are a helpdesk assistant responsible with extracting information and generating tickets. Dialogues are between a user and a support engineer.
Human: Generate a ticket for the following question-response pair:
<Dialogue>
<question>
I want to use openai LLM to extract the logical argument from a text. What are the ways to do it and what is the best way?
</question>
<assistant-response>
To extract the logical argument from a text using OpenAI LLM (Large Language Model), there are several ways you can approach it. Here are some techniques you can use:

1. **Causal Modeling**: Causal modeling involves analyzing the cause-and-effect relationships within the text to identify the logical argument. By understanding the logical flow of the text and identifying the premises and conclusions, you can extract the logical argument.

2. **Robustness Testing**: Robustness testing involves subjecting the text to various scenarios and evaluating the consistency and validity of the logica

KeyboardInterrupt: 

Chain failed for example 320f621f-fab5-412e-9154-e5ad827aba3b with inputs {'answer': 'To run Llama2 using pandas, you can follow these steps:\n\n1. Install the necessary packages:\n   - Install the `langchain` library by running `pip install langchain`.\n   - Install the `pandas` library by running `pip install pandas`.\n\n2. Import the required modules:\n   - Import the `Ollama` class from the `langchain.llms` module.\n   - Import the `pandas` module.\n\n3. Load your data into a pandas DataFrame:\n   - Use the pandas library to load your data into a DataFrame. You can use functions like `read_csv()` or `read_excel()` depending on the format of your data.\n\n4. Create an instance of the `Ollama` class:\n   - Initialize an instance of the `Ollama` class, specifying the desired model. For example, you can use `llm = Ollama(model="llama2")`.\n\n5. Generate predictions:\n   - Use the `llm` instance to generate predictions on your data. You can pass your data as input to the `llm()` method.

[---------->                                       ] 6/27

Chain failed for example c1ab4fd9-a571-4bc1-bd4c-a3a45433f782 with inputs {'answer': "To perform the task of creating an app that interacts with the user, converts user requests into structured descriptions (JSON), determines the action type, selects the appropriate agent or tool, and generates Python code based on API documentation, you can follow these steps using Langchain concepts:\n\n1. Set up the necessary components:\n   - Define the tools: Identify the tools you need to interact with the user and perform the required actions. For example, you might need tools for user input, JSON manipulation, and API documentation retrieval.\n   - Create the agent: Initialize an agent that can handle the user's requests, parse the inputs, and determine the appropriate actions to take based on the structured description.\n\n2. Implement the chat functionality:\n   - Use a chat model to interact with the user: You can use a chat model from Langchain, such as the `ChatOpenAI` model, to handle the

[------------>                                     ] 7/27

Chain failed for example 28520bbe-6272-4c84-b79f-d31ac55c4574 with inputs {'answer': 'Pour joindre les deux outputs, vous pouvez utiliser la fonction `zip` pour combiner les éléments correspondants des deux listes, puis utiliser une boucle `for` pour les concaténer en une seule chaîne de caractères. Voici un exemple de code qui illustre cette approche :\n\n```python\nheader_label = header.iloc[:, 1:].apply(lambda x: \'/ \'.join(x.dropna().astype(str).unique()), axis=1)\nheader_list = ": ".join([header_filter, header_label])\n\noutput = ""\nfor label, value in zip(header_label, header_list):\n    output += f"{label}: {value}\\n"\n\nprint(output)\n```\n\nCela devrait vous donner une sortie qui combine les éléments des deux listes dans un format spécifié.', 'question': 'je travail sur python. je souhaite joindre ces deux outputs : \n\n        header_label = header.iloc[:, 1:].apply(lambda x: \'/ \'.join(x.dropna().astype(str).unique()), axis=1)\n        print(header_label)\n        header

[-------------->                                   ] 8/27

Chain failed for example 7fba0db4-97b9-4bf3-b09f-84f41f0827e6 with inputs {'answer': 'To use a RecursiveUrlLoader to load content from a page, you need to follow these steps:\n\n1. Import the RecursiveUrlLoader class from the langchain.document_loaders.recursive_url_loader module.\n2. Create an instance of the RecursiveUrlLoader, passing the target URL as the `url` parameter.\n3. Optionally, you can customize the loader by specifying additional parameters such as `max_depth`, `exclude_dirs`, `use_async`, `extractor`, and `timeout`.\n4. Call the `load()` method on the loader instance to load the web pages.\n5. The `load()` method will return a list of Document objects, each representing a loaded web page.\n6. You can access the content of each Document object using the `page_content` attribute.\n\nHere is an example of how to use the RecursiveUrlLoader:\n\n```python\nfrom langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader\n\nurl = "https://docs.python.org/3.9/"\nl

[---------------->                                 ] 9/27

## Compare to Claude-2

Let's compare our results to Anthropic's Claude-2. We will mimic the function calling interface.

In [None]:
from typing import Any, Dict, Type

from langchain.chat_models import ChatAnthropic
from langchain.output_parsers.xml import XMLOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel

# Prompt that shows the model a schema and asks for an XML document whose root
# tag is `{function_call}`. `{schema}`, `{function_call}` and `{dialogue}` are
# template variables; the first two are bound below via `.partial(...)`.
claude_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a data extraction bot tasked with extracting and inferring information from dialogues and generating tickets. Always respond "
            "only with XML based on the following JSON schema:\n{schema}",
        ),
        (
            "user",
            "Generate a ticket from the following question-response pair:\n"
            "<Dialogue>\n{dialogue}\n</Dialogue>\n"
            "Remember, respond directly with this format:\n"
            # The trailing newline keeps the closing tag from running straight
            # into the final instruction ("</GenerateTicket>RESPOND ONLY ...").
            "<{function_call}>\n...\n</{function_call}>\n"
            "RESPOND ONLY IN XML THEN STOP.",
        ),
    ]
)
# Bind the task's JSON schema and its title (used as the XML root tag).
prompt = claude_prompt.partial(
    schema=task.schema.schema_json(), function_call=task.schema.schema()["title"]
)

# temperature=0 keeps the benchmark output as repeatable as the API allows.
claude = ChatAnthropic(model="claude-2", temperature=0, max_tokens_to_sample=2048)


class MergeSchema:
    """Merge the XML Output Parser schema into the output.

    Callable that takes parsed XML (a dict keyed by root tag) and normalises the
    value stored under the schema's class name into a flat dict, consulting the
    schema's field types to decide how list-valued entries are collapsed.
    """

    def __init__(self, schema: Type[BaseModel]):
        self.schema = schema

    @property
    def _func_name(self) -> str:
        """Root tag looked up in the parsed output: the schema's class name."""
        return self.schema.__name__

    def _resolve_list(self, value: list, field_type: Any) -> Any:
        """Collapse one list-valued field according to its declared type."""
        # `type_` can be a typing construct rather than a class; issubclass()
        # raises TypeError on those, so check that it is a real class first.
        if isinstance(field_type, type) and issubclass(field_type, (BaseModel, dict)):
            return self._merge_schema(value, field_type)
        # A list made only of {"item": x} wrappers becomes the plain list [x, ...].
        if all(isinstance(item, dict) and item.keys() == {"item"} for item in value):
            return [next(iter(item.values())) for item in value]
        return value

    def _merge_schema(self, parsed_output: Any, schema: Type[BaseModel]):
        """Flatten ``parsed_output`` (a dict or a list of dicts) into one dict.

        Values that are neither a dict nor a list are returned unchanged. When
        the same key occurs more than once, its values are gathered in a list.
        """
        if isinstance(parsed_output, dict):
            items = parsed_output.items()
        elif isinstance(parsed_output, list):
            items = [(k, v) for item in parsed_output for k, v in item.items()]
        else:
            return parsed_output

        # A plain `dict` type reached through recursion has no `__fields__`.
        fields = getattr(schema, "__fields__", {})
        merged_output = {}
        for key, value in items:
            if key in fields and isinstance(value, list):
                result = self._resolve_list(value, fields[key].type_)
            else:
                result = value

            if key not in merged_output:
                merged_output[key] = result
            elif isinstance(merged_output[key], list):
                merged_output[key].append(result)
            else:
                merged_output[key] = [merged_output[key], result]

        return merged_output

    def __call__(self, parsed_output: dict) -> Dict[str, Any]:
        """Return ``parsed_output`` with the schema-named entry normalised.

        Output lacking the schema's root tag is passed through untouched.
        """
        if self._func_name not in parsed_output:
            return parsed_output
        return {
            self._func_name: self._merge_schema(
                parsed_output[self._func_name], self.schema
            )
        }


def try_parse(llm_output, config):
    """Parse the model's XML reply into the ticket payload, tolerating failures.

    Returns ``{"output": <value under "GenerateTicket">}`` when parsing works.
    On any exception the raw ``llm_output`` is returned under ``"output"``
    together with the stringified exception under ``"error"``.
    """
    try:
        xml_to_ticket = XMLOutputParser() | MergeSchema(task.schema)
        ticket = xml_to_ticket.invoke(llm_output, config).get("GenerateTicket")
    except Exception as exc:
        return {"output": llm_output, "error": str(exc)}
    # Wrap as 'output' so the result is unified for the evaluators
    return {"output": ticket}


# Pipeline: format the dialogue -> fill the Claude prompt -> call the model -> parse its reply.
claude_extraction_chain = format_run | prompt | claude | try_parse

In [None]:
# Smoke-test the chain on one hand-written question/answer pair before the full run.
result = claude_extraction_chain.invoke(
    {"question": "how do i run llama 2 locally?", "answer": "Llama.cpp of course."}
)
result

In [None]:
# Benchmark the Claude chain (JSON schema in the prompt, XML in the reply) on
# the same dataset with the same evaluators.
claude_test_run = client.run_on_dataset(
    dataset_name=task.name,
    llm_or_chain_factory=claude_extraction_chain,
    evaluation=eval_config,
    verbose=True,
    project_name=f"claude-2-json-schema-to-xml-{uid}",
    project_metadata={
        "arch": "claude-json-schema-xml-output",
    },
)

So it looks like edit distance is pretty good, but the schema validation leaves something to be desired.

We're defining the schema in JSON then requesting XML. Let's try keeping it unified.

## Try with XSD Schema Definition

In this variant, let's see if Claude performs better if we keep our structure consistent.

In [None]:
from typing import Any, Dict, Type

from langchain.chat_models import ChatAnthropic
from langchain.output_parsers.xml import XMLOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel

# This is the schema the model will populate.
# Range limits on integer fields are expressed as an inline simpleType with a
# restriction: XSD does not allow facets such as minInclusive/maxInclusive to be
# placed directly inside an xs:element.
xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">

    <xs:simpleType name="QuestionCategory">
        <xs:restriction base="xs:string">
            <xs:enumeration value="Implementation Issues"/>
            <xs:enumeration value="Feature Requests"/>
            <xs:enumeration value="Concept Explanations"/>
            <xs:enumeration value="Code Optimization"/>
            <xs:enumeration value="Security and Privacy Concerns"/>
            <xs:enumeration value="Model Training and Fine-tuning"/>
            <xs:enumeration value="Data Handling and Manipulation"/>
            <xs:enumeration value="User Interaction Flow"/>
            <xs:enumeration value="Technical Integration"/>
            <xs:enumeration value="Error Handling and Logging"/>
            <xs:enumeration value="Customization and Configuration"/>
            <xs:enumeration value="External API and Data Source Integration"/>
            <xs:enumeration value="Language and Localization"/>
            <xs:enumeration value="Streaming and Real-time Processing"/>
            <xs:enumeration value="Tool Development"/>
            <xs:enumeration value="Function Calling"/>
            <xs:enumeration value="LLM Integrations"/>
            <xs:enumeration value="General Agent Questions"/>
            <xs:enumeration value="General Chit Chat"/>
            <xs:enumeration value="Memory"/>
            <xs:enumeration value="Debugging Help"/>
            <xs:enumeration value="Application Design"/>
            <xs:enumeration value="Prompt Templates"/>
            <xs:enumeration value="Cost Tracking"/>
            <xs:enumeration value="Other"/>
        </xs:restriction>
    </xs:simpleType>

    <xs:simpleType name="Sentiment">
        <xs:restriction base="xs:string">
            <xs:enumeration value="Negative"/>
            <xs:enumeration value="Neutral"/>
            <xs:enumeration value="Positive"/>
        </xs:restriction>
    </xs:simpleType>

    <xs:simpleType name="ProgrammingLanguage">
        <xs:restriction base="xs:string">
            <xs:enumeration value="python"/>
            <xs:enumeration value="javascript"/>
            <xs:enumeration value="typescript"/>
            <xs:enumeration value="unknown"/>
            <xs:enumeration value="other"/>
        </xs:restriction>
    </xs:simpleType>

    <xs:complexType name="QuestionCategorization">
        <xs:sequence>
            <xs:element name="question_category" type="QuestionCategory"/>
            <xs:element name="category_if_other" type="xs:string" minOccurs="0"/>
            <xs:element name="is_off_topic" type="xs:boolean"/>
            <xs:element name="toxicity">
                <xs:simpleType>
                    <xs:restriction base="xs:int">
                        <xs:minInclusive value="0"/>
                        <xs:maxInclusive value="5"/>
                    </xs:restriction>
                </xs:simpleType>
            </xs:element>
            <xs:element name="sentiment" type="Sentiment"/>
            <xs:element name="programming_language" type="ProgrammingLanguage"/>
        </xs:sequence>
    </xs:complexType>

    <xs:simpleType name="ResponseType">
        <xs:restriction base="xs:string">
            <xs:enumeration value="resolve issue"/>
            <xs:enumeration value="provide guidance"/>
            <xs:enumeration value="request information"/>
            <xs:enumeration value="give up"/>
            <xs:enumeration value="none"/>
            <xs:enumeration value="other"/>
        </xs:restriction>
    </xs:simpleType>

    <xs:complexType name="ResponseCategorization">
        <xs:sequence>
            <xs:element name="response_type" type="ResponseType"/>
            <xs:element name="response_type_if_other" type="xs:string" minOccurs="0"/>
            <xs:element name="confidence_level">
                <xs:simpleType>
                    <xs:restriction base="xs:int">
                        <xs:minInclusive value="0"/>
                        <xs:maxInclusive value="5"/>
                    </xs:restriction>
                </xs:simpleType>
            </xs:element>
            <xs:element name="followup_actions" type="xs:string" minOccurs="0" maxOccurs="unbounded"/>
        </xs:sequence>
    </xs:complexType>

    <xs:complexType name="GenerateTicket">
        <xs:sequence>
            <xs:element name="issue_summary" type="xs:string"/>
            <xs:element name="question" type="QuestionCategorization"/>
            <xs:element name="response" type="ResponseCategorization"/>
        </xs:sequence>
    </xs:complexType>

</xs:schema>"""

# Rebind `prompt` so the schema shown to the model is the XSD text instead of JSON schema.
prompt = claude_prompt.partial(schema=xsd, function_call=task.schema.schema()["title"])

# Rebuild the chain (same name as before) so that it uses the XSD prompt.
claude_extraction_chain = format_run | prompt | claude | try_parse

In [None]:
# Smoke-test the XSD-prompted chain on one hand-written question/answer pair.
result = claude_extraction_chain.invoke(
    {
        "question": "how do i run llama 2 locally?",
        "answer": "Llama.cpp of course. Afterwords remember to install it, then add it to your path!",
    }
)
result

In [None]:
# Benchmark the XSD-prompted Claude chain on the same dataset and evaluators.
claude_xsd_test_run = client.run_on_dataset(
    dataset_name=task.name,
    llm_or_chain_factory=claude_extraction_chain,
    evaluation=eval_config,
    verbose=True,
    project_name=f"claude-2-xsd-to-xml-{uid}",
    project_metadata={
        "arch": "claude-xml",
    },
)

The json schema metric went down, meaning that the output counter-intuitively is less friendly to our parser than before.


Let's try with an open source model: `llama-v2-34b-code-instruct`.

## Try with Llama 2

`llama-v2-34b-code-instruct` is an open source model that is meant to be good at both code-gen and other tasks.
Let's benchmark it.

In [None]:
import json

from langchain.chat_models import ChatFireworks
from langchain.output_parsers.json import parse_json_markdown

# JSON counterpart of the Claude prompt: shows the JSON schema and asks for a
# JSON object keyed by `{function_call}`. The doubled braces are literal braces
# in the template.
llama_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a data extraction bot tasked with extracting and inferring information from dialogues and generating tickets. Always respond only with json based on the following JSON schema:\n{schema}",
        ),
        (
            "user",
            "Generate a ticket from the following question-response pair:\n"
            "<Dialogue>\n{dialogue}\n</Dialogue>\n"
            "Remember, respond directly with this format:\n"
            '{{"{function_call}": ...}}\n'
            "RESPOND ONLY IN JSON THEN STOP.",
        ),
    ]
)

# Rebinds the notebook-level `prompt` name used by the earlier Claude cells.
prompt = llama_prompt.partial(
    schema=task.schema.schema_json(), function_call=task.schema.schema()["title"]
)

# Rebinds `llm`, which an earlier cell assigned to the NuExtract model.
llm = ChatFireworks(
    model="accounts/fireworks/models/llama-v2-34b-code-instruct",
    temperature=0,
    model_kwargs={"max_tokens": 4000},
)


def parse_output(ai_message):
    """Turn the model's JSON reply into the ``{"output": ...}`` shape used by the evaluators.

    Args:
        ai_message: Message object whose ``content`` attribute holds the reply text.

    Returns:
        ``{"output": ticket}`` where ``ticket`` is the value under
        ``"GenerateTicket"`` when the reply is a JSON object with that key,
        otherwise the parsed JSON value itself. When the reply cannot be decoded
        as JSON, the raw text is returned under ``"output"``.
    """
    content = ai_message.content

    def lenient_loads(text):
        # strict=False lets raw control characters (e.g. newlines) appear inside JSON strings.
        return json.loads(text, strict=False)

    try:
        parsed = parse_json_markdown(content, parser=lenient_loads)
    except json.JSONDecodeError:
        return {"output": content}

    # Only a JSON object can carry the wrapper key; a bare list, string, number
    # or null must not be probed with `in` / indexed by the key.
    if isinstance(parsed, dict) and "GenerateTicket" in parsed:
        return {"output": parsed["GenerateTicket"]}
    return {"output": parsed}


# Pipeline: format the dialogue -> fill the Llama prompt -> call the model -> parse its JSON reply.
fireworks_extraction_chain = format_run | prompt | llm | parse_output

In [None]:
# Smoke-test the Fireworks chain on one hand-written question/answer pair.
result = fireworks_extraction_chain.invoke(
    {"question": "how do i run llama 2 locally?", "answer": "Llama.cpp of course."}
)
result

In [None]:
# Benchmark the Fireworks-hosted Llama chain on the same dataset and evaluators.
llama_v2_test_run = client.run_on_dataset(
    dataset_name=task.name,
    llm_or_chain_factory=fireworks_extraction_chain,
    evaluation=eval_config,
    verbose=True,
    project_name=f"llama-v2-34b-code-instruct-{uid}",
    project_metadata={
        # This chain is prompted with a JSON schema and replies in JSON, so it
        # must not be filed under the Claude XML architecture label.
        "arch": "llama-json-schema-json-output",
        "model": "llama-v2-34b-code-instruct",
    },
)

## Compare Results

Here, we'll take a look at the underlying results a little bit. You can review the results to see relative performance in aggregate and on a per-example basis.

In [None]:
# Per-example results of all four runs side by side. The first frame keeps its
# column names; each joined frame's clashing columns get the given suffix.
df = (
    test_run.to_dataframe()
    .join(claude_test_run.to_dataframe(), rsuffix="_claude")
    .join(claude_xsd_test_run.to_dataframe(), rsuffix="_claude_xsd")
    .join(llama_v2_test_run.to_dataframe(), rsuffix="_llama_v2")
)

In [None]:
df.head(5)

#### Here, we compare the aggregate metrics side-by-side

In [None]:
# Aggregate feedback of all four runs, one column per (metric, run).
# Every frame is suffixed explicitly: `DataFrame.join` applies `rsuffix` only to
# column names that clash, so relying on it leaves some runs' columns unlabelled.
# The first run is the NuExtract chain, hence ".nuextract". Suffixes must not
# contain "." because the metric name is later recovered with `rsplit(".", 1)`.
df = (
    test_run.get_aggregate_feedback()
    .add_suffix(".nuextract")
    .join(claude_test_run.get_aggregate_feedback().add_suffix(".claude"))
    .join(claude_xsd_test_run.get_aggregate_feedback().add_suffix(".claude_xsd"))
    .join(llama_v2_test_run.get_aggregate_feedback().add_suffix(".llama_v2"))
)

In [None]:
from IPython.display import HTML, display

# Sorted, de-duplicated prefixes of the "feedback." columns: everything before
# the last "." of each column name (i.e. the name with its final segment removed).
feedback_columns = sorted(
    {col.rsplit(".", 1)[0] for col in df.columns if col.startswith("feedback.")}
)


def render_metric(df, metric):
    """Display the ``mean`` and ``std`` rows for every column starting with ``metric``.

    An ``<h3>`` heading made from the last dot-separated part of ``metric`` is
    shown first, followed by the selected slice of ``df``.
    """
    metric_columns = list(filter(lambda name: name.startswith(metric), df.columns))
    heading = metric.split(".")[-1]
    display(HTML(f"<h3>{heading}</h3>"))
    is_summary_row = df.index.isin(["mean", "std"])
    display(df[metric_columns][is_summary_row])

In [None]:
feedback_columns

In [None]:
# Execution-time columns do not start with "feedback.", so they are rendered separately.
render_metric(df, "execution_time")

In [None]:
# One heading + mean/std table per feedback metric.
for metric in feedback_columns:
    render_metric(df, metric)

## Next Steps

Try it out yourself! You can see some additional experiments on Open Source models in [this repo](https://github.com/hinthornw/llama-extraction).

In [None]:
from transformers import pipeline

model_name = "numind/NuExtract-1.5"
# NOTE(review): despite the `summarizer` name this builds a "text-generation"
# pipeline — confirm that is the intended task for this model.
summarizer = pipeline("text-generation", model=model_name)




# Define a function to run the evaluation
def evaluate_model(data, summarizer, limit=10):
    """Run ``summarizer`` over the first ``limit`` dialogues in ``data``.

    Args:
        data: DataFrame with ``question`` and ``answer`` columns.
        summarizer: Callable invoked as
            ``summarizer(text, max_length=150, min_length=30, do_sample=False)``
            that returns a sequence whose first item is a dict containing either
            ``"summary_text"`` or ``"generated_text"``.
        limit: Maximum number of rows (taken from the top of ``data``) to process.

    Returns:
        List with one generated string per processed row, in row order.

    Raises:
        KeyError: If a result dict has neither ``"summary_text"`` nor
            ``"generated_text"``, or if ``data`` lacks a required column.
    """
    results = []
    # Select rows by position; comparing index labels with an integer fails on
    # frames whose index is not numeric.
    for _, row in data.head(limit).iterrows():
        dialogue = f"<question>\n{row['question']}\n</question>\n<assistant-response>\n{row['answer']}\n</assistant-response>"
        summary = summarizer(dialogue, max_length=150, min_length=30, do_sample=False)
        first = summary[0]
        # Summarization pipelines answer under "summary_text", text-generation
        # pipelines under "generated_text"; accept whichever is present.
        text_key = "summary_text" if "summary_text" in first else "generated_text"
        results.append(first[text_key])
    return results

# Run the evaluation on the dataset's raw question/answer inputs. The `df` built
# earlier in this notebook holds aggregate feedback statistics, not dialogues,
# so it has no "question"/"answer" columns to read.
import pandas as pd

examples_df = pd.DataFrame(
    [example.inputs for example in client.list_examples(dataset_name=task.name)]
)
evaluation_results = evaluate_model(examples_df, summarizer)
evaluation_results

In [None]:
evaluation_results