# Extraction

Following: https://python.langchain.com/v0.2/docs/tutorials/extraction/

In [7]:
from typing import Optional

from langchain_core.pydantic_v1 import BaseModel, Field

from enum import Enum

class Unit(Enum):
    """Known units for height."""

    # Each member's value is the short symbol used for that unit.
    # Metric units.
    METERS = "m"
    CENTIMETERS = "cm"
    # Imperial units.
    FEET = "ft"
    INCHES = "in"


class Height(BaseModel):
    """Height in numerical value and unit"""

    # NOTE(review): the docstring above is deliberately left untouched; it is
    # presumably forwarded to the LLM as this schema's description, like the
    # one on Person -- confirm before rewording it.

    # Both fields are required: `...` marks a field without a default.
    value: float = Field(..., description="The numerical value of the height")
    # The allowed units are derived from `Unit` rather than a hard-coded
    # subset, so the schema hint agrees with the unit list that the system
    # prompt gives the model (m, cm, ft, in).
    unit: str = Field(
        ...,
        description="The unit of the height",
        enum=[member.value for member in Unit],
    )


class Person(BaseModel):
    """Information about a person."""

    # The doc-string above doubles as the description of the Person schema
    # that is sent to the LLM, so its wording can affect extraction quality.
    #
    # Field conventions:
    #   * every field is Optional with a None default, which lets the model
    #     decline to extract a value;
    #   * every field carries a `description`, which is used by the LLM --
    #     a good description can help improve extraction results.
    name: Optional[str] = Field(None, description="The name of the person")
    hair_color: Optional[str] = Field(
        None,
        description="The color of the person's hair if known",
    )
    height: Optional[Height] = Field(
        None,
        description="Information about the height if given by the user",
    )

In [20]:
# Collect the short symbol of every Unit member, in declaration order,
# and show the resulting list.
unit_values = [member.value for member in Unit]
print(unit_values)

['m', 'cm', 'ft', 'in']


In [22]:
from langchain_core.prompts import ChatPromptTemplate

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
#
# The system message is assembled from adjacent string literals, so every
# fragment except the last must end with a space to keep sentences apart.
# The known height units are interpolated here, when the prompt is built;
# only `{text}` remains a template variable to be filled in at invoke time.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value. "
            "Respect the following schema for the height unit and parse "
            f"the user's input when necessary: {unit_values}",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

In [23]:
from langchain_experimental.llms.ollama_functions import OllamaFunctions

In [24]:
# Local llama3 model served through Ollama, asked to answer in JSON.
llm = OllamaFunctions(model="llama3", format="json")
# Bind the Person schema to the model, then feed it from the prompt.
structured_llm = llm.with_structured_output(schema=Person)
runnable = prompt | structured_llm

In [25]:
# Sample sentence to extract a Person from.
text = "Alan Smith is 6 feet tall and has blond hair."
# The key must match the `{text}` placeholder of the prompt's human message.
payload = {"text": text}
# Kept as the last expression so the notebook displays the result.
runnable.invoke(payload)

Person(name='Alan Smith', hair_color='blond', height=Height(value=1.83, unit='m'))