## Extraction

Extraction is similar to tagging, but is used to extract multiple pieces of information from the text.

In [3]:
import os
import dotenv

# Locate a .env file and load its variables into the process environment.
# Presumably this supplies the OpenAI credentials that ChatOpenAI relies on later — TODO confirm.
dotenv_path = dotenv.find_dotenv()
# Last expression of the cell, so its return value is what the notebook displays.
dotenv.load_dotenv(dotenv_path)

True

In [4]:
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

In [5]:
from typing import Optional
# NOTE: the class docstring and every Field description below are not only
# documentation: they are copied verbatim into the OpenAI function schema
# (see the 'description' entries in the converted output), so rewording them
# changes what the model is told.
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    # Optional[int] lets the value be null when the age is unknown. No default is
    # set, so the generated schema still lists 'age' under 'required'.
    age: Optional[int] = Field(description="person's age")

In [6]:
# Top-level schema exposed to the model as the "Information" function.
# The docstring becomes the function's 'description' in the generated schema,
# so it is effectively part of the prompt.
class Information(BaseModel):
    """Information to extract."""
    # A list of Person entries, so one function call can carry several people.
    people: List[Person] = Field(description="List of info about people")

In [7]:
# Display the OpenAI function schema generated from the Information model;
# the nested Person definition shows up inlined under 'people' -> 'items'.
# NOTE(review): the truncated `warn_deprecated(` output suggests this helper is
# deprecated in the installed LangChain version — confirm before upgrading.
convert_pydantic_to_openai_function(Information)

  warn_deprecated(


{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'$defs': {'Person': {'description': 'Information about a person.',
    'properties': {'name': {'description': "person's name", 'type': 'string'},
     'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
      'description': "person's age"}},
    'required': ['name', 'age'],
    'type': 'object'}},
  'properties': {'people': {'description': 'List of info about people',
    'items': {'description': 'Information about a person.',
     'properties': {'name': {'description': "person's name", 'type': 'string'},
      'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
       'description': "person's age"}},
     'required': ['name', 'age'],
     'type': 'object'},
    'type': 'array'}},
  'required': ['people'],
  'type': 'object'}}

In [9]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
# temperature=0 asks for the least random sampling, keeping repeated runs as
# consistent as the API allows.
model = ChatOpenAI(temperature=0)

# Expose the Information schema as the only callable function ...
extraction_functions = [convert_pydantic_to_openai_function(Information)]

# ... and name it in `function_call` so the reply comes back as that function's
# arguments rather than as free-form text.
extraction_model = model.bind(
    functions=extraction_functions,
    function_call={"name": "Information"},
)

  warn_deprecated(


In [10]:
# Call the function-bound model directly with a raw string (no prompt template yet).
# The structured result is returned in additional_kwargs['function_call'];
# `content` is empty.
extraction_model.invoke("Joe is 30, his mom is Martha")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people":[{"name":"Joe","age":30},{"name":"Martha","age":null}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 95, 'total_tokens': 116}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_3bc1b5746c', 'finish_reason': 'stop', 'logprobs': None})

In [11]:
# Two-message chat template: the system message sets the extraction rules
# (no guessing, partial results are acceptable) and the human message is
# filled from the `input` key supplied at invoke time.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ("human", "{input}")
])

In [12]:
# Pipe the prompt template into the function-bound model. The chain is invoked
# with a dict whose `input` key fills the template's {input} placeholder.
extraction_chain = prompt | extraction_model
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people":[{"name":"Joe","age":30},{"name":"Martha","age":null}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 112, 'total_tokens': 133}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_3bc1b5746c', 'finish_reason': 'stop', 'logprobs': None})

In [14]:
# append an output parser so the chain returns the function-call arguments
# (the extracted people) as a plain Python dict instead of an AIMessage
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]}

In [15]:
# use JsonKeyOutputFunctionsParser to return only the value stored under the
# "people" key, i.e. the list of extracted person dicts
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]