# Tagging and Extraction Using OpenAI functions

In [28]:
# NOTE: run this notebook inside the virtual environment named LCEL_extracting.

import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
_ = load_dotenv(find_dotenv()) 
# Direct indexing raises KeyError right here if the key is not set.
OPENAI_API_KEY=os.environ["OPENAI_API_KEY"]


In [29]:
# Record the installed versions of the langchain, openai and pydantic packages.
!pip freeze | grep langchain

!pip freeze | grep openai

!pip freeze | grep pydantic


langchain==0.2.15
langchain-community==0.2.13
langchain-core==0.2.41
langchain-openai==0.1.23
langchain-text-splitters==0.2.4
langchain-openai==0.1.23
openai==1.47.0
openapi-schema-pydantic==1.2.4
pydantic==2.9.2
pydantic-settings==2.5.2
pydantic_core==2.23.4


## Extraction

Extraction is similar to tagging, but used for extracting multiple pieces of information.

## Doing it for real

We can apply tagging to a larger body of text.

For example, let's load this blog post and extract tag information from a subset of the text.

In [30]:
from langchain_openai import ChatOpenAI
from typing import List, Optional
from pydantic import BaseModel, Field
#from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# temperature=0 requests the least random sampling; no model name is passed,
# so the library's default chat model is used.
model = ChatOpenAI(temperature=0)

In [31]:
# WebBaseLoader lives in langchain_community; the langchain.document_loaders
# path is a deprecated re-export.
from langchain_community.document_loaders import WebBaseLoader

# Fetch the blog post over HTTP and load it as a list of Document objects.
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

In [32]:
# Work with the first Document returned by the loader.
doc = documents[0]

In [33]:
# Keep only the first 10,000 characters of the page text for the single-call examples.
page_content = doc.page_content[:10000]

In [34]:
class Overview(BaseModel):
    """Overview of a section of text."""
    # The class docstring and each Field description are sent to the model as
    # part of the function schema, so they double as prompt text.
    summary: str = Field(description="Provide a concise summary of the content.")
    # NOTE(review): the field name is misspelled ("langugae"). It is kept as-is
    # because it is the key of the returned dict; rename it to `language`
    # together with any consumers of that key.
    langugae: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")

In [35]:
from langchain.prompts import ChatPromptTemplate

# Two-message chat prompt: fixed extraction instructions, then the text to analyse.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Extract the relevant information, if not explicitly provided do not guess. Extract partial info",
        ),
        ("human", "{input}"),
    ]
)

In [36]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

# Expose Overview as the only callable function and force the model to call it,
# so every response carries arguments shaped by the Overview schema.
overview_tagging_function = [convert_to_openai_function(Overview)]
tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name": "Overview"},
)
# JsonOutputFunctionsParser returns the whole arguments dict and has no
# `key_name` option (that belongs to JsonKeyOutputFunctionsParser), so the
# stray key_name="overview" argument is not passed here.
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()
tagging_chain.invoke({"input": page_content})

{'summary': 'The article discusses building autonomous agents powered by LLM (large language model) with components like planning, memory, and tool use. It also covers techniques like task decomposition and self-reflection in agent systems.',
 'langugae': 'English',
 'keywords': 'LLM, autonomous agents, planning, memory, tool use, task decomposition, self-reflection'}

Now let's try to extract all papers mentioned in this article.

In [37]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    # Under pydantic v2, Optional[str] without a default is still a required
    # field. The explicit None default makes the author optional, so the model
    # is not forced to supply one when the text does not name it.
    author: Optional[str] = None


class Info(BaseModel):
    """Information to extract"""
    # Every paper found in the input text. "papers" is also the key that the
    # extraction chains select with JsonKeyOutputFunctionsParser(key_name="papers").
    papers: List[Paper]

In [38]:
# Offer the Info schema as the single function and force the model to call it.
paper_extraction_function = [
    convert_to_openai_function(Info),
]
extraction_model = model.bind(
    functions=paper_extraction_function,
    function_call={"name": "Info"},
)

In [39]:
# First attempt: reuse the generic tagging prompt and return only the "papers" list.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [40]:
# With the generic prompt, the recorded result lists the article itself as a paper.
extraction_chain.invoke({"input": page_content})

[{'title': 'LLM Powered Autonomous Agents', 'author': 'Lilian Weng'}]

In [41]:
# System instructions written specifically for paper extraction: exclude the
# article itself, allow an empty result, and forbid invented details.
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

# NOTE: this rebinds the notebook-level name `prompt`, which the earlier
# tagging cells also use; re-running those cells afterwards picks up this prompt.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [42]:
# Rebuild the chain so it uses the current value of `prompt` (the paper-specific one).
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [43]:
# Run the paper extraction on the truncated article text.
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain of thought (CoT; Wei et al. 2022)', 'author': None},
 {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': None},
 {'title': 'LLM+P (Liu et al. 2023)', 'author': None},
 {'title': 'ReAct (Yao et al. 2023)', 'author': None},
 {'title': 'Reflexion (Shinn & Labash 2023)', 'author': None},
 {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)', 'author': None},
 {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)', 'author': None}]

In [44]:
# Sanity check: an input that mentions no papers should produce an empty list.
extraction_chain.invoke({"input": "hi"})

[]

In [45]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# chunk_overlap=0: consecutive chunks share no text. No chunk size is passed,
# so the splitter's default applies.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [46]:
# Split the full (untruncated) article text into chunks.
splits = text_splitter.split_text(doc.page_content)

In [47]:
# Number of chunks the article was split into.
len(splits)

15

In [48]:
from langchain.schema.runnable import RunnableLambda

In [49]:
def flatten(matrix):
    """Concatenate the items of every row in ``matrix`` into one new list.

    ``matrix`` is an iterable of iterables; rows are visited in order and an
    empty ``matrix`` yields an empty list.
    """
    return [item for row in matrix for item in row]

In [50]:
# Pipeline: split the raw text into chunks, run the extraction chain on every
# chunk, then merge the per-chunk lists into a single list.
chain = (
    RunnableLambda(
        lambda x: [{"input": chunk} for chunk in text_splitter.split_text(x)]
    )
    | extraction_chain.map()
    | flatten
)

In [51]:
# Run the chunked extraction over the entire article text.
chain.invoke(doc.page_content)

[{'title': 'AutoGPT', 'author': None},
 {'title': 'GPT-Engineer', 'author': None},
 {'title': 'BabyAGI', 'author': None},
 {'title': 'Chain of thought', 'author': 'Wei et al. 2022'},
 {'title': 'Tree of Thoughts', 'author': 'Yao et al. 2023'},
 {'title': 'LLM+P', 'author': 'Liu et al. 2023'},
 {'title': 'ReAct', 'author': 'Yao et al. 2023'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash 2023'},
 {'title': 'Chain of Hindsight', 'author': 'Liu et al. 2023'},
 {'title': 'Algorithm Distillation', 'author': 'Laskin et al. 2023'},
 {'title': 'Laskin et al. 2023', 'author': None},
 {'title': 'Miller 1956', 'author': None},
 {'title': 'Duan et al. 2017', 'author': None},
 {'title': 'Google Blog', 'author': None},
 {'title': 'MRKL (Karpas et al. 2022)', 'author': None},
 {'title': 'TALM (Tool Augmented Language Models; Parisi et al. 2022)',
  'author': None},
 {'title': 'Toolformer (Schick et al. 2023)', 'author': None},
 {'title': 'HuggingGPT (Shen et al. 2023)', 'author': None},
 {'title'