# Tagging and Extraction Using OpenAI functions

In [96]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [97]:
from typing import List
from pydantic import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_function

In [182]:
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    language: str = Field(description="language of text (should be in full text. Ex: English, Vietnamese, etc.)")

In [183]:
convert_to_openai_function(Tagging)

{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'parameters': {'properties': {'sentiment': {'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'description': 'language of text (should be in full text. Ex: English, Vietnamese, etc.)',
    'type': 'string'}},
  'required': ['sentiment', 'language'],
  'type': 'object'}}

In [19]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_fireworks import ChatFireworks
from langchain_groq import ChatGroq


In [115]:
model = ChatGroq(model_name="llama3-groq-70b-8192-tool-use-preview", temperature=0)
# model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [186]:
tagging_functions = [convert_to_openai_function(Tagging)]

In [20]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

In [188]:
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}
)

In [189]:
tagging_chain = prompt | model_with_functions

In [190]:
tagging_chain.invoke({"input": "I love langchain"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"language": "English", "sentiment": "pos"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 286, 'total_tokens': 304, 'completion_time': 0.054920107, 'prompt_time': 0.021580864, 'queue_time': 0.012295988999999997, 'total_time': 0.076500971}, 'model_name': 'llama3-groq-70b-8192-tool-use-preview', 'system_fingerprint': 'fp_ee4b521143', 'finish_reason': 'function_call', 'logprobs': None}, id='run-fbad4417-76d6-454f-b7de-a950cc52811f-0', usage_metadata={'input_tokens': 286, 'output_tokens': 18, 'total_tokens': 304})

In [191]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"language": "Italian", "sentiment": "neg"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 289, 'total_tokens': 307, 'completion_time': 0.055076577, 'prompt_time': 0.021939113, 'queue_time': 0.011883937, 'total_time': 0.07701569}, 'model_name': 'llama3-groq-70b-8192-tool-use-preview', 'system_fingerprint': 'fp_ee4b521143', 'finish_reason': 'function_call', 'logprobs': None}, id='run-4c0e37b4-2b6e-4d6c-a1c8-59f61b58f708-0', usage_metadata={'input_tokens': 289, 'output_tokens': 18, 'total_tokens': 307})

In [35]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

In [193]:
tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()

In [194]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

{'language': 'Italian', 'sentiment': 'neg'}

# Extraction

Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [36]:
from typing import Optional
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    age: Optional[int] = Field(description="person's age")

In [8]:
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

In [9]:
convert_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'properties': {'people': {'description': 'List of info about people',
    'items': {'description': 'Information about a person.',
     'properties': {'name': {'description': "person's name", 'type': 'string'},
      'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
       'description': "person's age"}},
     'required': ['name', 'age'],
     'type': 'object'},
    'type': 'array'}},
  'required': ['people'],
  'type': 'object'}}

In [28]:
extraction_functions = [convert_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

In [248]:
extraction_model.invoke("Joe is 30, his mom is Martha")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people": [{"name": "Joe", "age": 30}, {"name": "Martha", "age": null}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 332, 'total_tokens': 365, 'completion_time': 0.103545872, 'prompt_time': 0.02424242, 'queue_time': 0.008692686999999998, 'total_time': 0.127788292}, 'model_name': 'llama3-groq-70b-8192-tool-use-preview', 'system_fingerprint': 'fp_ee4b521143', 'finish_reason': 'function_call', 'logprobs': None}, id='run-bf32d4e0-2fdf-4f6c-9746-49b52ea6fe5d-0', usage_metadata={'input_tokens': 332, 'output_tokens': 33, 'total_tokens': 365})

In [29]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ("human", "{input}")
])

In [30]:
extraction_chain = prompt | extraction_model

In [251]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people": [{"name": "Joe", "age": 30}, {"name": "Martha", "age": null}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 348, 'total_tokens': 381, 'completion_time': 0.106709245, 'prompt_time': 0.041347017, 'queue_time': 0.0012432009999999993, 'total_time': 0.148056262}, 'model_name': 'llama3-groq-70b-8192-tool-use-preview', 'system_fingerprint': 'fp_ee4b521143', 'finish_reason': 'function_call', 'logprobs': None}, id='run-81f33595-b577-416b-8b90-0e30051ab34e-0', usage_metadata={'input_tokens': 348, 'output_tokens': 33, 'total_tokens': 381})

In [203]:
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [204]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]}

In [37]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [209]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

In [211]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]

# Doing it for real

We can apply tagging to a larger body of text.

For example, let's load this blog post and extract tag information from a sub-set of the text.

In [90]:
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://www.promptingguide.ai/techniques/cot")
documents = loader.load()

In [91]:
doc = documents[0]

In [92]:
page_content = doc.page_content[:10000]

In [93]:
print(page_content[:1000])

Chain-of-Thought Prompting | Prompt Engineering Guide Prompt Engineering Guide🎓 Prompt Engineering Course🎓 Prompt Engineering CourseServicesServicesAboutAboutGitHubGitHub (opens in a new tab)DiscordDiscord (opens in a new tab)Prompt EngineeringIntroductionLLM SettingsBasics of PromptingPrompt ElementsGeneral Tips for Designing PromptsExamples of PromptsTechniquesZero-shot PromptingFew-shot PromptingChain-of-Thought PromptingMeta PromptingSelf-ConsistencyGenerate Knowledge PromptingPrompt ChainingTree of ThoughtsRetrieval Augmented GenerationAutomatic Reasoning and Tool-useAutomatic Prompt EngineerActive-PromptDirectional Stimulus PromptingProgram-Aided Language ModelsReActReflexionMultimodal CoTGraph PromptingGuidesOptimizing PromptsApplicationsFine-tuning GPT-4oFunction CallingContext Caching with LLMsGenerating DataGenerating Synthetic Dataset for RAGTackling Generated Datasets DiversityGenerating CodeGraduate Job Classification Case StudyPrompt FunctionPrompt HubClassificationSentim

In [123]:
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")

In [124]:
# from langchain_groq import ChatGroq
# model = ChatGroq(model_name="llama3-groq-70b-8192-tool-use-preview", temperature=0)
overview_tagging_function = [
    convert_to_openai_function(Overview)
]
tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name":"Overview"}
)
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

open ai

In [100]:
tagging_chain.invoke({"input": page_content})

{'summary': 'The article discusses various techniques in prompt engineering, particularly focusing on Chain-of-Thought (CoT) prompting and its applications in reasoning tasks. It highlights the introduction of CoT prompting by Wei et al. (2022) and the concept of zero-shot CoT prompting by Kojima et al. (2022). Additionally, it mentions the work of Zhang et al. (2022) on Automatic Chain-of-Thought (Auto-CoT) prompting, which aims to automate the generation of reasoning chains for demonstrations.',
 'language': 'English',
 'keywords': 'Chain-of-Thought prompting, zero-shot CoT, Automatic Chain-of-Thought, Wei et al. (2022), Kojima et al. (2022), Zhang et al. (2022)'}

llama 70B

In [125]:
tagging_chain.invoke({"input": page_content})

{'keywords': 'Chain-of-Thought Prompting, Zero-shot COT Prompting, Automatic Chain-of-Thought, Auto-CoT, question clustering, demonstration sampling, reasoning chain, rationale, heuristics, language model, LLM, GPT-4, Wei et al. 2022, Zhang et al. 2022, DAIR.AI Academy, PROMPTING20, discount, students, suboptimal solutions, manual efforts, LLMs, reasoning chains, demonstrations, diversity, mistakes, clusters, representative question, heuristics, tokens, reasoning steps, rationale, simple, accurate, process, code, available, here',
 'language': 'English',
 'summary': 'This article discusses various techniques in prompt engineering, including Chain-of-Thought (CoT) Prompting, Zero-shot COT Prompting, and Automatic Chain-of-Thought (Auto-CoT). It also provides examples of prompts and techniques, such as question clustering and demonstration sampling, and discusses the importance of diversity and heuristics in generating reasoning chains. Additionally, it mentions the availability of code 

In [127]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    author: Optional[str]


class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [128]:
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an EMPTY LIST.

Do not make up or guess ANY extra information. Only extract what exactly is in the text.

if the input is irrelevant, return an empty list."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [129]:
extraction_functions = [convert_to_openai_function(Info)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name":"Info"})
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [130]:
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain-of-Thought Prompting', 'author': 'Wei et al. (2022)'},
 {'title': 'Zero-shot CoT Prompting', 'author': 'Kojima et al. (2022)'},
 {'title': 'Automatic Chain-of-Thought (Auto-CoT)',
  'author': 'Zhang et al. (2022)'}]

In [131]:
extraction_chain.invoke({"input": "hi"})

[]

Split into pieces of texts and pass to the LLM and combine all the results of the end

In [132]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [133]:
splits = text_splitter.split_text(doc.page_content)

In [134]:
len(splits)

2

In [135]:
# Function to flatten a 2D matrix into a 1D list
def flatten(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list

In [136]:
from langchain.schema.runnable import RunnableLambda

In [137]:
# This RunnableLambda prepares the input for extraction by splitting it into chunks
# and creating a list of dictionaries, each containing a chunk as the "input" value
prep = RunnableLambda(
    lambda x: [{"input": doc} for doc in text_splitter.split_text(x)]
)

In [138]:
prep.invoke("hi")

[{'input': 'hi'}]

In [139]:
chain = prep | extraction_chain.map() | flatten

In [140]:
chain.invoke(doc.page_content)

[{'title': 'Chain-of-Thought Prompting', 'author': 'Wei et al. (2022)'},
 {'title': 'Zero-shot CoT Prompting', 'author': 'Kojima et al. (2022)'},
 {'title': 'Automatic Chain-of-Thought (Auto-CoT)',
  'author': 'Zhang et al. (2022)'}]