# Tagging

#### Tagging means labeling a document with classes such as:

###### Sentiment.
  
###### Language.

###### Style (formal, informal, etc.).

###### Covered topics.

###### Political tendency.

In [1]:
import getpass
import os

from dotenv import dotenv_values

# Load the settings stored in the local ".env" file; `config` is queried with
# .get("OPENAI_API_KEY") further down.
config = dotenv_values(".env")

#### API reference for `create_tagging_chain_pydantic`: https://api.python.langchain.com/en/latest/chains/langchain.chains.openai_functions.tagging.create_tagging_chain_pydantic.html
#### API reference for `create_tagging_chain`: https://api.python.langchain.com/en/latest/chains/langchain.chains.openai_functions.tagging.create_tagging_chain.html

In [2]:
from langchain.chains import create_tagging_chain, create_tagging_chain_pydantic
from langchain_openai import ChatOpenAI

In [3]:
# Export the OpenAI key for the client. Prefer the value from .env, then an
# already-exported environment variable, and finally prompt for it without
# echoing. Assigning None to os.environ (key missing from .env) would raise
# TypeError, so a missing key must never reach the assignment.
os.environ["OPENAI_API_KEY"] = (
    config.get("OPENAI_API_KEY")
    or os.environ.get("OPENAI_API_KEY")
    or getpass.getpass("OPENAI_API_KEY: ")
)

In [53]:
# Minimal tagging schema: only the property names and their JSON types are
# declared; no allowed values, descriptions or required list.
schema = {
    "properties": dict(
        sentiment={"type": "string"},
        aggressiveness={"type": "integer"},
        language={"type": "string"},
    )
}

# Chat model at temperature 0; `llm` is reused by the chains built later in
# the notebook.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
chain = create_tagging_chain(schema, llm)

In [54]:
# Tag a friendly Spanish sentence with the current `chain`; the result is
# kept in `response` for display in the next cell.
inp = "Estoy increiblemente contento de haberte conocido! Creo que seremos muy buenos amigos! "
response = chain.run(inp)

In [55]:
# Display the tags stored in `response` by the previous cell.
response

{'language': 'Spanish'}

In [40]:
# Tag an angry Spanish sentence with the current `chain` and display the result.
inp = "Estoy muy enojado con vos! Te voy a dar tu merecido!"
response = chain.run(inp)
response

{'sentiment': 'enojado', 'aggressiveness': 3, 'language': 'Spanish'}

In [41]:
# Tag an angry German sentence with the current `chain` and display the result.
inp = "Ich bin sehr wütend auf dich! Ich werde dir geben, was du verdienst!"
response = chain.run(inp)
response

{'sentiment': 'wütend', 'aggressiveness': 3, 'language': 'German'}

In [42]:
# Tag an angry Russian sentence with the current `chain` and display the result.
inp = "Я очень зол на тебя! Я дам тебе то, что ты заслуживаешь!"
response = chain.run(inp)
response

{'sentiment': 'negative', 'aggressiveness': 3, 'language': 'Russian'}

### Finer control

Careful schema definition gives us more control over the model’s output.

Specifically, we can define:

- the possible values for each property
- a description of each property, so that the model understands what it means
- which properties are required in the output

In [48]:
# Stricter tagging schema: every property lists its allowed values, the
# aggressiveness scale carries a description, and all three are required.
schema = dict(
    properties=dict(
        aggressiveness=dict(
            type="integer",
            enum=[1, 2, 3, 4, 5],
            description="describes how aggressive the statement is, the higher the number the more aggressive",
        ),
        language=dict(
            type="string",
            enum=["spanish", "english", "french", "german", "italian"],
        ),
        sentiment=dict(
            type="string",
            enum=["positive", "negative", "neutral"],
        ),
    ),
    required=["language", "sentiment", "aggressiveness"],
)

In [49]:
# Rebind `chain` to a tagging chain built from whatever `schema` is currently
# defined, reusing the existing `llm`.
chain = create_tagging_chain(schema, llm)

In [50]:
# Tag the friendly Spanish sentence again with the current `chain` and
# display the result.
inp = "Estoy increiblemente contento de haberte conocido! Creo que seremos muy buenos amigos!"
response = chain.run(inp)
response

{'aggressiveness': 1, 'language': 'spanish', 'sentiment': 'positive'}

In [51]:
# Tag the angry Spanish sentence again with the current `chain` and display
# the result.
inp = "Estoy muy enojado con vos! Te voy a dar tu merecido!"
response = chain.run(inp)
response


{'aggressiveness': 5, 'language': 'spanish', 'sentiment': 'negative'}

In [52]:
# Tag a matter-of-fact English sentence with the current `chain` and display
# the result.
inp = "Weather is ok here, I can go outside without much more than a coat"
response = chain.run(inp)
response

{'aggressiveness': 1, 'language': 'english', 'sentiment': 'neutral'}

## Pydantic: https://docs.pydantic.dev/latest/

Pydantic is the most widely used data validation library for Python.

Fast and extensible, Pydantic plays nicely with your linters/IDE/brain. Define how data should be in pure, canonical Python 3.8+; validate it with Pydantic.

In [15]:
from langchain_core.pydantic_v1 import BaseModel, Field

In [16]:
# Model describing the tags to extract; it is handed to
# create_tagging_chain_pydantic in the following cell.
# NOTE(review): `enum=` is passed to Field as an extra keyword argument;
# presumably it only hints the allowed values in the generated schema and is
# not enforced on the parsed result — confirm.
class Tags(BaseModel):
    # Mood label; the hinted choices are "happy", "neutral" and "sad".
    sentiment: str = Field(..., enum=["happy", "neutral", "sad"])
    # Integer aggressiveness score; the hinted scale is 1 to 5, higher meaning
    # more aggressive (per the description string).
    aggressiveness: int = Field(
        ...,
        description="describes how aggressive the statement is, the higher the number the more aggressive",
        enum=[1, 2, 3, 4, 5],
    )
    # Language name; the hinted choices are lowercase English names.
    language: str = Field(
        ..., enum=["spanish", "english", "french", "german", "italian"]
    )

In [17]:
# Rebind `chain` to a tagging chain driven by the `Tags` model instead of a
# dict schema, reusing the existing `llm`.
chain = create_tagging_chain_pydantic(Tags, llm)

In [18]:
# Tag the angry Spanish sentence with the current `chain`; the result is kept
# in `res` (not `response`) and displayed.
inp = "Estoy muy enojado con vos! Te voy a dar tu merecido!"
res = chain.run(inp)
res

Tags(sentiment='sad', aggressiveness=5, language='spanish')

#### Document transformers overview: https://python.langchain.com/docs/integrations/document_transformers

#### OpenAI metadata tagger: https://python.langchain.com/docs/integrations/document_transformers/openai_metadata_tagger

In [19]:
from langchain_community.document_transformers.openai_functions import (
    create_metadata_tagger,
)
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI

In [20]:
# Metadata schema for movie reviews: title, critic and tone are required,
# while the star rating is declared but left out of the required list.
schema = dict(
    properties=dict(
        movie_title=dict(type="string"),
        critic=dict(type="string"),
        tone=dict(type="string", enum=["positive", "negative"]),
        rating=dict(
            type="integer",
            description="The number of stars the critic rated the movie",
        ),
    ),
    required=["movie_title", "critic", "tone"],
)



# Build the metadata tagger from whatever `schema` is currently defined and
# the existing `llm`.
document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)

In [26]:
# Three sample reviews to tag. The first has no metadata argument; the other
# two already carry a "reliable" flag.
original_documents = [
    Document(
        page_content="Review of The Bee Movie\nBy Roger Ebert\n\nThis is the greatest movie ever made. 4 out of 5 stars."
    ),
    Document(
        page_content="Review of The Godfather\nBy Anonymous\n\nThis movie was super boring. 1 out of 5 stars.",
        metadata={"reliable": False},
    ),
    # Deliberately ambiguous rating ("2 or 3").
    Document(
        page_content="Review of The Godfather\nBy testing\n\nThis movie was ok. 2 or 3 out of 5 stars.",
        metadata={"reliable": True},
    ),
]

# Run the tagger over the samples; the results are kept in `enhanced_documents`.
enhanced_documents = document_transformer.transform_documents(original_documents)

In [27]:
import json

# Print each document's text followed by its metadata as JSON, with a dashed
# rule between consecutive documents.
print(
    "\n\n---------------\n\n".join(
        f"{doc.page_content}\n\n{json.dumps(doc.metadata)}"
        for doc in enhanced_documents
    )
)

Review of The Bee Movie
By Roger Ebert

This is the greatest movie ever made. 4 out of 5 stars.

{"movie_title": "The Bee Movie", "critic": "Roger Ebert", "tone": "positive", "rating": 4}

---------------

Review of The Godfather
By Anonymous

This movie was super boring. 1 out of 5 stars.

{"movie_title": "The Godfather", "critic": "Anonymous", "tone": "negative", "rating": 1, "reliable": false}

---------------

Review of The Godfather
By testing

This movie was ok. 2 or 3 out of 5 stars.

{"movie_title": "The Godfather", "critic": "testing", "tone": "negative", "rating": 2, "reliable": true}


In [28]:
## pydantic


from typing import Literal

from pydantic import BaseModel, Field


# Pydantic version of the movie-review metadata schema (title, critic, tone,
# rating) that was written as a plain dict earlier in the notebook.
class Properties(BaseModel):
    movie_title: str
    critic: str
    # Restricted to the two literal values.
    tone: Literal["positive", "negative"]
    # NOTE(review): no default is given here, whereas the dict schema leaves
    # "rating" out of its "required" list — presumably this makes the rating
    # mandatory in this variant; confirm that is intended.
    rating: int = Field(description="Rating out of 5 stars")


# Repeat the tagging step, this time with the `Properties` model as the
# metadata schema, then print text plus JSON metadata for each document.
document_transformer = create_metadata_tagger(metadata_schema=Properties, llm=llm)
enhanced_documents = document_transformer.transform_documents(original_documents)

print(
    "\n\n---------------\n\n".join(
        f"{doc.page_content}\n\n{json.dumps(doc.metadata)}"
        for doc in enhanced_documents
    )
)

Review of The Bee Movie
By Roger Ebert

This is the greatest movie ever made. 4 out of 5 stars.

{"movie_title": "The Bee Movie", "critic": "Roger Ebert", "tone": "positive", "rating": 4}

---------------

Review of The Godfather
By Anonymous

This movie was super boring. 1 out of 5 stars.

{"movie_title": "The Godfather", "critic": "Anonymous", "tone": "negative", "rating": 1, "reliable": false}

---------------

Review of The Godfather
By testing

This movie was ok. 2 or 3 out of 5 stars.

{"movie_title": "The Godfather", "critic": "testing", "tone": "negative", "rating": 2, "reliable": true}
