## About
This notebook contains examples of named entity recognition (NER) for products and queries with few-shot learning

In [50]:
import os
from langchain.chat_models import ChatOpenAI, ChatGooglePalm
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI
from langchain import PromptTemplate
from langchain import LLMChain
from langchain.llms import Replicate

from langchain.schema import StrOutputParser
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache
from langchain.callbacks import get_openai_callback

from langchain.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)
from langchain.output_parsers import CommaSeparatedListOutputParser
from dotenv import load_dotenv


In [51]:
set_llm_cache(SQLiteCache(database_path=".langchain.db"))


In [52]:
load_dotenv("../.env") 


True

In [53]:
llm_openai = ChatOpenAI(model="gpt-4")


In [54]:
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from typing import Optional
from typing_extensions import Annotated
from enum import Enum
from typing import Union
import pandas as pd

## Output Formatting
- Defining possible values for entities to LLM (gender)
- Providing output schema

In [59]:
class Gender(str, Enum):
    male = 'male'
    female = 'female'
    unisex = 'unisex'
    other = 'other'
    not_given = 'not_given'


class ProductUnderstanding(BaseModel):
    brand: Optional[str] = Field(description="brand", )
    gender: Annotated[Union[Gender, None], Field(alias='Gender')] = None

    #gender: Optional[str] = Field(description="gender")
    product_type: Optional[str] = Field(description="product_type")
    color: Optional[str] = Field(description="color")
    size: Optional[str] = Field(description="size")

In [60]:
parser = PydanticOutputParser(pydantic_object=ProductUnderstanding)


In [61]:
template = """
You are e-commerce expert. Your task is to extract the attributes from customer query. 
Possible attributes are "product type", "brand", "gender", "color", "size".

Few shot Examples:

Input:
Query: yellow 35 inch baseball bat
Output: 
brand: None
gender: None
product_type: baseball bats
color: black
size: 35 inch

Input:   
Query: MOERDENG Men's Waterproof Ski Jacket Warm Winter Snow Coat Mountain Windbreaker Hooded Raincoat
Output: 
brand: MOERDENG
gender: Men
product_type: Ski Jacket
color: None
size: None

Input:   
Query: {query}
Output: 



Format instructions:
{format_instructions}
Answer: """

In [62]:
prompt = PromptTemplate(template=template, input_variables=["query"],
                                 partial_variables={"format_instructions":  parser.get_format_instructions()}
                                 )


In [63]:
print ( prompt.format(query = "nike men shoes" ) )



You are e-commerce expert. Your task is to extract the attributes from customer query. 
Possible attributes are "product type", "brand", "gender", "color", "size".

Few shot Examples:

Input:
Query: yellow 35 inch baseball bat
Output: 
brand: None
gender: None
product_type: baseball bats
color: black
size: 35 inch

Input:   
Query: MOERDENG Men's Waterproof Ski Jacket Warm Winter Snow Coat Mountain Windbreaker Hooded Raincoat
Output: 
brand: MOERDENG
gender: Men
product_type: Ski Jacket
color: None
size: None

Input:   
Query: nike men shoes
Output: 



Format instructions:
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

In [68]:
#sample_product_title = "nike men shoes in green"
sample_product_title = "MOERDENG Men's Waterproof Ski Jacket Warm Winter Snow Coat Mountain Windbreaker Hooded Raincoat"

chain = prompt | llm_openai 

output= chain.invoke({"query": sample_product_title})

In [69]:
output

AIMessage(content='{"brand": "MOERDENG", "Gender": "male", "product_type": "Ski Jacket", "color": null, "size": null}')

In [70]:
product_info = parser.parse(output.content)
product_info

ProductUnderstanding(brand='MOERDENG', gender=<Gender.male: 'male'>, product_type='Ski Jacket', color=None, size=None)

In [71]:
product_info.gender

<Gender.male: 'male'>

## Data Generation For Fine Tuning

In [72]:
df = pd.read_parquet("../data/cleaned_input.parquet")

In [16]:
df.columns

Index(['query', 'product_id', 'relevance_label', 'product_title',
       'product_description', 'product_bullet_point', 'product_brand',
       'product_color', 'url_product', 'url_image', 'query_type'],
      dtype='object')

In [17]:
sample_queries = list(df.drop_duplicates(['query']) .head(1000) ['query'])
sample_queries[:5]

[' revent 80 cfm',
 'bathroom fan without light',
 'bathroom fan with light',
 '110cfm bathroom exhaust fan without light',
 '12 inch bathroomwall mounted fan']

In [18]:
sample_items = list(df.drop_duplicates(['product_title']) .head(1000) ['product_title'])
sample_items[:5]

['Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceiling Mounted Fan',
 'Homewerks 7141-80 Bathroom Fan Integrated LED Light Ceiling Mount Exhaust Ventilation, 1.1 Sones, 80 CFM',
 'Homewerks 7140-80 Bathroom Fan Ceiling Mount Exhaust Ventilation, 1.5 Sones, 80 CFM, White',
 'Delta Electronics RAD80L BreezRadiance 80 CFM Heater/Fan/Light Combo White (Renewed)',
 'Panasonic FV-08VRE2 Ventilation Fan with Recessed LED (Renewed)']

In [19]:
final_samples = sample_queries + sample_items

In [20]:
chain = prompt | llm_openai 

In [21]:
outputs = []

In [22]:
for record in final_samples:
    try:
        output= chain.invoke({"query": record})
        product_info = parser.parse(output.content)
        outputs.append((record,product_info))
    except Exception as e:
        print (f"Failed for {record}", e)

Failed for fear factor games 1 validation error for ProductUnderstanding
Gender
  Input should be 'male', 'female', 'other' or 'not_given' [type=enum, input_value='None', input_type=str]
Failed for dont go without me comic Failed to parse ProductUnderstanding from completion {
"brand": None,
"Gender": "not_given",
"product_type": "comic",
"color": None,
"size": None
}. Got: Expecting value: line 2 column 10 (char 11)
Failed for stray kids shirt without woojin 1 validation error for ProductUnderstanding
Gender
  Input should be 'male', 'female', 'other' or 'not_given' [type=enum, input_value='None', input_type=str]
Failed for life without diabetes roy taylor 1 validation error for ProductUnderstanding
Gender
  Input should be 'male', 'female', 'other' or 'not_given' [type=enum, input_value='None', input_type=str]
Failed for Not sure. Maybe a a standard size (15 or 17??) screen middle of the range laptop that is either black or purple. It might be a Dell or Acer  3 validation errors for 

In [23]:
outputs

[(' revent 80 cfm',
  ProductUnderstanding(brand=None, gender=<Gender.not_given: 'not_given'>, product_type=None, color=None, size='80 cfm')),
 ('bathroom fan without light',
  ProductUnderstanding(brand=None, gender=<Gender.not_given: 'not_given'>, product_type='bathroom fan', color=None, size=None)),
 ('bathroom fan with light',
  ProductUnderstanding(brand=None, gender=<Gender.not_given: 'not_given'>, product_type='bathroom fan', color=None, size=None)),
 ('110cfm bathroom exhaust fan without light',
  ProductUnderstanding(brand=None, gender=None, product_type='bathroom exhaust fan', color=None, size='110cfm')),
 ('12 inch bathroomwall mounted fan',
  ProductUnderstanding(brand=None, gender=<Gender.not_given: 'not_given'>, product_type='wall mounted fan', color=None, size='12 inch')),
 ('bathroom fan',
  ProductUnderstanding(brand=None, gender=<Gender.not_given: 'not_given'>, product_type='bathroom fan', color=None, size=None)),
 ('bathroom fan quiet',
  ProductUnderstanding(brand=N

In [24]:
df_output = pd.DataFrame(outputs)

In [25]:
df_output.columns = ['query','output']

In [26]:
df_output['output']

0       brand=None gender=<Gender.not_given: 'not_give...
1       brand=None gender=<Gender.not_given: 'not_give...
2       brand=None gender=<Gender.not_given: 'not_give...
3       brand=None gender=None product_type='bathroom ...
4       brand=None gender=<Gender.not_given: 'not_give...
                              ...                        
1965    brand='HOMENOTE' gender=<Gender.not_given: 'no...
1966    brand='Classic Home and Garden' gender=<Gender...
1967    brand='1020 Trays' gender=<Gender.not_given: '...
1968    brand=None gender=None product_type='Growing T...
1969    brand='ANGTUO' gender=<Gender.not_given: 'not_...
Name: output, Length: 1970, dtype: object

In [27]:
df_output['output'] = df_output['output'].apply(lambda x: x.json() )

/tmp/ipykernel_910/412034866.py:1: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  df_output['output'] = df_output['output'].apply(lambda x: x.json() )


In [28]:
df_output.to_parquet("data/st_trainining_input.parquet")