In [25]:
import warnings
warnings.simplefilter("ignore")

In [26]:
import pandas as pd
import numpy as np

In [27]:
import os
import openai

In [28]:
from langchain.chat_models import ChatOpenAI

In [29]:
from langchain.prompts import ChatPromptTemplate

In [30]:
import openai
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# 5. Load cleaned data

In [31]:
# Load the cleaned transaction table (with cost columns) from the working directory.
df = pd.read_csv(filepath_or_buffer="online_retail_II_cleaned_with_cost.csv")

In [32]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,profit_margin,Cost_price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,0.193525,5.605004
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.47585,3.538012
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.368677,4.26143
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,0.303343,1.46298
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,0.086449,1.141939


In [33]:
# Number of distinct product descriptions, i.e. how many items the tagging
# loop will send to the model. dropna=False counts a missing description as
# one value, matching what unique() would return.
df['Description'].nunique(dropna=False)

3982

# 6. Setup LLM

Set the API key and the model name.

In [34]:
# Read the key from the environment so the secret never has to be typed into
# (or saved with) the notebook. Falls back to an empty string, the previous
# placeholder value, when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

In [35]:
# Name of the OpenAI chat model used for tagging.
llm_model = 'gpt-4o-mini'

Build the tagging chain.

In [36]:
# Chat model client; temperature is pinned to 0 for the tagging calls.
llm = ChatOpenAI(
    model=llm_model,
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

In [37]:
# Prompt text for a single product. {prod_name} is the only placeholder and is
# filled in per call. Written line by line so the exact whitespace is visible.
template = (
    "\n"
    "\n"
    "    You are tasked with tagging a product based on the given category.\n"
    "    Please follow 'ProductTaggingSchema'\n"
    "\n"
    "    **Product name**: {prod_name}\n"
)

tagging_prompt = ChatPromptTemplate.from_template(template)

In [38]:
# JSON-schema style description of the structured answer requested from the
# model: an object with one required string field, limited to a fixed list of
# category names.
ProductTaggingSchema = {
    "title": "ProductTaggingSchema",
    "type": "object",
    "description": "Schema for tagging product category",
    "properties": {
        # The single field the model has to fill in.
        "product_category": {
            "type": "string",
            "description": """

            A string indicating the best category for the product. Choose only one category from the list provided and do not create a new category.'.

            """,
            # Closed vocabulary of allowed categories.
            "enum": [
                "Arts and Crafts",
                "Automotive",
                "Books and Stationery",
                "Clothing and Accessories",
                "Crafts and Hobbies",
                "Electronics",
                "Food and Beverages",
                "Health and Beauty",
                "Home and Garden",
                "Kitchen and Dining",
                "Sports and Outdoors",
                "Tools and Home Improvement",
                "Toys and Games",
                "Pet",
                "Office Supplies",
                "Uncategorized",
            ],
            # Declared default; rows carrying this value are filtered out later.
            "default": "Uncategorized"
        }
    },
    "required": ["product_category"]
}

In [39]:
# Wrap the chat model so its replies are returned in the shape described by
# ProductTaggingSchema.
llm_structured = llm.with_structured_output(schema=ProductTaggingSchema)

In [40]:
# Compose the pipeline: the rendered prompt is fed to the structured-output model.
chain = tagging_prompt.pipe(llm_structured)

In [41]:
def tag_product(prod_name):
    """Return the category the tagging chain assigns to one product name.

    Args:
        prod_name: Product description to classify; it fills the
            ``{prod_name}`` placeholder of the tagging prompt.

    Returns:
        The ``product_category`` value from the chain's structured output,
        or ``"Uncategorized"`` (the default declared in
        ``ProductTaggingSchema``) when the chain returns no dict or the dict
        lacks that key.
    """
    # Name the prompt variable explicitly instead of relying on a bare value
    # being mapped onto the template's only placeholder.
    res = chain.invoke({"prod_name": prod_name})

    # A reply without a usable structured payload should not abort a
    # multi-thousand-item tagging run; fall back to the schema's default.
    if not isinstance(res, dict):
        return "Uncategorized"

    return res.get("product_category", "Uncategorized")

In [42]:
# Smoke test: tag one hand-written product name before running the full loop.
tag_product(prod_name='A chicken freeze')

'Food and Beverages'

# 7. Tag all products

In [43]:
# Preview df again before tagging; it has not been modified since it was loaded.
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,profit_margin,Cost_price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,0.193525,5.605004
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.47585,3.538012
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.368677,4.26143
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,0.303343,1.46298
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,0.086449,1.141939


In [44]:
# Accumulators for the tagging loop: two parallel lists (description, category)
# and a 1-based progress counter.
item_list, cat_list = [], []
count = 1

In [45]:
# Start from empty accumulators inside this cell so that re-running it never
# appends a second copy of every description; duplicated descriptions would
# multiply rows when the lookup table is merged back onto df.
item_list = []
cat_list = []
count = 1

# One model call per distinct description; the two lists are appended in
# lockstep so they stay aligned for building the lookup table.
for item_input in df['Description'].unique().tolist():
    cat = tag_product(item_input)
    cat_list.append(cat)
    item_list.append(item_input)
    print(count, item_input, ":",cat)
    count+= 1

1 15CM CHRISTMAS GLASS BALL 20 LIGHTS : Home and Garden
2 PINK CHERRY LIGHTS : Home and Garden
3 WHITE CHERRY LIGHTS : Home and Garden
4 RECORD FRAME 7" SINGLE SIZE : Arts and Crafts
5 STRAWBERRY CERAMIC TRINKET BOX : Arts and Crafts
6 PINK DOUGHNUT TRINKET POT : Home and Garden
7 SAVE THE PLANET MUG : Kitchen and Dining
8 FANCY FONT HOME SWEET HOME DOORMAT : Home and Garden
9 CAT BOWL : Pet
10 DOG BOWLCHASING BALL DESIGN : Pet
11 HEART MEASURING SPOONS LARGE : Kitchen and Dining
12 LUNCHBOX WITH CUTLERY FAIRY CAKES : Kitchen and Dining
13 DOOR MAT BLACK FLOCK : Home and Garden
14 LOVE BUILDING BLOCK WORD : Toys and Games
15 HOME BUILDING BLOCK WORD : Toys and Games
16 ASSORTED COLOUR BIRD ORNAMENT : Arts and Crafts
17 PEACE WOODEN BLOCK LETTERS : Arts and Crafts
18 CHRISTMAS CRAFT WHITE FAIRY : Crafts and Hobbies
19 HEART IVORY TRELLIS LARGE : Home and Garden
20 HEART FILIGREE DOVE LARGE : Arts and Crafts
21 FULL ENGLISH BREAKFAST PLATE : Food and Beverages
22 PIZZA PLATE IN BOX : Kit

In [46]:
# Distinct category labels collected by the tagging loop.
set(cat_list)

{'Arts and Crafts',
 'Automotive',
 'Books and Stationery',
 'Clothing and Accessories',
 'Crafts and Hobbies',
 'Electronics',
 'Food and Beverages',
 'Health and Beauty',
 'Home and Garden',
 'Kitchen and Dining',
 'Office Supplies',
 'Pet',
 'Sports and Outdoors',
 'Tools and Home Improvement',
 'Toys and Games',
 'Uncategorized'}

# 8. Append to dataframe

In [49]:
# df is still untagged here; the Category column is only added by the merge below.
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,profit_margin,Cost_price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,0.193525,5.605004
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.47585,3.538012
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.368677,4.26143
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,0.303343,1.46298
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,0.086449,1.141939


In [50]:
# Sanity check: number of category labels collected.
len(cat_list)

3982

In [51]:
# Sanity check: number of descriptions collected; pd.DataFrame needs this to
# equal len(cat_list) to build df_cat from the two lists.
len(item_list)

3982

In [52]:
# Lookup table mapping each tagged description to its category.
df_cat = pd.DataFrame(dict(Category=cat_list, Description=item_list))

In [53]:
# Preview the description -> category lookup table.
df_cat.head()

Unnamed: 0,Category,Description
0,Home and Garden,15CM CHRISTMAS GLASS BALL 20 LIGHTS
1,Home and Garden,PINK CHERRY LIGHTS
2,Home and Garden,WHITE CHERRY LIGHTS
3,Arts and Crafts,"RECORD FRAME 7"" SINGLE SIZE"
4,Arts and Crafts,STRAWBERRY CERAMIC TRINKET BOX


In [54]:
# Attach the category to every transaction row via its description.
# - A Category column left over from an earlier run of this cell is dropped
#   first; otherwise the merge would emit Category_x / Category_y and the
#   filter in the next cell would fail.
# - validate='many_to_one' raises if df_cat holds the same description twice,
#   instead of silently duplicating transaction rows.
df = pd.merge(
    df.drop(columns='Category', errors='ignore'),
    df_cat,
    on='Description',
    how='inner',
    validate='many_to_one',
)

In [56]:
# Keep only the rows whose category is something other than 'Uncategorized'.
is_categorized = df['Category'].ne('Uncategorized')
df = df.loc[is_categorized]

In [58]:
# (rows, columns) after merging in the categories and dropping 'Uncategorized' rows.
df.shape

(397432, 11)

In [59]:
# Preview the tagged frame; it now carries the Category column.
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,profit_margin,Cost_price,Category
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,0.193525,5.605004,Home and Garden
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.47585,3.538012,Home and Garden
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.368677,4.26143,Home and Garden
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,0.303343,1.46298,Arts and Crafts
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,0.086449,1.141939,Arts and Crafts


# 9. Export cleaned data

In [61]:
# Write the tagged transactions to CSV, without the index column.
df.to_csv(
    path_or_buf='online_retail_II_cleaned_with_cost_and_cat.csv',
    index=False,
    sep=',',
)