In [2]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
import json
from functools import partial
import pandas as pd

In [3]:
load_dotenv()

genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))

## Model

We build a class to handle API calls to the LLM, in case we decide to change the LLM model.

Please make sure you have obtained the appropriate API key before continuing.

In [9]:
class AbstractModel:
    def __init__(self, base: str):
        pass

    def query(self, **kwargs) -> str:
        raise NotImplementedError

class QueryException(Exception):
    def __init__(self, message: str):
        super().__init__(f"Failed to query to the LLM: {message}")

In [22]:
class GeminiModel(AbstractModel):
    def __init__(self, base: partial[str]):
        self.base = base
        self.client = genai.GenerativeModel('gemini-1.5-flash')

    def query(self, **kwargs) -> str:
        response = self.client.generate_content(self.base(**kwargs)).candidates[0]
        if not hasattr(response, 'content'):
            raise QueryException('LLM response does not have content')
        return response.content.parts[0].text

## Prompt

We design the prompts and attempt to get the answer here.

### Category

The following prompt is designed to get the suitable category of product.

In [12]:
CATEGORY_PROMPT = """
<instruction>
You are a sales assistant on a large e-commerce platform.
Your job is to recommend the correct product to the customers.

The customer is finding a suitable gift.
The specific description of what the user wants is given in the "userquery" tag below.
You are to determine an appropriate category of product that the user can buy as gift.

The category you select should be listed in the "categories" tag below.
You must start your response with one of the categories below.
You are not to format the category in any way.
Failure to start your response with one of the categories below will lead to catastrophic effects.

You are then to explain your choice of category on the next line.
</instruction>

<categories>
Hobbies & Books
Books
Music, Movies & Games
Music Instruments
Figures & Model Kits
Electronics
Cameras
Fashion
Toys
Collectibles
</categories>

<userquery>
{prompt}
</userquery>
"""

In [13]:
gemini_category_model = GeminiModel(CATEGORY_PROMPT.format)

In [14]:
gemini_category_model.query(prompt="A gift for my friend, who loves to read Harry Potter.")

'Books \nThis category is most suitable as it directly relates to reading and would likely offer Harry Potter related books. \n'

In [15]:
gemini_category_model.query(prompt="How are you today?")

'Hobbies & Books \nThis is too little information to suggest a gift.  I need more information!  Tell me about the person you are buying a gift for - their hobbies, interests, age, etc. \n'

### Product suggestion

The following prompt is designed to rank the products.

In [16]:
with open("products-books-sample.json", "r", encoding="utf-8") as f:
    products = json.loads(f.read())

In [23]:
PRODUCT_PROMPT = """
<instruction>
You are a sales assistant on a large e-commerce platform.
Your job is to recommend the correct product to the customers.

The customer is finding a suitable gift.
The specific description of what the user wants is given in the "userquery" tag below.
You are to rank the products based on the appropriateness for the gift occassion.

You are to follow the format specified in the "format" tag.
Failure to follow the format will lead to catastrophic consequences.
You are to include all products in your returned list.
At the end of your response, starting on a new line, you are to explain your ranking.
</instruction>

<format>
[id of product 1]
[id of product 2]
[id of product 3]
...
</format>

<products>
{products}
</products>

<userquery>
{prompt}
</userquery>
"""

In [24]:
gemini_product_model = GeminiModel(partial(PRODUCT_PROMPT.format, products=products))

In [25]:
gemini_product_model.query(prompt="A gift for my friend, who loves to read Harry Potter.")

'1213824\n4892132\n3218412\n1241423\n1241251\n\nThe first product is the only one directly related to Harry Potter, so it is ranked first. The second product is a World War 2 book, which might appeal to a reader of Harry Potter, so it is ranked second. The third product is a classic novel, which is not directly related to Harry Potter, but might be appealing to a reader of fantasy. The fourth and fifth products are not related to Harry Potter, so they are ranked last. \n'

### Product categorisation

The following prompt is to assign each product to a category.

In [7]:
df = pd.read_csv("../data/product_data.csv")
df

Unnamed: 0,title,description,price_sgd,number_sold
0,Creative Metal Simulation Notebook Computer Ke...,Material:Zinc Alloy\r\nGender:Women/Men\r\nSha...,1.78,12
1,Cream Bread Primary Color Sticky Cheese Slow R...,"Shipped In Boxes , So Don't Worry About Being ...",10.27,2
2,50 Styles English Text Enamel Brooches for Ani...,Product Details\r\nProduct status: 100% brand ...,1.05,3
3,JDM Samurai Warrior Moroecycle Car Sticker Jap...,Feature:\r\n-100% Brand new.\r\n-High Quality ...,2.13 - 2.44,3
4,"100 Styles of Lapel Pins, Cute Cartoon Black K...","Style: cartoon, fashion\r\nMaterial: Alloy\r\n...",1.06,8
...,...,...,...,...
840,"Simple card holder with Wristhand Short Rope, ...",1. Can stack 2 ID cards.\r\n2. Three-dimension...,1.98,1
841,Black card holder ins advanced for campus stud...,Product's name: hard card case\r\nBrand: Other...,1.88 - 3.27,1
842,20W Fast Charging Power Bank Powerbank 20000 M...,Follow our store to get OFF\r\nIn stock in Sin...,11.90,155
843,Remax Rpp-522 RPP-167 30000mAh Portable Smart ...,REMAX Fast Charge RPP522 30000 mAh QC PD\r\n\r...,10.20 - 23.49,141


In [31]:
OBTAIN_UNKNOWN_CATEGORY_PROMPT = """
<instruction>
You are a professional sales assistant working at a warehouse for a large e-commerce platform.
Your job is to assign a category to a product that has an unknown category.
The category must be as broad as possible, so that it can encompass a wide range of similar products.
The category should not be the same as the product name or description.
The product name and product description are given in the "product" tag below.

You are to respond with the category that you think best fits the product.
You must start your response with the category.
You must not format the category in any way.
You are then to explain your choice of category on the next line.
</instruction>

<product>
<title>{title}</title>
<description>{description}</description>
</product>
"""

In [32]:
gemini_assign_category_model = GeminiModel(OBTAIN_UNKNOWN_CATEGORY_PROMPT.format)

In [33]:
gemini_assign_category_model.query(title=df["title"][0], description=df["description"][0])

'Keychains\nThis product is a keychain that is made of metal and features a simulation of a notebook computer. \n'

In [37]:
categories = set()

for i in range(len(df)):
    response = gemini_assign_category_model.query(title=df["title"][i], description=df["description"][i])
    category = response.split("\n")[0].strip()
    categories.add(category)

categories

IndexError: list index out of range

In [40]:
categories

{'Accessories',
 'Aquarium Supplies',
 'Arts & Crafts',
 'Audio Equipment',
 'Automotive Accessories',
 'Baby Accessories',
 'Backpacks',
 'Bags',
 'Bath & Body',
 'Bathroom Accessories',
 'Beauty Accessories',
 'Beverages',
 'Camping & Outdoor',
 'Car Accessories',
 'Clothing',
 'Computer Accessories',
 'Craft Supplies',
 'Decorative Accessories',
 'Decorative Lighting',
 'Decorative Lights',
 'Decorative Stickers',
 'Decorative Supplies',
 'Doll Accessories',
 'Drinkware',
 'Drinkware Accessories',
 'Electronics',
 'Electronics Accessories',
 'Food',
 'Food & Beverage',
 'Gaming Accessories',
 'Gifts',
 'Hair Accessories',
 'Hair Care Products',
 'Handbags',
 'Hardware',
 'Health & Personal Care',
 'Home & Kitchen',
 'Home Decor',
 'Home Fragrance',
 'Home Goods',
 'Home Organization',
 'Home Storage & Organization',
 'Jewelry',
 'Keychains',
 'Kitchen & Dining',
 'Kitchen Accessories',
 'Kitchen Cleaning Supplies',
 'Kitchen Supplies',
 'Kitchen Utensils',
 'Lighting',
 'Lip Care',


In [41]:
reduced_categories = [
    "Accessories",
    "Arts & Crafts",
    "Automotive",
    "Bathroom Accessories",
    "Baby Accessories",
    "Fashion",
    "Electronics",
    "Personal Essentials",
    "Food & Beverages",
    "Household Items",
    "Health & Beauty",
    "Jewelry & Watches",
    "Kitchenware",
    "Office Supplies",
    "Sports & Outdoors",
    "Stationery",
    "Toys & Games",
]

In [44]:
CATEGORY_PROMPT = """
<instruction>
You are a sales assistant on a large e-commerce platform.
Your job is to assign the correct category to a product.
The product name and product description are given in the "product" tag below.
The avaialble categories are listed in the "categories" tag below.
You are to select the category that best fits the product.
If the product does not fit any of the categories,
you are to come up with the category that best fits the product.

You are to start your response with the category.
You are not to format the category in any way.
You are then to explain your choice of category on the next line.
</instruction>

<product>
<title>{title}</title>
<description>{description}</description>
</product>

<categories>
{categories}
</categories>
"""

In [45]:
gemini_categoriser_model = GeminiModel(partial(CATEGORY_PROMPT.format, categories="\n".join(reduced_categories)))

In [47]:
categories = []
for i in range(len(df)):
    if i % 100 == 0:
        print(f"Processing product {i}")

    try:
        response = gemini_categoriser_model.query(title=df["title"][i], description=df["description"][i])
    except Exception as e:
        print(f"Failed to categorise product {i}: {e}")
        categories.append(None)
        continue

    category = response.split("\n")[0].strip()
    if category not in reduced_categories:
        print(f"New category found {i}: {category}")
    categories.append(category)

assert len(categories) == len(df)

Processing product 0
Processing product 100
Processing product 200
New category found 220: Please provide the product title and description so I can assign the correct category.
Processing product 300
Processing product 400
Failed to categorise product 496: list index out of range
Processing product 500
Processing product 600
Processing product 700
Processing product 800


In [53]:
df.iloc[219:222]

Unnamed: 0,title,description,price_sgd,number_sold
219,Huyen Dinh Wave Smiley Face Flower casetify Mi...,"Note: Phone Case Only Included, Phone Not Incl...",6.39,17
220,title,description,price_sgd,number_sold
221,SYLVANIAN FAMILIES GATCHAPON JAPAN EXCLUSIVE (...,Welcome to the exciting world of SYLVANIAN FAM...,8.80,9


In [57]:
df.drop(index=220, inplace=True)
df.iloc[219:222]

Unnamed: 0,title,description,price_sgd,number_sold
219,Huyen Dinh Wave Smiley Face Flower casetify Mi...,"Note: Phone Case Only Included, Phone Not Incl...",6.39,17
221,SYLVANIAN FAMILIES GATCHAPON JAPAN EXCLUSIVE (...,Welcome to the exciting world of SYLVANIAN FAM...,8.8,9
222,Momogi 10pcs,Momogi 10pcs\r\n\r\n\r\n,2.0,347


In [60]:
categories.pop(220)
assert len(categories) == len(df)

In [65]:
response = gemini_categoriser_model.query(title=df["title"][495], description=df["description"][495])
response

'Sports & Outdoors\nThis is a towel specifically designed for use during sports or outdoor activities. \n'

In [67]:
categories[495] = response.split("\n")[0].strip()
assert len(categories) == len(df)

In [72]:
df["category"] = categories
df

Unnamed: 0,title,description,price_sgd,number_sold,category
0,Creative Metal Simulation Notebook Computer Ke...,Material:Zinc Alloy\r\nGender:Women/Men\r\nSha...,1.78,12,Jewelry & Watches
1,Cream Bread Primary Color Sticky Cheese Slow R...,"Shipped In Boxes , So Don't Worry About Being ...",10.27,2,Toys & Games
2,50 Styles English Text Enamel Brooches for Ani...,Product Details\r\nProduct status: 100% brand ...,1.05,3,Jewelry & Watches
3,JDM Samurai Warrior Moroecycle Car Sticker Jap...,Feature:\r\n-100% Brand new.\r\n-High Quality ...,2.13 - 2.44,3,Automotive
4,"100 Styles of Lapel Pins, Cute Cartoon Black K...","Style: cartoon, fashion\r\nMaterial: Alloy\r\n...",1.06,8,Jewelry & Watches
...,...,...,...,...,...
840,"Simple card holder with Wristhand Short Rope, ...",1. Can stack 2 ID cards.\r\n2. Three-dimension...,1.98,1,Stationery
841,Black card holder ins advanced for campus stud...,Product's name: hard card case\r\nBrand: Other...,1.88 - 3.27,1,Accessories
842,20W Fast Charging Power Bank Powerbank 20000 M...,Follow our store to get OFF\r\nIn stock in Sin...,11.90,155,Electronics
843,Remax Rpp-522 RPP-167 30000mAh Portable Smart ...,REMAX Fast Charge RPP522 30000 mAh QC PD\r\n\r...,10.20 - 23.49,141,Electronics


In [71]:
df[df["category"] == "Stationery"]

Unnamed: 0,title,description,price_sgd,number_sold,category
79,100PCS Kpop Boy Band Bangtan Boys Nam-joon Kim...,📣Ready Stock!&nbsp; Don't forget your store co...,1.95,3,Stationery
96,Sanrio Cartoon Sticker Set Korean Cute DIY Sti...,️Welcome to the thilo store. Happy shopping ️\...,0.75,1,Stationery
101,Sanrio Sticky Notes Memo Book N times Sticker ...,Welcome to the thilo store. Follow our store ...,2.00,7,Stationery
103,200/100/50PCS New Sanrio Hello KittyKuromi My ...,Quantity\r\n&nbsp;\r\n&nbsp;\r\n&nbsp;\r\nRead...,1.95 - 5.00,3,Stationery
104,50 100 PCS Stickers Pack Cute Pink Waterproof ...,"Patterns are well-designed, and the beautiful ...",1.53 - 2.19,1,Stationery
115,100PCS Anime Haikyuu!! Black And White Graffit...,Welcome to RecordingYourLife Store!\r\nReady S...,1.90,1,Stationery
118,100PCS Classic Mobile Emoji MEME Stickers For ...,📣Ready Stock!&nbsp; Don't forget your store co...,1.90,1,Stationery
119,200Pcs/set Cinnamoroll Hello Kitty Roll Sticke...,Stickable: Yes\r\nProcess: Printing\r\nSpecifi...,1.33,9,Stationery
122,Portable Photocards Storage Box Transparent Wa...,Brand new and high quality.\r\nItem Name:Photo...,2.88 - 3.08,3,Stationery
171,4Pcs/Pack Sanrio Ice Cream Shape Cute Cartoon ...,Specification\r\nStyling: Cartoon\r\nColor: Ra...,1.68,1,Stationery


In [73]:
df.to_csv("../data/product_data_categorised.csv", index=False)