In [4]:
import ast
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [5]:
# Read the three cleaned retailer catalogues, one DataFrame per CSV file,
# in the same order as the names on the left-hand side.
maxmara, netaporter, luisaviaroma = (
    pd.read_csv(csv_path)
    for csv_path in (
        'maxmara_cleaned.csv',
        'netaporter_cleaned.csv',
        'luisaviaroma_cleaned.csv',
    )
)

In [21]:
# Take the key from the OPENAI_API_KEY environment variable so the secret is
# never stored in (or committed with) the notebook. The original literal is
# kept only as a fallback so behaviour is unchanged when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API_KEY")

def sanitize_response(response):
    """Return the text of the first choice with any Markdown code fence removed.

    Parameters
    ----------
    response
        Chat-completion result; only ``response.choices[0].message.content``
        is read.

    Returns
    -------
    str
        The message content, stripped of surrounding whitespace, of a leading
        ``` fence (with an optional ``json`` language tag) and of a trailing
        ``` fence.
    """
    fence = '```'
    content = response.choices[0].message.content.strip()
    if content.startswith(fence):
        content = content[len(fence):]
        # Drop the language tag as a whole word. str.lstrip('```json') treats
        # its argument as a set of characters (`, j, s, o, n) and would also
        # eat the beginning of a payload such as "null" or "no data".
        if content[:4].lower() == 'json':
            content = content[4:]
    if content.endswith(fence):
        content = content[:-len(fence)]
    return content.strip()
    
def classify_product(rows, model="gpt-3.5-turbo"):
    """Ask the chat model for standardized Category, Color and Material values.

    One request is sent for the whole batch. Each product is listed in the
    prompt under its DataFrame index label, and the model is asked to echo
    that label back as ``Index``.

    Parameters
    ----------
    rows : pandas.DataFrame
        Batch of products. The columns ``main_title``, ``description``,
        ``material`` and ``color`` are read from every row.
    model : str, optional
        Name of the chat-completions model to query.

    Returns
    -------
    list
        The parsed reply. The prompt requests a list of objects with the keys
        ``Index``, ``Category``, ``Color`` and ``Material``; the content is
        whatever the model produced and is not validated here. An empty batch
        returns ``[]`` without calling the API.

    Raises
    ------
    ValueError, SyntaxError
        If the reply is neither valid JSON nor a Python literal.
    """
    # Nothing to classify: skip the request instead of sending a prompt that
    # lists no products.
    if len(rows) == 0:
        return []

    system_prompt = """
    Given the product details with color and material values in various forms, your task is to standardize these into commonly recognized or more famous names. 
    For example, if a material like "Cotton-Blend Poplin" is commonly referred to by another name, return the more widely accepted term. 
    Similarly, for colors like "Coral," if there’s a more standard name, use that. Ensure consistency and standardization across the color and material fields.
    """
    prompt = "Given the following product details, return the standardized Category, Color, and Material for each product in JSON format, with the Index number included:\n\n"
    for idx, row in rows.iterrows():
        prompt += f"""
        Index: {idx}
        Title: {row['main_title']}
        Description: {row['description']}
        Material: {row['material']}
        Color: {row['color']}
        """
    prompt += "\nProvide the output in this format:\n[\n  {\n    \"Index\": ..., \"Category\": ..., \"Color\": ..., \"Material\": ...\n  },\n  ...\n]"

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=4000,
        presence_penalty=0,
        temperature=0.1,
        top_p=0.9,
    )
    sanitized_response = sanitize_response(response)

    # The prompt asks for JSON, so parse it as JSON first: ast.literal_eval
    # rejects the JSON literals null / true / false, which the model may emit
    # for products with missing attributes.
    try:
        return json.loads(sanitized_response)
    except json.JSONDecodeError:
        # Keep accepting replies written as Python literals (single quotes,
        # trailing commas), which the previous implementation handled.
        return ast.literal_eval(sanitized_response)



In [22]:
# Send the netaporter frame to classify_product in consecutive slices of 50
# rows and gather the records returned for every slice into one flat list.
netaporter_list = []
for start in tqdm(range(0, len(netaporter), 50)):
    netaporter_list.extend(classify_product(netaporter.iloc[start:start + 50]))

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [02:32<00:00, 19.08s/it]


In [27]:
# Turn the collected records into a frame keyed by the echoed 'Index' value,
# then attach its columns to the catalogue by index label.
new_data = pd.DataFrame(netaporter_list).set_index('Index')
netaporter = netaporter.join(new_data)

In [29]:
# Send the luisaviaroma frame to classify_product in consecutive slices of 50
# rows and gather the records returned for every slice into one flat list.
luisaviaroma_list = []
for start in tqdm(range(0, len(luisaviaroma), 50)):
    luisaviaroma_list.extend(classify_product(luisaviaroma.iloc[start:start + 50]))

100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [05:27<00:00, 19.29s/it]


In [30]:
# Turn the collected records into a frame keyed by the echoed 'Index' value,
# then attach its columns to the catalogue by index label.
new_data = pd.DataFrame(luisaviaroma_list).set_index('Index')
luisaviaroma = luisaviaroma.join(new_data)

In [33]:
# Write both enriched catalogues to CSV, leaving the row index out of the files.
for frame, out_path in (
    (netaporter, 'netaporter_AI.csv'),
    (luisaviaroma, 'luisaviaroma_AI.csv'),
):
    frame.to_csv(out_path, index=False)