In [1]:
import pandas as pd
import google.genai as genai
import json
import time
import os

In [2]:

# 1. Configuration
from dotenv import load_dotenv
load_dotenv()
API_KEY = os.getenv("GOOGLE_API_KEY")
# set the env var the library expects instead of calling a non-existent configure()
os.environ["GOOGLE_API_KEY"] = API_KEY

In [3]:
# Add a system prompt to guide the model's behavior
systemPrompt = """
You will extract the brand and product name from a product description text. Follow these steps precisely:

1. First, analyze the input text carefully inside <thinking> tags:
   - Look for the brand name, typically at the start or in a specific location
   - Identify the product name, which usually follows the brand name
   - Consider different text formats and potential variations

2. Parse the text using these rules:
   - Brand is typically the first word or a capitalized name before the product description
   - Product name is the general category or specific type of item
   - Ignore additional descriptive text after the product name

3. Create the output as a valid JSON object with two keys: "Brand" and "Product".
**Output ONLY the JSON, nothing else. Do not include any tags, explanations, or extra text.**

4. Important guidelines:
   - Be precise in extraction
   - If unsure, default to the most prominent text
   - Trim any extra whitespace
   - Do not include additional descriptors beyond the core brand and product name

Example process:
<example>
Input: "RisoPhy Mechanical Gaming Keyboard, RGB 104 Keys Ultra-Slim LED Backlit USB Wired Keyboard with Blue Switch..."
<thinking>
- Brand: RisoPhy
- Product: Mechanical Gaming Keyboard
</thinking>
Output: {"Brand": "RisoPhy", "Product": "Mechanical Gaming Keyboard"}
</example>

5. Generate the final output inside <result> tags:
<result>
{"Brand": "<extracted brand>", "Product": "<extracted product name>"}
</result>

If no clear brand or product can be determined, return:
<result>
{"Brand": "Unknown", "Product": "Unknown"}
</result>
"""

In [4]:

# Initialize the client using the new google-genai SDK
client = genai.Client(api_key=API_KEY)

In [5]:

def call_gemini_extraction(text):
    """
    Calls Gemini API to extract brand/product and returns a dictionary.
    Includes a small sleep to help avoid rate limit errors during batch apply.
    """
    if not text or pd.isna(text):
        return {"Brand": "Unknown", "Product": "Unknown"}
    
    try:
        # Use the client to generate content with the new SDK syntax
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=f"Extract brand and product from this description: {text}",
            config={"system_instruction": systemPrompt}
        )
        
        # Clean the response text
        raw_text = response.text.strip()
        
        # Extract content from <result> tags if present
        if "<result>" in raw_text:
            raw_text = raw_text.split("<result>")[-1].split("</result>")[0].strip()
            
        raw_text = raw_text.replace('```json', '').replace('```', '')
        return json.loads(raw_text)
    
    except Exception as e:
        print(f"Error processing row: {e}")
        # Wait slightly if we hit a rate limit
        time.sleep(1) 
        return {"Brand": "Error", "Product": "Error"}

In [6]:
def iterative_extraction(df_to_process):
    """
    Iteratively processes rows that result in an 'Error' until all rows are successful.
    """
    df_remaining = df_to_process.copy()
    all_okay = []

    while not df_remaining.empty:
        print(f"Processing {len(df_remaining)} rows...")
        df_remaining['extracted_dict'] = df_remaining['Title'].apply(call_gemini_extraction)
        mask_error = df_remaining['extracted_dict'].astype(str).str.contains(r'\bError\b', na=False)
        df_okay = df_remaining[~mask_error].copy()
        df_rerun = df_remaining[mask_error].copy().drop(columns=['extracted_dict'])
        all_okay.append(df_okay)
        if df_rerun.empty:
            break
        print(f"Found {len(df_rerun)} errors. Retrying...")
        df_remaining = df_rerun

    # Concatenate all successful batches
    return pd.concat(all_okay, ignore_index=True)

In [7]:
# 2. Setup the project Data
df = pd.read_csv(r"data\productData_gemini.csv")

In [8]:
# 3. Execution
print("Extracting data via Gemini API...")

# Apply the function to the column
df_final = iterative_extraction(df)

Extracting data via Gemini API...
Processing 142 rows...
Error processing row: Expecting value: line 1 column 1 (char 0)
Error processing row: Expecting value: line 1 column 1 (char 0)
Error processing row: Expecting value: line 1 column 1 (char 0)
Error processing row: Expecting value: line 1 column 1 (char 0)
Found 4 errors. Retrying...
Processing 4 rows...
Error processing row: Expecting value: line 1 column 1 (char 0)
Found 1 errors. Retrying...
Processing 1 rows...


In [9]:
# Break the extracted dictionary into separate columns
df_final['Brand_New'] = df_final['extracted_dict'].apply(lambda x: x.get('Brand', 'Unknown'))
df_final['Product_New'] = df_final['extracted_dict'].apply(lambda x: x.get('Product', 'Unknown'))

In [10]:

# 4. Expand the dictionary column into 'Brand' and 'Product' columns
# result_type='expand' turns the dict into new columns automatically
# df_final = pd.concat([df, df['extracted_dict'].apply(pd.Series)], axis=1)

# Display result
print(df_final[['Title', 'Brand_New', 'Product_New']])

                                                 Title    Brand_New  \
0    RisoPhy Mechanical Gaming Keyboard, RGB 104 Ke...      RisoPhy   
1    RedThunder K10 Wireless Gaming Keyboard and Mo...   RedThunder   
2    Redragon Mechanical Gaming Keyboard, Wired Mec...     Redragon   
3    KEMOVE K98SE Mechanical Gaming Keyboard, 98 Ke...       KEMOVE   
4    Redragon K552 Mechanical Gaming Keyboard RGB L...     Redragon   
..                                                 ...          ...   
137  GUSGU G910 2K Quad HD Webcam for PC, with Micr...        GUSGU   
138  HUANUO Ergonomic Office Chair, High Back Desk ...       HUANUO   
139  Logitech Lift Vertical Ergonomic Mouse, Wirele...     Logitech   
140  EMEET 1080P Webcam with Microphone, C960 Web C...        EMEET   
141  Brick Attic Office Chair, Ergonomic Desk Chair...  Brick Attic   

                                      Product_New  
0                      Mechanical Gaming Keyboard  
1    K10 Wireless Gaming Keyboard and Mouse

In [11]:
df_final.to_csv(r"data\productData_gemini_extracted.csv", index=False)