In [1]:
# ==========================
# IMPORT LIBRARIES
# ==========================
# Standard library imports
import os
import re
import sqlite3
import time
from typing import Optional, Dict

# Third-party imports
import openai
import anthropic
from dotenv import load_dotenv


### Load Environment Variables

In [2]:
# ==========================
# LOAD CONFIGURATION
# ==========================
# Read key/value pairs from a local .env file (if any) into the process
# environment so the os.getenv() lookups below can see them.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Optional: only the Claude client needs this key. It is defined
# unconditionally (None when unset) so that later cells can reference the
# name without hitting a NameError.
CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY")
DB_PATH = os.getenv("DB_PATH")

# Fail fast on the settings every run needs.
if not OPENAI_API_KEY:
    raise ValueError("OpenAI API key is missing. Check your .env file.")

if not DB_PATH:
    raise ValueError("Database path is missing. Check your .env file.")

# The OpenAI client used below reads the key from this module-level attribute.
openai.api_key = OPENAI_API_KEY

print("Environment variables loaded. Connections verified.")

Environment variables loaded. Connections verified.


#### Prompts

In [None]:
# ==========================
# PROMPT CONFIGURATION
# ==========================
# Prompt templates keyed by name. Each entry has a "system" message and a
# "user" message; the user message is rendered with
# str.format(batch_size=..., excluded_brands=...), so any literal braces added
# to these strings must be doubled ("{{" / "}}").
PROMPTS: Dict[str, Dict[str, str]] = {
    # Open-ended discovery prompt: asks for {batch_size} new brands while
    # excluding everything already collected.
    "basic": {
        "system": "You are an expert in fashion market research with a specialty in identifying plus-size and inclusive fashion brands.",
        "user": (
            "Please provide {batch_size} unique plus-size or inclusive clothing brands along with their official website URLs.\n\n"
            "Requirements:\n"
            "1. DO NOT include any of these already collected brands: {excluded_brands}\n"
            "2. Focus on lesser-known, independent, or specialty brands. Avoid major mainstream retailers.\n"
            "3. Prioritize brands that offer extended sizing, inclusive sizing, or cater specifically to plus-size consumers.\n"
            "4. Online-only boutiques and niche labels are encouraged.\n"
            # Single newline here: item 6 belongs to the same numbered list.
            "5. Each brand must be unique within this batch.\n"
            "6. Verify that each brand’s website is active and currently selling clothing.\n\n"
            "Format:\nBrand Name - URL\n\n"
            "Example:\nEloquii - https://www.eloquii.com"
        )
    },

    # Clean-up prompt: a fixed candidate list for the model to filter. It has
    # no placeholders, so the format() arguments are simply ignored.
    "messy_list": {
        "system": "You are an expert in fashion market research. Your task is to extract valid plus-size clothing brand names and websites from a messy list.",
        "user": (
            "Here is a list of potential brands. Some may not be real or active. Please return only real, active clothing brands along with their official website URLs.\n\n"
            "Messy List:\n"
            "Chic Soul, Fashion Nova, Nasty Gal, Cooper Union, Peridot Robes, SuperfitHero, Tuesday of California, Altar PDX, Big Bud Press, Connally Goods, \n\n "
            "Good American, Softcore, Suk, Wray NYC, Ashley Stewart, ELOQUII, Garden Belle, Isabella Eve, Ivy City, Kitty and Vibe, NOOWORKS,  Shiny By Nature, "
            "Tunnel Vision, Ulla Popken, Vixen, What Lo Wants, America & Beyond, ASOS, Beyond Yoga, Curvy Beach, Duluth Trading Co, Fashion Brand Company, \n\n"
            "Fashion to Figure, Forest Ink, Lucy & Yak, Madewell, Mango, Mokuyobi, My Violet, Never Fully Dressed, New York & Co, NYDJ, \n\n"
            "One With, Revolve, Straight to Hell, Unique Vintage, Wax Poetic, 2020 Ave, Anthropologie, Arula, Berlook, City Chic, Confete, \n\n"
            "Dressed In Lala, FAYT The Label, Finesse, Jolyn, Lulu's, Mod Cloth, Popflex, Shop Akira, Vina of the Valley, Minga London, \n\n"
            "Vanusian Swim, Loud Bodies, Your Ptashka, Ivory Sheep, Sotela, Universal Standard, Alpine Butterfly, Azazi, Jessake, \n\n"
            "Nettle, Selkie, Tamara Malas, Free Label, GiaIRL, Samantha Pleet, Adrianna Papell, Baacal, Blackwood Castle, Chelsea Reece, \n\n"
            "Manners London, Miaou, Miss Candyholic, St Grace, Tradlands, Baltic Born, Bella and Bloom, Cocomelody, Haus Dahlia, Lisa Says Gah, \n\n"
            "Ralph Lauren, Reformation, Show Me Your Mumu, Wild Fang, For Love and Lemons, Leim, Vanessa Mooney, Fairy Tong, Beth Smith Textiles, Revelle,\n\n"
            "BHLDN, Marina Rinaldi, Untamed Petals, Mara Hoffman\n\n"
            "Format:\nBrand Name - URL"
        )
    }
}


#### Setup LLMs

In [33]:
# ==========================
# LLM SETTINGS
# ==========================
# Defaults picked up by OpenAIClient.__init__ (model, temperature, max_tokens).
OPENAI_MODEL = "gpt-4"
OPENAI_TEMPERATURE = 0.7
# Sent as max_tokens on every request, i.e. it caps the length of each reply.
# NOTE(review): 150 tokens looks tight for prompts expecting long answers
# (e.g. "messy_list") — confirm replies are not being truncated.
OPENAI_MAX_TOKENS = 150

# Defaults picked up by ClaudeClient.__init__ (model, temperature, max_tokens).
CLAUDE_MODEL = "claude-3-opus-20240229"
CLAUDE_TEMPERATURE = 0.7
# Same role as OPENAI_MAX_TOKENS, for the Claude client.
CLAUDE_MAX_TOKENS = 150

# ==========================
# LLM CLIENT CLASSES
# ==========================
class OpenAIClient:
    """Minimal wrapper around the OpenAI chat-completions call.

    Authentication is not handled here: the call goes through the module-level
    ``openai`` client, which must already have its API key configured.
    """

    def __init__(self, model: str = OPENAI_MODEL, temperature: float = OPENAI_TEMPERATURE, max_tokens: int = OPENAI_MAX_TOKENS):
        """Store the sampling settings used for every request."""
        self.model, self.temperature, self.max_tokens = model, temperature, max_tokens

    def get_response(self, system_prompt: str, user_prompt: str) -> Optional[str]:
        """Send one system + user exchange and return the reply text.

        Args:
            system_prompt: Content of the "system" message.
            user_prompt: Content of the "user" message.

        Returns:
            The first choice's message content with surrounding whitespace
            removed, or ``None`` if anything goes wrong (the error is printed,
            not raised).
        """
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        try:
            completion = openai.chat.completions.create(
                model=self.model,
                messages=conversation,
                max_tokens=self.max_tokens,
                temperature=self.temperature,
            )
            first_choice = completion.choices[0]
            return first_choice.message.content.strip()
        except Exception as err:
            # Best-effort by design: callers treat None as "no usable reply".
            print(f"OpenAI API error: {err}")
            return None

class ClaudeClient:
    """Minimal wrapper around the Anthropic messages call."""

    def __init__(self, api_key: str, model: str = CLAUDE_MODEL, temperature: float = CLAUDE_TEMPERATURE, max_tokens: int = CLAUDE_MAX_TOKENS):
        """Create the Anthropic client and store the sampling settings.

        Args:
            api_key: Key handed to ``anthropic.Anthropic``.
            model: Model identifier sent with every request.
            temperature: Sampling temperature sent with every request.
            max_tokens: Reply-length cap sent with every request.
        """
        self.client = anthropic.Anthropic(api_key=api_key)
        self.model, self.temperature, self.max_tokens = model, temperature, max_tokens

    def get_response(self, system_prompt: str, user_prompt: str) -> Optional[str]:
        """Send one user message (with a system prompt) and return the reply text.

        Returns:
            The text of the first content item with surrounding whitespace
            removed, or ``None`` if anything goes wrong (the error is printed,
            not raised).
        """
        user_turn = [{"role": "user", "content": user_prompt}]
        try:
            reply = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_tokens,
                temperature=self.temperature,
                system=system_prompt,
                messages=user_turn,
            )
            first_part = reply.content[0]
            return first_part.text.strip()
        except Exception as err:
            # Best-effort by design: callers treat None as "no usable reply".
            print(f"Claude API error: {err}")
            return None


#### Brand Collector Tool

In [None]:
# ==========================
# BRAND COLLECTOR
# ==========================
class BrandCollector:
    """Ask an LLM for brand names/URLs and store the new ones in SQLite.

    The collector keeps ``collected_brands``, an in-memory set of lowercased
    brand names seeded from the ``brands`` table, and uses it both to skip
    duplicates and to build the exclusion list sent to the LLM.

    The instance owns an open SQLite connection. Call ``close()`` when done,
    or use the collector as a context manager (``with BrandCollector(...) as c:``).
    """

    # List markers an LLM may prepend to a line: "1. ", "2) ", "- ", "* ", "• ".
    # A bare leading number is deliberately NOT treated as a marker, so brand
    # names such as "2020 Ave" survive intact.
    _LIST_MARKER = re.compile(r'^\s*(?:\d+[.)]|[-*•])\s+')
    # Separator between brand name and URL, as requested in the prompts.
    _ENTRY_SEPARATOR = " - "

    def __init__(self, db_path: str, llm_client):
        """Open the database and load the brands that are already stored.

        Args:
            db_path: Path handed to ``sqlite3.connect``.
            llm_client: Any object exposing
                ``get_response(system_prompt, user_prompt) -> Optional[str]``.
        """
        self.db_path = db_path
        self.llm_client = llm_client
        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()
        self.collected_brands = self.load_existing_brands()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow exceptions

    def load_existing_brands(self):
        """Return the set of lowercased brand names currently in ``brands``.

        Lowercasing is done in Python rather than with SQL ``LOWER()`` so it
        matches ``brand_exists`` for non-ASCII names as well. NULL names are
        ignored. On a database error the error is printed and an empty set is
        returned.
        """
        try:
            self.cursor.execute("SELECT brand_name FROM brands")
            return {
                str(row[0]).lower()
                for row in self.cursor.fetchall()
                if row[0] is not None
            }
        except sqlite3.Error as e:
            print(f"Database query error: {e}")
            return set()

    def clean_brand_name(self, brand_name: str) -> str:
        """Strip a leading list marker and surrounding whitespace from a name."""
        return self._LIST_MARKER.sub('', brand_name).strip()

    def brand_exists(self, brand_name: str) -> bool:
        """Return True if the name (case-insensitively) was already collected."""
        return brand_name.lower() in self.collected_brands

    def insert_brand(self, brand_name: str, brand_url: str):
        """Insert one brand unless it is empty or a duplicate.

        Args:
            brand_name: Raw name; it is cleaned with ``clean_brand_name``.
            brand_url: Website URL stored alongside the name.

        Returns:
            True if a row was inserted, False if the entry was skipped or the
            insert failed (failures are printed, not raised).
        """
        cleaned_name = self.clean_brand_name(brand_name)
        cleaned_url = brand_url.strip()

        if not cleaned_name or not cleaned_url:
            print(f"⚠️ Skipped incomplete entry: {brand_name!r} - {brand_url!r}")
            return False

        if self.brand_exists(cleaned_name):
            print(f"⚠️ Duplicate skipped: {cleaned_name}")
            return False

        try:
            self.cursor.execute("""
                    INSERT INTO brands (brand_name, brand_url)
                    VALUES (?, ?)
                """, (cleaned_name, cleaned_url))
            self.conn.commit()
        except sqlite3.Error as e:
            print(f"❌ Insert error: {e}")
            return False

        # Only remember the name once the row is safely committed.
        self.collected_brands.add(cleaned_name.lower())
        print(f"✅ Added: {cleaned_name} - {cleaned_url}")
        return True

    def collect_brands_with_prompt(self, prompt_key: str, total_brands: int = 25, batch_size: int = 5, sleep_time: int = 10):
        """Query the LLM in batches and store every new brand it returns.

        Args:
            prompt_key: Key into the module-level ``PROMPTS`` mapping.
            total_brands: Number of brands to request overall; determines the
                number of batches (``ceil(total_brands / batch_size)``).
            batch_size: Number of brands requested per LLM call.
            sleep_time: Seconds to wait between consecutive LLM calls.

        Returns:
            The number of brands added during this call, or ``None`` when
            ``prompt_key`` is unknown.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        prompts = PROMPTS.get(prompt_key)
        if not prompts:
            print(f"Prompt '{prompt_key}' not found.")
            return None

        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer.")

        system_prompt = prompts["system"]
        num_batches = max(0, (total_brands + batch_size - 1) // batch_size)
        starting_count = len(self.collected_brands)

        for batch in range(num_batches):
            print(f"\n📦 Batch {batch + 1} of {num_batches}")

            # collected_brands already contains everything added so far in
            # this run (insert_brand updates it), so it is the single source
            # of truth for the exclusion list.
            excluded = ', '.join(sorted(self.collected_brands)) if self.collected_brands else "None"
            user_prompt = prompts["user"].format(
                batch_size=batch_size,
                excluded_brands=excluded
            )

            response = self.llm_client.get_response(system_prompt, user_prompt)

            for line in (response or "").splitlines():
                # Strip first so an indented bullet ("  - Name - URL") is not
                # split at the bullet's own " - ".
                entry = line.strip()
                if self._ENTRY_SEPARATOR not in entry:
                    continue
                brand_name, brand_url = map(str.strip, entry.split(self._ENTRY_SEPARATOR, 1))
                # insert_brand cleans the name and rejects duplicates.
                self.insert_brand(brand_name, brand_url)

            # Pause between calls only; nothing follows the last batch.
            if batch < num_batches - 1:
                time.sleep(sleep_time)

        # Final summary
        added_count = len(self.collected_brands) - starting_count
        print(f"\n✅ Finished: Added {added_count} new brands across {num_batches} batches.")
        return added_count

    def close(self):
        """Close the database connection; safe to call more than once."""
        if self.conn:
            self.conn.close()
            self.conn = None


#### Search for New Brands

In [None]:
# ==========================
# RUN SETTINGS
# ==========================
MODEL_CHOICE = "openai"          # Options: "openai" or "claude"
PROMPT_CHOICE = "basic"     # Options: "basic", "messy_list"
TOTAL_BRANDS = 100               # Total number of brands to collect
BATCH_SIZE = 5                   # Fixed batch size
SLEEP_TIME = 10                  # Delay between calls (seconds)

# ==========================
# MAIN EXECUTION
# ==========================
if __name__ == "__main__":

    # Build the client for the selected provider.
    if MODEL_CHOICE == "openai":
        llm_client = OpenAIClient()
    elif MODEL_CHOICE == "claude":
        # Read the key here so this cell does not depend on a variable that
        # the configuration cell may not define.
        claude_api_key = os.getenv("CLAUDE_API_KEY")
        if not claude_api_key:
            raise ValueError("Claude API key is missing. Check your .env file.")
        llm_client = ClaudeClient(api_key=claude_api_key)
    else:
        raise ValueError("Unsupported LLM selected.")

    collector = BrandCollector(DB_PATH, llm_client)

    # Always release the SQLite connection, even if collection fails or the
    # run is interrupted part-way through.
    try:
        collector.collect_brands_with_prompt(
            prompt_key=PROMPT_CHOICE,
            total_brands=TOTAL_BRANDS,
            batch_size=BATCH_SIZE,
            sleep_time=SLEEP_TIME
        )
    finally:
        collector.close()


📦 Batch 1 of 20
⚠️ Duplicate skipped: Henning
⚠️ Duplicate skipped: Part & Parcel
⚠️ Duplicate skipped: Maree Pour Toi
⚠️ Duplicate skipped: Copper Union
⚠️ Duplicate skipped: Alice Alexander

✅ Finished: Added 0 new brands across 20 batches.

📦 Batch 2 of 20
✅ Added: Wray - https://wray.nyc/
⚠️ Duplicate skipped: Berriez
⚠️ Duplicate skipped: Loud Bodies
⚠️ Duplicate skipped: Big Fig Mattress
⚠️ Duplicate skipped: RebDolls

✅ Finished: Added 1 new brands across 20 batches.

📦 Batch 3 of 20
✅ Added: Nicolette Mason - https://www.nicolettemason.com/
✅ Added: Copper Hive Vintage - https://www.copperhivevintage.com/
⚠️ Duplicate skipped: Universal Standard
⚠️ Duplicate skipped: The Hour London
⚠️ Duplicate skipped: Viktoria Popova

✅ Finished: Added 3 new brands across 20 batches.

📦 Batch 4 of 20
✅ Added: And I Get Dressed - http://www.andigetdressed.com/
⚠️ Duplicate skipped: Zelie For She
✅ Added: Joolz Fashion - https://joolzfashion.com/
⚠️ Duplicate skipped: Mei Smith
✅ Added: Lala 