In [13]:
# Path of the input CSV; passed to load_products() as its products_csv argument.
PRODUCTS_CSV = "data/products.csv"
# Upper bound on how many products are pulled from the CSV for description generation.
PRODUCTS_LIMIT = 100

import logging

# Module-level logger for this notebook. basicConfig configures the root logger
# so that INFO-level messages are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [9]:
import csv
import random


def load_products(products_csv, limit):
    """Lazily yield up to ``limit`` products from a CSV file in a fixed shuffled order.

    This is a generator function: nothing is read until the first item is
    requested. At that point the whole file is loaded into memory, shuffled
    with a fixed seed so that every run sees the same order, and the first
    ``limit`` rows are yielded one at a time.

    Args:
        products_csv: Path to a CSV file whose first row is the header.
        limit: Maximum number of products to yield. Converted with ``int()``,
            so numeric strings are accepted. Zero or a negative value yields
            nothing.

    Yields:
        One dict per CSV row, keyed by the header names.
    """

    # newline="" lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(products_csv, newline="") as f:
        all_products = list(csv.DictReader(f))

    logging.info("Loaded %s products", len(all_products))

    # Shuffle with a private, fixed-seed RNG: same order as seeding the global
    # generator with 42, but without resetting random state for the rest of
    # the process.
    random.Random(42).shuffle(all_products)

    # Clamp at zero so a negative limit yields nothing instead of slicing
    # from the end of the list.
    yield from all_products[: max(int(limit), 0)]

In [51]:
import ollama
from tqdm import tqdm


class ProductDescriptionGenerator:
    """Generate product descriptions with an LLM called through ``ollama``.

    Usage: construct the generator, hand it an iterable of product dicts via
    ``load_products``, then iterate over ``generate()``.
    """

    def __init__(self, config=None, batch_size=20, model="llama3"):
        """Initialize the generator.

        Args:
            config: Optional configuration dict stored on the instance. When
                omitted, each instance gets its own empty dict.
            batch_size: Number of products per batch; must be at least 1.
            model: Name of the model passed to ``ollama.generate``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """

        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

        # Build the default dict per instance; a `{}` default in the signature
        # would be shared (and mutated) across every instance.
        self.config = {} if config is None else config
        self.batch_size = batch_size
        self.model = model
        # Set by load_products(); None means nothing has been loaded yet.
        self.products = None

    def load_products(self, products):
        """Store ``products`` on the generator, grouped lazily into batches.

        The batches come from a generator, so they can be consumed only once;
        call this method again before a second ``generate()`` pass.
        """

        self.products = self.form_batches(products)

    def generate(self):
        """Yield one result dict per loaded product.

        Yields:
            ``{"product": <product>, "description": <text>}`` where the text
            is the ``"response"`` field of the ``ollama.generate`` result.

        Raises:
            RuntimeError: If ``load_products`` has not been called.
        """

        if self.products is None:
            raise RuntimeError("No products loaded; call load_products() first.")

        # tqdm wraps the batch iterator, so the progress bar advances once
        # per batch rather than once per product.
        for product_batch in tqdm(self.products):
            for product in product_batch:
                description_prompt = self.make_prompt(product)
                description = ollama.generate(model=self.model, prompt=description_prompt)
                yield {
                    "product": product,
                    "description": description["response"]
                }

    def parse_description(self, description):
        """Return at most the first three sentences of ``description``.

        Sentences are split on ``"."`` only, so a period inside a number or
        an abbreviation is treated as a sentence boundary. Fragments are
        trimmed, empty fragments are dropped, and each kept sentence retains
        the period that terminated it in the input.
        """

        fragments = description.split(".")

        # Every fragment except the last was followed by a period in the input.
        sentences = [piece.strip() + "." for piece in fragments[:-1] if piece.strip()]

        # Whatever follows the final period is an unterminated trailing sentence.
        tail = fragments[-1].strip()
        if tail:
            sentences.append(tail)

        return " ".join(sentences[:3])

    def make_prompt(self, product):
        """Build the LLM prompt for one product.

        Args:
            product: Mapping with a ``"product_name"`` key.

        Returns:
            The prompt text. The continuation lines of the template keep
            their source indentation as part of the prompt.
        """

        prompt = f"""Product: {product["product_name"]}
        Write a detailed, concise, and unique description of the product above.
        Keep it to 3 sentences.  Distinguish it from similar products.
        Get straight to the description."""

        return prompt

    def form_batches(self, products):
        """Yield successive lists of up to ``batch_size`` products.

        Only the final batch may be shorter than ``batch_size``. No batch is
        yielded when ``products`` is empty.
        """

        batch = []
        for product in products:
            batch.append(product)
            if len(batch) >= self.batch_size:
                yield batch
                batch = []

        # Yield the leftover partial batch, if any
        if batch:
            yield batch

In [52]:
import os
import jsonlines

def save_product_descriptions(product_descriptions, output_file):
    """Append product descriptions to a JSON Lines file, resuming after existing rows.

    The number of lines already in ``output_file`` is taken as the number of
    descriptions saved by a previous run; that many leading items of
    ``product_descriptions`` are skipped and the rest are appended. This
    relies on the iterable producing items in the same order on every run.

    Note that items are pulled from the iterable before they are skipped, so
    a lazy producer still does the work for every skipped item.

    Args:
        product_descriptions: Iterable of records to write, one per line.
        output_file: Path of the JSON Lines file to append to. It is created
            on the first write if it does not exist.
    """

    # Count the records already saved by streaming over the file; there is no
    # need to hold every line in memory just to count them.
    try:
        with open(output_file, "r") as f:
            size = sum(1 for _ in f)
    except FileNotFoundError:
        size = 0

    logging.info("Fast forward -- Skipping %s product descriptions", size)

    for index, product_description in enumerate(product_descriptions):
        # Skip the product descriptions that have already been saved
        if index < size:
            logging.info("Fast forward -- Skipping product %s", index)
            continue

        # The file is reopened in append mode for every record, so each record
        # is written and the file closed before the next one is requested --
        # presumably so an interrupted run keeps what was already saved.
        with jsonlines.open(output_file, mode="a") as writer:
            writer.write(product_description)

In [54]:
# Driver cell: wire the lazy pipeline together and run it.
logging.info(f"Generating product descriptions for {PRODUCTS_LIMIT} products.")
# load_products is a generator function, so nothing is read from disk yet.
products = load_products(PRODUCTS_CSV, PRODUCTS_LIMIT)

# Create the generator with its default settings and hand it the products
generator = ProductDescriptionGenerator()
generator.load_products(products)

# Also lazy: descriptions are produced only as the consumer below iterates.
products_with_description = generator.generate()

# Iterating here drives the whole pipeline and appends results to the output file.
save_product_descriptions(products_with_description, "data/products.jsonl")    

5it [03:25, 41.09s/it]
