In [1]:
%pip install openai pydantic instructor lancedb pandas

Collecting openai
  Downloading openai-1.40.0-py3-none-any.whl.metadata (22 kB)
Collecting pydantic
  Downloading pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m[31m1.9 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting instructor
  Downloading instructor-1.3.7-py3-none-any.whl.metadata (14 kB)
Collecting lancedb
  Downloading lancedb-0.11.0-cp38-abi3-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Using cached anyio-4.4.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting sn

In [52]:
# full text search engine inspired by lucene written in rust
%pip install tantivy 

Collecting tantivy
  Downloading tantivy-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading tantivy-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: tantivy
Successfully installed tantivy-0.22.0
Note: you may need to restart the kernel to use updated packages.


In [33]:
from rich import inspect as rinspect
from rich import print as rprint

# Context

This course includes example code for most topics. The example code relates to building a feature that answers user questions based on previous product reviews for a hardware e-commerce website.

This notebook builds the raw product descriptions and review data.

When you apply the course lessons in your business, you will not need an equivalent of this notebook, because the data you retrieve from your database to answer questions will be real data from your business.

In [2]:
import asyncio
from typing import List, Dict
import instructor
import json
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Wrap the synchronous OpenAI client with instructor; the wrapped client is
# called below with a `response_model` to get parsed Pydantic objects back.
client = instructor.from_openai(OpenAI())

In [4]:
client

<instructor.client.Instructor at 0x7f75337dfc40>

In [5]:
# Structured-output schema for one generated hardware-store product.
class Product(BaseModel):
    title: str  # product name; the prompt asks for titles to repeat across variants
    description: str  # 2-3 sentence description that distinguishes this variant

In [6]:
# Number of reviews to request for each product.
reviews_per_product = 10

In [15]:
# Number of products to ask for in this exploratory run.
n_objects = 1

# Build the product-generation prompt. Every fragment ends with an explicit
# space or newline so adjacent sentences do not run together once concatenated
# (previously the prompt contained "store.Each", "productDo" and "data.Respond").
# NOTE(review): the two sentences about "product reviews" look copied from the
# review prompt; their wording is kept as-is here.
prompt = (
    f"Create a list of {n_objects} products someone might buy at a hardware store. "
    "Each product title should be repeated 2-3 times. Do not have any with duplicate product descriptions.\n"
    "So each product with a given title should have some small distinctions apparent from the description.\n"
    "Products can be small (a screw), large (a bandsaw) or anywhere in between.\n"
    "For each product, write a 2-3 sentence product description that might show up in a hardware retailers website underneath the product.\n"
    "Do not create product reviews that contradict specific facts in other reviews. "
    "Contradicting subjective opinions in other reviews is ok only to the extent you would expect that in real data.\n"
    "Respond only with the list of products and descriptions."
)

In [12]:
print(prompt)

Create a list of 1 products someone might buy at a hardware storeEach product title should be repeated 2-3 times. Do not have any with duplicate product descriptions.
So each product with a given title should have some small distinctions apparent from the description.
Products can be small (a screw), large (a bandsaw) or anywhere in between.
For each product, write a 2-3 sentence product description that might show up in a hardware retailers website underneath the productDo not create product reviews that contradict specific facts in other reviews. Contradicting subjective opinions in other reviews is ok only to the extent you would expect that in real data.Respond only with the list of products and descriptions.


In [16]:
# Send the product prompt as a single user message; instructor parses the
# reply into a list of Product objects.
objects = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=List[Product],
    messages=[dict(role="user", content=prompt)],
)

In [17]:
objects

[Product(title='Cordless Drill', description='This powerful cordless drill features an ergonomic design perfect for all-day use. With 20 torque settings and a lithium-ion battery, it offers unmatched versatility and efficiency in any drilling task.'),
 Product(title='Cordless Drill', description='Our lightweight cordless drill comes equipped with a flexible LED work light to illuminate your workspace. The 18V battery provides ample power for tough materials, making it a great choice for DIY enthusiasts and professionals alike.'),
 Product(title='Cordless Drill', description='Engineered for precision, this cordless drill has a compact design that allows for maximum maneuverability in tight spaces. It includes a built-in battery indicator to show remaining power, ensuring you never run out of charge unexpectedly.'),
 Product(title='Adjustable Wrench', description='This adjustable wrench is made of high-strength steel, ensuring durability and long-lasting use. It features a smooth adjustm

In [1]:
import asyncio
from typing import List, Dict
import instructor
import json
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel

# Wrap the synchronous OpenAI client with instructor; the wrapped client is
# called below with a `response_model` to get parsed Pydantic objects back.
client = instructor.from_openai(OpenAI())


# Structured-output schema for one generated hardware-store product.
class Product(BaseModel):
    title: str  # product name; the prompt asks for titles to repeat across variants
    description: str  # 2-3 sentence description that distinguishes this variant


# Number of reviews to request for each product; used as the default for
# create_synthetic_reviews().
reviews_per_product = 10


def generate_physical_objects(n_objects=150) -> List[Product]:
    """Ask the LLM for a list of synthetic hardware-store products.

    Args:
        n_objects: Number of products requested in the prompt. The model is
            not forced to honour it, so the returned list can have a
            different length.

    Returns:
        The parsed ``Product`` objects, or an empty list if the request
        fails (the error is printed, not raised).
    """
    # Every fragment ends with an explicit space or newline so that adjacent
    # sentences do not run together once concatenated.
    # NOTE(review): the two sentences about "product reviews" look copied from
    # the review prompt; their wording is kept as-is here.
    prompt = (
        f"Create a list of {n_objects} products someone might buy at a hardware store. "
        "Each product title should be repeated 2-3 times. Do not have any with duplicate product descriptions.\n"
        "So each product with a given title should have some small distinctions apparent from the description.\n"
        "Products can be small (a screw), large (a bandsaw) or anywhere in between.\n"
        "For each product, write a 2-3 sentence product description that might show up in a hardware retailers website underneath the product.\n"
        "Do not create product reviews that contradict specific facts in other reviews. "
        "Contradicting subjective opinions in other reviews is ok only to the extent you would expect that in real data.\n"
        "Respond only with the list of products and descriptions."
    )

    try:
        return client.chat.completions.create(
            model="gpt-4o-mini",
            response_model=List[Product],
            messages=[{"role": "user", "content": prompt}],
        )
    except Exception as e:
        # Best effort: report the failure and let the notebook continue with
        # an empty catalogue instead of raising.
        print(f"Error generating products: {str(e)}")
        return []


# Generate the product catalogue. generate_physical_objects() returns [] when
# the request fails, so a count of 0 here most likely means the call errored.
objects = generate_physical_objects()
print(f"Created {len(objects)} unique objects")
# Show only a small sample rather than the whole list.
print(f"First 10 objects: {objects[:10]}")

Created 114 unique objects
First 10 objects: [Product(title='Cordless Drill', description='This powerful cordless drill features a lightweight design and a 2-speed transmission, allowing you to tackle various tasks with ease. Ideal for both professionals and DIY enthusiasts, it comes with two batteries for extended usage.'), Product(title='Cordless Drill', description='Designed for versatility, this cordless drill offers 18 torque settings and a compact design perfect for tight spaces. Its ergonomic grip ensures comfort during prolonged use, making it an ideal tool for any project.'), Product(title='Cordless Drill', description='Equipped with a high-performance motor, this cordless drill ensures efficient drilling and driving. The LED light illuminates dark work areas, making it perfect for both indoor and outdoor tasks.'), Product(title='Adjustable Wrench', description='This adjustable wrench features a sleek design with a cushioned grip for added comfort during use. Its wide jaw open

We have created the list of objects. Now we will create the product reviews.

In [18]:
# Wrap the AsyncOpenAI client with instructor so review generation can be
# awaited while still returning parsed `response_model` objects.
async_client = instructor.from_openai(AsyncOpenAI())

In [19]:
async_client

<instructor.client.AsyncInstructor at 0x7f750bb8ee90>

In [20]:
# Structured-output schema for a single generated review.
class Review(BaseModel):
    review: str  # full text of the review

In [21]:
# One generated review together with the product it was written for.
class AllObjectInfo(BaseModel):
    product_title: str  # filled from Product.title
    product_description: str  # filled from Product.description
    review: str  # filled from Review.review

In [23]:
# Use the first generated product as the sample for the prompt experiments below.
product = objects[0]
product.title, product.description

('Cordless Drill',
 'This powerful cordless drill features an ergonomic design perfect for all-day use. With 20 torque settings and a lithium-ion battery, it offers unmatched versatility and efficiency in any drilling task.')

In [None]:
# Number of reviews to request in this exploratory run.
n=2
# Review-generation prompt for the sample `product`; the fenced example shows
# the model the expected tone and level of detail.
prompt = f"""
        Write {n} realistic but detailed/specific product reviews that might show up on a hardware store's website.

        The reviews should be about the following product:
        Product Title: {product.title}
        Product Description: {product.description}
        
        Add many relevant and concrete facts about the products (this is for synthetic data generation, make up facts about each product as necessary).

        To see the format of a possible review, here is a review for a saw:
        ```
        I've enjoyed using this saw. It is lightweight and the battery lasts longer than other brands.
        I've been using it for 3 years now and it has been very durable. It was twice as expensive as the PX-500. But
        it is comfortable to hold because of the light weight.
        ```

        Respond only with the reviews, and nothing else.
        """

In [24]:
result = await async_client.chat.completions.create(
                model="gpt-4o",
                response_model=List[Review],
                messages=[{"role": "user", "content": prompt}],
            )

In [25]:
result

[Review(review='Length: 7 inches. Material: Stainless steel. A versatile tool for any carpentry, demolition, or DIY project.'),
 Review(review='Length: 10 inches. Made from high-carbon steel for enhanced durability. Perfect for prying nails, lifting panels, and general demolition work.'),
 Review(review='Compact 5-inch size. Ideal for tight spaces. Made from tempered steel to withstand rugged use.')]

In [28]:
# Wrap the AsyncOpenAI client with instructor so review generation can be
# awaited concurrently while still returning parsed `response_model` objects.
async_client = instructor.from_openai(AsyncOpenAI())


# Structured-output schema for a single generated review.
class Review(BaseModel):
    review: str  # full text of the review


# One generated review together with the product it was written for.
class AllObjectInfo(BaseModel):
    product_title: str  # filled from Product.title
    product_description: str  # filled from Product.description
    review: str  # filled from Review.review


async def make_reviews(
    product: Product, n: int, semaphore: asyncio.Semaphore = asyncio.Semaphore(1)
) -> List[AllObjectInfo]:
    """Generate ``n`` synthetic reviews for one product.

    Args:
        product: The product whose title and description are put in the prompt.
        n: Number of reviews requested in the prompt.
        semaphore: Limits how many requests run at once. The default is a
            single ``Semaphore(1)`` created when the function is defined, so
            all calls that omit this argument share it and run one at a time.

    Returns:
        One ``AllObjectInfo`` per review returned by the model, each carrying
        the product's title and description. Returns an empty list if the
        request fails (the error is printed, not raised).
    """
    async with semaphore:
        prompt = f"""
        Write {n} realistic but detailed/specific product reviews that might show up on a hardware store's website.

        The reviews should be about the following product:
        Product Title: {product.title}
        Product Description: {product.description}
        
        Add many relevant and concrete facts about the products (this is for synthetic data generation, make up facts about each product as necessary).

        To see the format of a possible review, here is a review for a saw:
        ```
        I've enjoyed using this saw. It is lightweight and the battery lasts longer than other brands.
        I've been using it for 3 years now and it has been very durable. It was twice as expensive as the PX-500. But
        it is comfortable to hold because of the light weight.
        ```

        Respond only with the reviews, and nothing else.
        """

        try:
            result = await async_client.chat.completions.create(
                model="gpt-4o",
                response_model=List[Review],
                messages=[{"role": "user", "content": prompt}],
            )
            return [
                AllObjectInfo(
                    product_title=product.title,
                    product_description=product.description,
                    review=r.review,
                )
                for r in result
            ]

        except Exception as e:
            # Best effort: one failed product must not abort the whole batch.
            # The message names the product so the failure can be traced.
            print(f"Error generating reviews for {product.title!r}: {str(e)}")
            return []


async def create_synthetic_reviews(
    max_concurrency: int = 20, reviews_per_product: int = reviews_per_product
) -> List[AllObjectInfo]:
    """Generate reviews for every product in the module-level ``objects`` list.

    Args:
        max_concurrency: Maximum number of review requests in flight at once.
            Must be at least 1.
        reviews_per_product: Number of reviews requested per product. The
            default is the value of the module-level ``reviews_per_product``
            at the time this function is defined.

    Returns:
        A flat list with the reviews of all products whose request succeeded.

    Raises:
        ValueError: If ``max_concurrency`` is less than 1.
    """
    if max_concurrency < 1:
        # Semaphore(0) would block every task forever; fail fast instead.
        raise ValueError("max_concurrency must be at least 1")

    products = list(objects)
    semaphore = asyncio.Semaphore(max_concurrency)
    tasks = [make_reviews(p, reviews_per_product, semaphore) for p in products]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    out: List[AllObjectInfo] = []
    for product, result in zip(products, results):
        # gather(return_exceptions=True) can also hand back CancelledError,
        # which is a BaseException and not an Exception; treat it as a failure
        # too rather than passing it to extend().
        if isinstance(result, BaseException):
            print(f"Skipping reviews for {product.title!r}: {result!r}")
            continue
        out.extend(result)
    return out


# Full-size run using the function defaults (kept for reference, not executed):
# reviews = await create_synthetic_reviews()
# Reduced run: at most 2 concurrent requests and 2 reviews per product.
reviews = await create_synthetic_reviews(max_concurrency=2, reviews_per_product=2)

In [29]:
reviews[:5]

[AllObjectInfo(product_title='Cordless Drill', product_description='This powerful cordless drill features an ergonomic design perfect for all-day use. With 20 torque settings and a lithium-ion battery, it offers unmatched versatility and efficiency in any drilling task.', review="I've been using this cordless drill for the past 6 months, and it's been a game-changer for my DIY projects. The 20 torque settings allow me to adjust the power precisely for each material I'm working with, whether it's wood, metal, or plastic. The lithium-ion battery charges quickly, often taking less than an hour to reach full capacity, and it easily lasts through a full day of work without needing a recharge. The ergonomic design is truly comfortable, reducing fatigue during extended use. One feature I particularly appreciate is the built-in LED light, which illuminates the work area perfectly, making it convenient to work in low-light conditions. Compared to my old drill, this one is significantly quieter 

Store the items to be retrieved in LanceDB

In [30]:
# Connect to the LanceDB database stored at the relative path ./lancedb.
db = lancedb.connect("./lancedb")
# OpenAI `text-embedding-3-small` embedding function from LanceDB's registry;
# the table schemas below use it for their source and vector fields.
func = get_registry().get("openai").create(name="text-embedding-3-small")

In [35]:
rinspect(db, help=True)

In [34]:
rinspect(func, help=True)

In [39]:
rinspect(func.VectorField, help=True)

In [37]:
rinspect(func.SourceField, help=True)

In [43]:
rinspect(LanceModel, help=True)

In [41]:
# LanceDB table schema for products.
# NOTE(review): every text column is declared as a SourceField; confirm which
# of them LanceDB actually embeds into `vector`.
class Products(LanceModel):
    id: str = func.SourceField()  # row position in `objects`, stored as a string
    title: str = func.SourceField()
    description: str = func.SourceField()
    # Embedding column whose size comes from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()

In [44]:
# Create the products table from the schema; mode="overwrite" is passed so a
# re-run starts from a fresh table.
products_table = db.create_table("products", schema=Products, mode="overwrite")

[2024-08-06T19:55:39Z WARN  lance::dataset] No existing dataset at /home/msivanes/Documents/1Projects/systematically-improving-rag/week1_bootstrap_evals/lancedb/products.lance, it will be created


In [45]:
products_table

LanceTable(connection=LanceDBConnection(/home/msivanes/Documents/1Projects/systematically-improving-rag/week1_bootstrap_evals/lancedb), name="products")

In [47]:
# One plain dict per product, using its position in `objects` (as a string)
# for the id.
products_data = [
    dict(id=str(row_number), title=item.title, description=item.description)
    for row_number, item in enumerate(objects)
]

In [48]:
products_data[:5]

[{'id': '0',
  'title': 'Cordless Drill',
  'description': 'This powerful cordless drill features an ergonomic design perfect for all-day use. With 20 torque settings and a lithium-ion battery, it offers unmatched versatility and efficiency in any drilling task.'},
 {'id': '1',
  'title': 'Cordless Drill',
  'description': 'Our lightweight cordless drill comes equipped with a flexible LED work light to illuminate your workspace. The 18V battery provides ample power for tough materials, making it a great choice for DIY enthusiasts and professionals alike.'},
 {'id': '2',
  'title': 'Cordless Drill',
  'description': 'Engineered for precision, this cordless drill has a compact design that allows for maximum maneuverability in tight spaces. It includes a built-in battery indicator to show remaining power, ensuring you never run out of charge unexpectedly.'},
 {'id': '3',
  'title': 'Adjustable Wrench',
  'description': 'This adjustable wrench is made of high-strength steel, ensuring durab

In [49]:
# Insert the product rows. No `vector` values are supplied here; presumably
# LanceDB fills them using the schema's embedding function (confirm).
products_table.add(products_data)

In [50]:
rinspect(products_table.create_fts_index, help=True)

In [53]:
# Build a full-text-search index over the description column; replace=True
# lets the cell be re-run.
products_table.create_fts_index('description', replace=True)

In [55]:
# Map each product title to an id read back from the table.
# NOTE(review): titles are deliberately repeated across products, so later
# rows overwrite earlier ones and only one id per title survives. Confirm that
# is intended before using this map to look up products.
product_id_map = {
    p["title"]: p["id"] for p in products_table.to_pandas().to_dict("records")
}

In [56]:
product_id_map

{'Cordless Drill': '2', 'Adjustable Wrench': '5', 'Portable Workbench': '8'}

In [57]:
# Rebuild the full-text-search index over both title and description by
# passing a list of column names; replace=True overwrites the earlier index.
# TODO: confirm that passing a list is the supported way to index multiple
# fields in LanceDB.
products_table.create_fts_index(["title", "description"], replace=True)

In [58]:
# Map each product title to an id read back from the table.
# NOTE(review): titles are deliberately repeated across products, so later
# rows overwrite earlier ones and only one id per title survives. Confirm that
# is intended before using this map to look up products.
product_id_map = {
    p["title"]: p["id"] for p in products_table.to_pandas().to_dict("records")
}

In [59]:
# LanceDB table schema for reviews; each row repeats the product's title and
# description next to the review text.
# NOTE(review): every text column is declared as a SourceField; confirm which
# of them LanceDB actually embeds into `vector`.
class Reviews(LanceModel):
    id: str = func.SourceField()  # row position in `reviews`, stored as a string
    product_title: str = func.SourceField()
    product_description: str = func.SourceField()
    review: str = func.SourceField()
    # Embedding column whose size comes from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()

In [60]:
# Create the reviews table from the schema; mode="overwrite" is passed so a
# re-run starts from a fresh table.
reviews_table = db.create_table("reviews", schema=Reviews, mode="overwrite")

[2024-08-06T20:30:34Z WARN  lance::dataset] No existing dataset at /home/msivanes/Documents/1Projects/systematically-improving-rag/week1_bootstrap_evals/lancedb/reviews.lance, it will be created


In [61]:
# One plain dict per review. Despite the variable name, no product id is
# attached: "id" is the review's own position in `reviews`, as a string.
reviews_with_product_id = [
    dict(
        id=str(row_number),
        product_title=entry.product_title,
        product_description=entry.product_description,
        review=entry.review,
    )
    for row_number, entry in enumerate(reviews)
]

In [63]:
reviews_with_product_id[:5]

[{'id': '0',
  'product_title': 'Cordless Drill',
  'product_description': 'This powerful cordless drill features an ergonomic design perfect for all-day use. With 20 torque settings and a lithium-ion battery, it offers unmatched versatility and efficiency in any drilling task.',
  'review': "I've been using this cordless drill for the past 6 months, and it's been a game-changer for my DIY projects. The 20 torque settings allow me to adjust the power precisely for each material I'm working with, whether it's wood, metal, or plastic. The lithium-ion battery charges quickly, often taking less than an hour to reach full capacity, and it easily lasts through a full day of work without needing a recharge. The ergonomic design is truly comfortable, reducing fatigue during extended use. One feature I particularly appreciate is the built-in LED light, which illuminates the work area perfectly, making it convenient to work in low-light conditions. Compared to my old drill, this one is significa

In [64]:
# Insert the review rows. No `vector` values are supplied here; presumably
# LanceDB fills them using the schema's embedding function (confirm).
reviews_table.add(reviews_with_product_id)

In [65]:
# Build a full-text-search index over the review text; replace=True lets the
# cell be re-run.
reviews_table.create_fts_index("review", replace=True)

In [66]:
# Connect to the LanceDB database stored at the relative path ./lancedb.
db = lancedb.connect("./lancedb")
# OpenAI `text-embedding-3-small` embedding function from LanceDB's registry;
# the table schemas below use it for their source and vector fields.
func = get_registry().get("openai").create(name="text-embedding-3-small")


# LanceDB table schema for products.
# NOTE(review): every text column is declared as a SourceField; confirm which
# of them LanceDB actually embeds into `vector`.
class Products(LanceModel):
    id: str = func.SourceField()  # row position in `objects`, stored as a string
    title: str = func.SourceField()
    description: str = func.SourceField()
    # Embedding column whose size comes from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()


# Create the products table; mode="overwrite" is passed so a re-run starts
# from a fresh table.
products_table = db.create_table("products", schema=Products, mode="overwrite")
# One plain dict per product, using its position in `objects` (as a string)
# for the id.
products_data = [
    {"id": f"{i}", "title": obj.title, "description": obj.description}
    for i, obj in enumerate(objects)
]
products_table.add(products_data)
# Full-text-search index over the description column only.
products_table.create_fts_index("description", replace=True)
# NOTE(review): titles are deliberately repeated across products, so later
# rows overwrite earlier ones and only one id per title survives in this map.
# Confirm that is intended before using it to look up products.
product_id_map = {
    p["title"]: p["id"] for p in products_table.to_pandas().to_dict("records")
}


# LanceDB table schema for reviews; each row repeats the product's title and
# description next to the review text.
# NOTE(review): every text column is declared as a SourceField; confirm which
# of them LanceDB actually embeds into `vector`.
class Reviews(LanceModel):
    id: str = func.SourceField()  # row position in `reviews`, stored as a string
    product_title: str = func.SourceField()
    product_description: str = func.SourceField()
    review: str = func.SourceField()
    # Embedding column whose size comes from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()


# Create the reviews table; mode="overwrite" is passed so a re-run starts
# from a fresh table.
reviews_table = db.create_table("reviews", schema=Reviews, mode="overwrite")

# One plain dict per review. Despite the variable name, no product id is
# attached: "id" is the review's own position in `reviews`, as a string.
reviews_with_product_id = [
    {
        "id": f"{i}",
        "product_title": review.product_title,
        "product_description": review.product_description,
        "review": review.review,
    }
    for i, review in enumerate(reviews)
]
reviews_table.add(reviews_with_product_id)
# Full-text-search index over the review text.
reviews_table.create_fts_index("review", replace=True)

In case you want to see the data quickly in a text editor, we also store the data in JSON.

In [4]:
# Write all reviews to ./reviews.json for quick inspection in a text editor.
# model_dump() replaces the deprecated Pydantic v1-style .dict(); indent=2
# keeps the file readable, and the JSON content is otherwise unchanged.
with open("./reviews.json", "w", encoding="utf-8") as f:
    json.dump([review.model_dump() for review in reviews], f, indent=2)