In [1]:
import json
import os
from typing import Dict, List

from bs4 import BeautifulSoup
from bs4.element import Tag
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core.schema import Document, ImageDocument
from llama_index.core.storage.storage_context import StorageContext
from llama_index.vector_stores.postgres import PGVectorStore
import marvin
from marvin.beta.assistants import Thread
import pandas as pd
import psycopg2
from pydantic import BaseModel
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from sqlalchemy import make_url

from app.assistants import customer_assistant, CustomerAssistant, instructions
from app.tools import get_products, playback_audio

In [2]:
# Fail fast when the secrets this notebook needs are missing from the environment.
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")

assert "OPENAI_API_KEY" in os.environ, "OPENAI_API_KEY is not set"
assert POSTGRES_PASSWORD is not None, "POSTGRES_PASSWORD is not set"

## Product Information Extraction

In [3]:
# Nike listing page (men's Jordan shoes) that the scraping cell below opens with Selenium.
url: str = "https://www.nike.com/w/mens-jordan-shoes-37eefznik1zy7ok"

In [4]:
# Scrape the listing page once and cache the result as CSV; later runs only read the cache.
if not os.path.exists("../data/products.csv"):
    def _optional_text(card, selector: str) -> str:
        """Return the textContent of `selector` inside `card`, or "N/A" when the card has no such element."""
        try:
            return card.find_element(By.CSS_SELECTOR, selector).get_attribute('textContent')

        except NoSuchElementException:
            return "N/A"

    browser: WebDriver = webdriver.Chrome()

    # Always quit the driver, even when scraping fails part-way, so no Chrome
    # process is left running after this cell.
    try:
        browser.get(url)

        product_position = 1 # product cards are addressed via data-product-position="1", "2", ...
        products = []

        while True:
            try:
                product_card = browser.find_element(By.CSS_SELECTOR, f'[data-product-position="{product_position}"]')
                # Move to the card before reading it (scrolls it into view).
                ActionChains(browser).move_to_element(product_card).perform()

                link: str = product_card.find_element(By.CSS_SELECTOR, '.product-card__link-overlay').get_attribute('href')
                image_url: str = product_card.find_element(By.CSS_SELECTOR, '.product-card__hero-image').get_attribute('src')
                title: str = product_card.find_element(By.CSS_SELECTOR, '.product-card__title').get_attribute('textContent')
                subtitle: str = product_card.find_element(By.CSS_SELECTOR, '.product-card__subtitle').get_attribute('textContent')

                # Colour count and price are not present on every card.
                count: str = _optional_text(product_card, '.product-card__product-count')
                price: str = _optional_text(product_card, '.product-card__price')

                # Raw card values, keyed by the CSS selector they were read from.
                product = {
                    'position': product_position,
                    '.product-card__link-overlay': {
                        'href': link
                    },
                    '.product-card__hero-image': {
                        'src': image_url
                    },
                    '.product-card__title': title,
                    '.product-card__subtitle': subtitle,
                    '.product-card__product-count': count,
                    '.product-card__price': price,
                }

                # Visit the product page to read its JSON-LD block.
                browser.get(link)
                link_soup: BeautifulSoup = BeautifulSoup(browser.page_source, 'html.parser')

                json_ld_scripts: List[Tag] = link_soup.find_all('script', {'type':'application/ld+json'})

                # Only pages with exactly one JSON-LD script are kept; others are reported and skipped.
                if len(json_ld_scripts) == 1:
                    json_ld_script: Tag = json_ld_scripts[0]
                    json_ld_script_text: str = json_ld_script.get_text(strip=True)
                    json_ld_data: Dict = json.loads(json_ld_script_text)

                    product["json_ld_script_text"] = json_ld_script_text

                    product["name"] = product[".product-card__title"]
                    product["description"] = json_ld_data["description"]

                    # Display price exactly as shown on the card (may include discount text).
                    product["price"] = product[".product-card__price"]

                    # Numeric prices come from the JSON-LD offers, which take one of two shapes:
                    # 'offers': {'@type': 'Offer', 'priceCurrency': 'USD', 'price': '200'
                    # 'offers': {'@type': 'AggregateOffer', 'lowPrice': '107.97', 'highPrice': '165',
                    if "offers" in json_ld_data:
                        offers = json_ld_data["offers"]

                        if "price" in offers:
                            # A single price is stored as both the low and the high bound.
                            currency = offers['priceCurrency']
                            product[f"price_{currency}_low"] = offers["price"]
                            product[f"price_{currency}_high"] = offers["price"]

                        elif "lowPrice" in offers and "highPrice" in offers:
                            currency = offers['priceCurrency']
                            product[f"price_{currency}_low"] = offers["lowPrice"]
                            product[f"price_{currency}_high"] = offers["highPrice"]

                    product["image_url"] = product[".product-card__hero-image"]["src"]

                    try:
                        product["rating"] = json_ld_data["aggregateRating"]["ratingValue"]

                    except KeyError:
                        product["rating"] = "N/A"

                    products.append(product)

                else:
                    print(f"Skipping product at position {product_position} because there are {len(json_ld_scripts)} json-ld scripts")

                browser.back()
                product_position += 1

            except NoSuchElementException:
                # No card at this position (or a required card element is missing): stop scraping.
                break

    finally:
        browser.quit()

    products_df = pd.DataFrame(products)
    products_df.to_csv("../data/products.csv", index=False)

# Always load from the CSV so a fresh scrape and a cached run yield the same frame.
products_df = pd.read_csv("../data/products.csv")
products_df.head()

Unnamed: 0,position,.product-card__link-overlay,.product-card__hero-image,.product-card__title,.product-card__subtitle,.product-card__product-count,.product-card__price,json_ld_script_text,name,description,price,price_USD_low,price_USD_high,image_url,rating
0,1,{'href': 'https://www.nike.com/t/air-jordan-4-...,{'src': 'https://static.nike.com/a/images/c_li...,"Air Jordan 4 Retro ""Oxidized Green""",Men's Shoes,1 Color,$215,"{\n ""@context"": ""https://schema.org"",\n ...","Air Jordan 4 Retro ""Oxidized Green""","Find the Air Jordan 4 Retro ""Oxidized Green"" a...",$215,215.0,215.0,"https://static.nike.com/a/images/c_limit,w_592...",
1,3,{'href': 'https://www.nike.com/t/jordan-spizik...,{'src': 'https://static.nike.com/a/images/c_li...,Jordan Spizike Low,Men's Shoes,,$160,"{\n ""@context"": ""https://schema.org"",\n ...",Jordan Spizike Low,Find the Jordan Spizike Low at Nike.com.,$160,160.0,160.0,"https://static.nike.com/a/images/c_limit,w_592...",4.8
2,4,{'href': 'https://www.nike.com/t/air-jordan-6-...,{'src': 'https://static.nike.com/a/images/c_li...,"Air Jordan 6 Retro ""White/Black""",Men's Shoes,1 Color,$200,"{\n ""@context"": ""https://schema.org"",\n ...","Air Jordan 6 Retro ""White/Black""","Find the Air Jordan 6 Retro ""White/Black"" at N...",$200,200.0,200.0,"https://static.nike.com/a/images/c_limit,w_592...",4.8
3,5,{'href': 'https://www.nike.com/t/tatum-2-denim...,{'src': 'https://static.nike.com/a/images/c_li...,"Tatum 2 ""Denim""",Basketball Shoes,,$125,"{\n ""@context"": ""https://schema.org"",\n ...","Tatum 2 ""Denim""","Find the Tatum 2 ""Denim"" at Nike.com.",$125,125.0,125.0,"https://static.nike.com/a/images/c_limit,w_592...",4.6
4,6,{'href': 'https://www.nike.com/t/air-jordan-xx...,{'src': 'https://static.nike.com/a/images/c_li...,Air Jordan XXXVIII Low,Basketball Shoes,,$175,"{\n ""@context"": ""https://schema.org"",\n ...",Air Jordan XXXVIII Low,Find the Air Jordan XXXVIII Low at Nike.com.,$175,175.0,175.0,"https://static.nike.com/a/images/c_limit,w_592...",4.6


## Image Data Augmentation

In [5]:
# Ask the model for visual attributes of every product image once, and cache the result as CSV.
if not os.path.exists("../data/augmented_products.csv"):
    # Schema of the attributes requested for each image.
    class AugmentedVisualAttributes(BaseModel):
        caption: str
        classification: str
        color: str
        item_type: str
        materials: str
        style: str

    image_urls = products_df["image_url"].tolist()

    extracted_features = marvin.extract.map(
        image_urls,
        target=AugmentedVisualAttributes,
        instructions="Please provide a caption, classification, color, item type, materials, and style for the image.",
    )

    # One new column per attribute, taken from the first extraction result of each image.
    augmented_products_df = products_df.assign(**{
        column: [getattr(features[0], column) for features in extracted_features]
        for column in ("caption", "classification", "color", "item_type", "materials", "style")
    })
    augmented_products_df.to_csv("../data/augmented_products.csv", index=False)

# From here on, products_df refers to the augmented frame loaded from the cache.
products_df = pd.read_csv('../data/augmented_products.csv')
products_df.head()

Unnamed: 0,position,.product-card__link-overlay,.product-card__hero-image,.product-card__title,.product-card__subtitle,.product-card__product-count,.product-card__price,json_ld_script_text,name,description,...,price_USD_low,price_USD_high,image_url,rating,caption,classification,color,item_type,materials,style
0,1,{'href': 'https://www.nike.com/t/air-jordan-4-...,{'src': 'https://static.nike.com/a/images/c_li...,"Air Jordan 4 Retro ""Oxidized Green""",Men's Shoes,1 Color,$215,"{\n ""@context"": ""https://schema.org"",\n ...","Air Jordan 4 Retro ""Oxidized Green""","Find the Air Jordan 4 Retro ""Oxidized Green"" a...",...,215.0,215.0,"https://static.nike.com/a/images/c_limit,w_592...",,Nike Air Jordan 4 Retro Oxidized Green Men's S...,Footwear,Oxidized Green,Shoes,"Leather, Rubber, Synthetic","Athletic, Retro"
1,3,{'href': 'https://www.nike.com/t/jordan-spizik...,{'src': 'https://static.nike.com/a/images/c_li...,Jordan Spizike Low,Men's Shoes,,$160,"{\n ""@context"": ""https://schema.org"",\n ...",Jordan Spizike Low,Find the Jordan Spizike Low at Nike.com.,...,160.0,160.0,"https://static.nike.com/a/images/c_limit,w_592...",4.8,Jordan Spizike Low Men's Shoes,Footwear,White/Black/Red,Shoes,Leather/Synthetic,Athletic/Casual
2,4,{'href': 'https://www.nike.com/t/air-jordan-6-...,{'src': 'https://static.nike.com/a/images/c_li...,"Air Jordan 6 Retro ""White/Black""",Men's Shoes,1 Color,$200,"{\n ""@context"": ""https://schema.org"",\n ...","Air Jordan 6 Retro ""White/Black""","Find the Air Jordan 6 Retro ""White/Black"" at N...",...,200.0,200.0,"https://static.nike.com/a/images/c_limit,w_592...",4.8,Air Jordan 6 Retro White Black Men's Shoes,Footwear,"White, Black",Shoes,"Leather, Rubber","Athletic, Retro"
3,5,{'href': 'https://www.nike.com/t/tatum-2-denim...,{'src': 'https://static.nike.com/a/images/c_li...,"Tatum 2 ""Denim""",Basketball Shoes,,$125,"{\n ""@context"": ""https://schema.org"",\n ...","Tatum 2 ""Denim""","Find the Tatum 2 ""Denim"" at Nike.com.",...,125.0,125.0,"https://static.nike.com/a/images/c_limit,w_592...",4.6,Nike Tatum 2 Denim Basketball Shoes,Footwear,Denim Blue,Shoes,"Denim, Rubber, Synthetic",Athletic
4,6,{'href': 'https://www.nike.com/t/air-jordan-xx...,{'src': 'https://static.nike.com/a/images/c_li...,Air Jordan XXXVIII Low,Basketball Shoes,,$175,"{\n ""@context"": ""https://schema.org"",\n ...",Air Jordan XXXVIII Low,Find the Air Jordan XXXVIII Low at Nike.com.,...,175.0,175.0,"https://static.nike.com/a/images/c_limit,w_592...",4.6,Air Jordan XXXVIII Low Basketball Shoes,Footwear,White/Black/Red,Shoes,Leather/Textile/Rubber,Athletic


## Conversational Chatbot

In [6]:
def create_index():
    """Build a multi-modal vector index over the products in the notebook-level `products_df`.

    For every product row an ImageDocument (pointing at the product image URL) and a
    text Document (caption plus JSON-encoded metadata) are created. The `vector_db`
    database is created if missing, the two llama_index tables are dropped, and the
    documents are indexed into fresh PGVector text and image stores.

    Reads the notebook-level `products_df` and `POSTGRES_PASSWORD`.

    Returns:
        The MultiModalVectorStoreIndex built from the documents.
    """
    def parse_doc_info(row: pd.Series) -> Dict:
        """Map one product row to the id, caption, metadata and image URI of its documents."""
        return {
            'id': f'id{row["position"]}',
            'caption': row['caption'],
            'metadata': {
                'classification': row['classification'],
                'color': row['color'],
                'description': row['description'],
                'item_type': row['item_type'],
                'materials': row['materials'],
                'name': row['name'],
                'price': str(row['price']), # TODO: do this above or accommodate for N/A to float nan conversion
                'price_USD_high': str(row['price_USD_high']),
                'price_USD_low': str(row['price_USD_low']),
                'rating': str(row['rating']), # TODO: do this above or accommodate for N/A to float nan conversion
                'style': row['style']
            },
            'uri': row['image_url']
        }

    doc_infos = products_df.apply(parse_doc_info, axis=1).tolist()

    image_documents = [
        ImageDocument(
            doc_id=info['id'],
            extra_info=info['metadata'],
            image_url=info['uri'],
        )
        for info in doc_infos
    ]

    # The text document of a product shares the doc_id and metadata of its image document.
    text_documents = [
        Document(
            doc_id=info['id'],
            extra_info=info['metadata'],
            text=f"Caption: {info['caption']}; Metadata: {json.dumps(info['metadata'])}",
        )
        for info in doc_infos
    ]

    documents = image_documents + text_documents

    connection_string = f"postgresql://postgres:{POSTGRES_PASSWORD}@localhost:5432"
    db_name = "vector_db"

    # Create the database if it does not exist yet. The connection is closed
    # explicitly: psycopg2 does not close it when it goes out of scope of a `with`.
    conn = psycopg2.connect(connection_string)

    try:
        conn.autocommit = True  # CREATE DATABASE cannot run inside a transaction block

        with conn.cursor() as c:
            c.execute(f"CREATE DATABASE {db_name}")

    except psycopg2.errors.DuplicateDatabase:
        pass

    finally:
        conn.close()

    # Drop the tables of any previous run so the index is rebuilt from scratch.
    conn = psycopg2.connect(f"{connection_string}/{db_name}")

    try:
        conn.autocommit = True

        with conn.cursor() as c:
            c.execute(f"DROP TABLE IF EXISTS {db_name}.public.data_llama_index_image_node_collection")
            c.execute(f"DROP TABLE IF EXISTS {db_name}.public.data_llama_index_text_node_collection")

    finally:
        conn.close()

    db_url = make_url(connection_string)

    text_store = PGVectorStore.from_params(
        database=db_name,
        host=db_url.host,
        password=db_url.password,
        port=db_url.port,
        user=db_url.username,
        table_name="llama_index_text_node_collection",
        embed_dim=1536,  # openai embedding dimension
    )

    image_store = PGVectorStore.from_params(
        database=db_name,
        host=db_url.host,
        password=db_url.password,
        port=db_url.port,
        user=db_url.username,
        table_name="llama_index_image_node_collection",
        embed_dim=512,  # image embedding dimension — presumably that of the index's default image embedder; TODO confirm
    )

    storage_context = StorageContext.from_defaults(
        vector_store=text_store, image_store=image_store,
    )

    return MultiModalVectorStoreIndex.from_documents(
        documents=documents,
        storage_context=storage_context,
        show_progress=True,
    )

# Build the index when this cell runs. create_index() drops the two llama_index
# tables first, so every run rebuilds the index from scratch.
index = create_index()

Parsing nodes:   0%|          | 0/176 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/88 [00:00<?, ?it/s]

Generating image embeddings:   0%|          | 0/88 [00:00<?, ?it/s]

In [7]:
# Example: call get_products for products that are blue and rated at least 4.5,
# returning the top 3 textually similar products.

get_products(
    color="Blue",
    min_rating=4.5, # TODO: products whose rating is "nan" (missing) can still show up — presumably the rating filter does not exclude them; verify in app.tools.get_products
    text_similarity_top_k=3,
)

[Product(name='Luka 2', price='$97.97$13024% off24% off', caption='Caption: Nike Luka 2 Basketball Shoes; Metadata: {"classification": "Footwear", "color": "White and Blue", "description": "Find the Luka 2 at Nike.com. ", "item_type": "Shoes", "materials": "Synthetic and Rubber", "name": "Luka 2", "price": "$97.97$13024% off24% off", "price_USD_high": "97.97", "price_USD_low": "71.97", "rating": "4.7", "style": "Athletic"}', classification='Footwear', color='White and Blue', item_type='Shoes', materials='Synthetic and Rubber', rating='4.7', style='Athletic'),
 Product(name='Luka 2 Team Bank', price='$71.97$13044% off44% off', caption='Caption: Nike Luka 2 Team Bank Basketball Shoes; Metadata: {"classification": "Footwear", "color": "White and Blue", "description": "Find the Luka 2 Team Bank at Nike.com. ", "item_type": "Shoes", "materials": "Synthetic and Rubber", "name": "Luka 2 Team Bank", "price": "$71.97$13044% off44% off", "price_USD_high": "71.97", "price_USD_low": "71.97", "rati

In [8]:
# Example: call get_products with a reference image and return the top 3 visually
# similar products. text_similarity_top_k=0 requests no text-based matches.

get_products(
    image_url="https://static.nike.com/a/images/c_limit,w_592,f_auto/t_product_v1/u_126ab356-44d8-4a06-89b4-fcdcc8df0245,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/5a862151-1d12-41ab-a8a7-acaa1fbe35cf/jordan-6-rings-mens-shoes-PFKJm7.png",
    image_similarity_top_k=3,
    text_similarity_top_k=0,
)

[Product(name='Air Jordan 1 Elevate High', price='$145', caption=None, classification='Footwear', color='White/Black', item_type='Shoes', materials='Leather/Rubber', rating='4.4', style='Athletic/Casual'),
 Product(name='Air Jordan 1 Low OG "Silver"', price='$140', caption=None, classification='Footwear', color='Silver', item_type='Shoes', materials='Leather, Rubber', rating='4.8', style='Athletic, Casual'),
 Product(name='Jordan Hydro 4 Retro', price='$65Extra 25% Off w/ CHILL25', caption=None, classification='Footwear', color='Black and Red', item_type='Slides', materials='Synthetic', rating='4.9', style='Athletic')]

In [9]:
# Example of calling the get_products function to get products where the price is between $100 and $150, returning the top 3 textually similar products.

get_products(
    min_price=100,
    max_price=150,
    text_similarity_top_k=3,
)

[Product(name='Jordan One Take 5 Quai 54', price='$100', caption='Caption: Jordan One Take 5 Quai 54 Basketball Shoes; Metadata: {"classification": "Footwear", "color": "Multicolor", "description": "Find the Jordan One Take 5 Quai 54 at Nike.com. ", "item_type": "Shoes", "materials": "Synthetic, Rubber", "name": "Jordan One Take 5 Quai 54", "price": "$100", "price_USD_high": "100.0", "price_USD_low": "100.0", "rating": "nan", "style": "Athletic"}', classification='Footwear', color='Multicolor', item_type='Shoes', materials='Synthetic, Rubber', rating='nan', style='Athletic'),
 Product(name='Jordan Air Ship PE SP', price='$140', caption='Caption: Jordan Air Ship PE SP Men\'s Shoes; Metadata: {"classification": "Footwear", "color": "White/Red", "description": "Find the Jordan Air Ship PE SP at Nike.com. ", "item_type": "Shoes", "materials": "Leather/Textile/Rubber", "name": "Jordan Air Ship PE SP", "price": "$140", "price_USD_high": "140.0", "price_USD_low": "140.0", "rating": "5.0", "st

In [10]:
# Example: ask the customer assistant for green shoes rated at least 4.9.
# The assistant is given the get_products function as a tool.

# customer_assistant.clear_default_thread()
thread = Thread(id=None) # Create a new thread

# Alternative that uses the pre-built assistant imported from app.assistants:
# run = customer_assistant.say("Do you have any green shoes that are rated at least 4.9?", thread=thread)

# Build a dedicated assistant so its tool list is set here; playback_audio is left disabled.
_customer_assistant = CustomerAssistant(
    name="Customer Assistant",
    instructions=instructions,
    tools=[
        get_products,
        # playback_audio,
    ],
)

run = _customer_assistant.say("Do you have any green shoes that are rated at least 4.9?", thread=thread) # TODO: figure out why the assistant's response is rendered more than once in the notebook output

Output()

Output()

ALSA lib pcm_dmix.c:999:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2666:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2666:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2666:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_dmix.c:999:(snd_pcm_dmix_open) unable to open slave


## Text-to-Speech Conversion

In [11]:
# Re-open the thread of the run above by its id and fetch the messages stored on it.
thread_id = run.thread.id

thread = Thread(id=thread_id)
messages = thread.get_messages()
messages

[Message(id='msg_Pw4nlbjze9Icbrihf7x2xlpv', assistant_id=None, attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='Do you have any green shoes that are rated at least 4.9?'), type='text')], created_at=1718423368, incomplete_at=None, incomplete_details=None, metadata={}, object='thread.message', role='user', run_id=None, status=None, thread_id='thread_dABrQePZwetOuHtBo9ygKTFo'),
 Message(id='msg_CIJ3IUpTBgELJYahpZ1ci4rQ', assistant_id='asst_rdT5iFcXGBrDKNXGINcRqM2h', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='Here is a highly-rated green shoe you might be interested in:\n\n**Air Jordan 1 Low OG "Black/Gorge Green"**\n- **Price**: $140\n- **Materials**: Leather, Rubber\n- **Style**: Casual, Athletic\n- **Rating**: 4.9\n- **Description**: Nike Air Jordan 1 Low OG in Black and Gorge Green for Women.\n\nIf you have any other questions or need more information, feel free to ask!'), type='text')], cre

In [12]:
# Take the text value of the last content block of the last message in the thread.
# NOTE(review): assumes the final message is the assistant's reply and that its
# last content block is a text block — confirm.
last_message = messages[-1]
last_message_content_text_value = last_message.content[-1].text.value
last_message_content_text_value

'Here is a highly-rated green shoe you might be interested in:\n\n**Air Jordan 1 Low OG "Black/Gorge Green"**\n- **Price**: $140\n- **Materials**: Leather, Rubber\n- **Style**: Casual, Athletic\n- **Rating**: 4.9\n- **Description**: Nike Air Jordan 1 Low OG in Black and Gorge Green for Women.\n\nIf you have any other questions or need more information, feel free to ask!'

In [13]:
# Pass the reply text to playback_audio with the 'shimmer' voice
# (text-to-speech, per this section's heading).
playback_audio(last_message_content_text_value, voice='shimmer')