In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

import pandas as pd
import openai

In [None]:
# Read the items dataset developed in the previous notebook (JSON Lines: one record per line)
items_path = "../../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl"
df_items = pd.read_json(items_path, lines=True)

In [None]:
# Preview the first rows of the raw dataset (head() shows 5 rows by default)
df_items.head()

In [None]:
# Preprocess the dataset to prepare it for embedding into the vector database:
# the description field becomes the concatenation of title + features, and
# the image field holds the "large" URL of the first available image.

def preprocess_description(row):
    """Build the text to embed for one item: its title followed by its features.

    Args:
        row: Mapping-like item (a DataFrame row or a dict) providing
            "title" and "features".

    Returns:
        The title and every feature joined by single spaces. Entries that are
        not non-empty strings (None, NaN, "") are skipped, so the result never
        contains the literal text "None"/"nan" or a dangling separator.
    """
    title = row["title"]
    features = row["features"]

    if isinstance(features, str):
        # A bare string would otherwise be joined character by character.
        features = [features]
    elif not hasattr(features, "__iter__"):
        # Missing features (None / NaN) are not iterable: treat as "no features".
        features = []

    # Keep only real text so missing values do not leak into the embedding input.
    parts = [title, *features]
    return " ".join(part for part in parts if isinstance(part, str) and part)


def extract_first_large_image(row):
    """Return the "large" URL of the item's first image, or "" if there is none.

    Args:
        row: Mapping-like item (a DataFrame row or a dict) providing "images",
            which is indexed as a sequence of dicts.

    Returns:
        The first image's "large" value. The empty string when the item has no
        images, the first entry is not a dict, or "large" is missing or empty.
    """
    images = row["images"]
    try:
        first_image = images[0]
    except (IndexError, KeyError, TypeError):
        # Empty list or a missing value (None / NaN): no image to pick.
        # Without this guard a single such row aborts the whole DataFrame.apply.
        return ""

    if not isinstance(first_image, dict):
        return ""
    # `or ""` also covers an explicit null stored under "large".
    return first_image.get("large") or ""

# Derive both new columns row by row (axis=1) with the two helpers defined above
for column, builder in (
    ("description", preprocess_description),
    ("image", extract_first_large_image),
):
    df_items[column] = df_items.apply(builder, axis=1)

In [None]:
# Preview the dataset again, now with the derived "description" and "image" columns
df_items.head()

In [None]:
# Show one (index, description) pair as an example of the preprocessed text.
# next(iter(...)) yields the same first pair as list(...)[0] without first
# building a list of every row.
next(iter(df_items["description"].items()))

In [None]:
# Sample a small, reproducible subset of the items for faster development.
# The size is capped at the dataset length because sample() raises when asked
# for more rows than exist.
SAMPLE_SIZE = 50
df_sample = df_items.sample(min(SAMPLE_SIZE, len(df_items)), random_state=42)

# Only this subset of fields is stored in the vector database
PAYLOAD_FIELDS = [
    "description",
    "image",
    "rating_number",
    "price",
    "average_rating",
    "parent_asin",
]
# NOTE(review): missing values (e.g. a NaN price) are kept as-is in these
# records - confirm the payload upload accepts them.
data_to_embed = df_sample[PAYLOAD_FIELDS].to_dict(orient="records")

# Show a few records as an example; displaying the whole list floods the cell output
data_to_embed[:3]

In [None]:
# Default OpenAI embedding model for this notebook
model = "text-embedding-3-small"


# Embedding helper built on the OpenAI embeddings endpoint
def get_embedding(text, model=model):
    """Embed `text` with the OpenAI embeddings endpoint.

    Args:
        text: Input passed unchanged as `input` to `openai.embeddings.create`.
        model: Embedding model name; defaults to the value the notebook-level
            `model` had when this function was defined.

    Returns:
        The `embedding` attribute of the first item in the response's `data`.
    """
    request_args = {"input": text, "model": model}
    first_item = openai.embeddings.create(**request_args).data[0]
    return first_item.embedding

In [None]:
# Before proceeding, embed a probe sentence to find the size of the vectors the
# model produces: that size must be specified when the Qdrant collection is created.
probe_text = "This is a test embedding"
test_embedding = get_embedding(probe_text)
len(test_embedding)  # Should be 1536 for text-embedding-3-small

In [None]:
# Connect to the local Qdrant server. Make sure it is running first via docker compose:
# run `make run-docker-compose` in the root directory, then execute this cell.
qdrant_url = "http://localhost:6333"
qdrant_client = QdrantClient(url=qdrant_url)

In [None]:
# Name of the Qdrant collection that will store the data_to_embed items;
# the create, upsert and query cells below all refer to it.
collection_name = "Amazon-items-collection-00"

In [None]:
# Create the Qdrant collection that stores the Amazon items, unless it already
# exists: create_collection fails on an existing collection, which would break
# re-running the notebook against the same Qdrant server.
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            # Use the size measured from the probe embedding (1536 for
            # text-embedding-3-small) so it always matches the embedding model.
            size=len(test_embedding),
            distance=Distance.COSINE,
        ),
    )

In [None]:
# Embed the dataset records (data_to_embed) and wrap each one as a Qdrant point
pointstructs = [
    PointStruct(
        # the record's position in data_to_embed serves as the point id
        id=position,
        # the vector is the embedding of the description field
        vector=get_embedding(record["description"]),
        # all the record's fields are stored as payload metadata in Qdrant
        payload=record,
    )
    for position, record in enumerate(data_to_embed)
]

In [None]:
# Summarize the points to be uploaded. Displaying the whole list would print
# every full embedding vector, so show the count and a truncated first point.
first_point = pointstructs[0] if pointstructs else None
{
    "count": len(pointstructs),
    "first_point": first_point and {
        "id": first_point.id,
        "vector_head": first_point.vector[:5],
        "payload": first_point.payload,
    },
}

In [None]:
# Upload the prepared points into the Qdrant collection (upsert is called with wait=True)
upsert_request = {
    "collection_name": collection_name,
    "wait": True,
    "points": pointstructs,
}
qdrant_client.upsert(**upsert_request)

In [None]:
# Retrieval helper: look up the top-k most similar items in Qdrant for a query text
def retrieve_data(query, k=5):
    """Query the Qdrant collection with the embedding of a text query.

    Args:
        query: Query text; converted to a vector with `get_embedding`, the same
            function used to embed the stored descriptions.
        k: Passed as `limit` to `query_points` (default 5).

    Returns:
        Whatever `qdrant_client.query_points` returns, unmodified.
    """
    query_vector = get_embedding(query)
    return qdrant_client.query_points(
        collection_name=collection_name,
        query=query_vector,
        limit=k,
    )

In [None]:
# Finally, try the retrieval function on a sample question and show the returned points
sample_query = "What kind of charging cords do you offer?"
retrieve_data(sample_query, k=10).points