In [3]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, PayloadSchemaType, MatchAny, Prefetch, Filter, FieldCondition, FusionQuery

import pandas as pd
import openai
import json
import tiktoken

In [4]:

import os

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    api_key = input("Please enter your OpenAI API key: ").strip()
    os.environ["OPENAI_API_KEY"] = api_key
    if not api_key:
        raise ValueError("API key is required.")

### Load AMazon Dataset (items)

In [5]:
df_items = pd.read_json("../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

In [6]:
df_items_sample = df_items.sample(50, random_state=25)

### Define functions to preprocess title and features data and xtract image URL from the first large image in the list

In [8]:
def preprocess_items_data(row):
    return f"{row['title']} {' '.join(row['features'])}"

In [7]:
def extract_first_large_image(row):
    return row['images'][0].get('large', '')

In [9]:
df_items_sample["preprocessed_data"] = df_items_sample.apply(preprocess_items_data, axis=1)
df_items_sample["first_large_image"] = df_items_sample.apply(extract_first_large_image, axis=1)


In [10]:
df_items_sample.head(2)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,preprocessed_data,first_large_image
688,All Electronics,"Bluetooth Car Adapter, LDNIO Bluetooth FM Tran...",3.8,110,"[Fast Charging Type C, Multi Ports: USB Type C...",[],21.0,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Bluetooth FM adapter', 'url': 'htt...",LDNIO,"[Electronics, Portable Audio & Video, MP3 & MP...",{'Package Dimensions': '6.22 x 3.23 x 1.73 inc...,B09MCY26HF,,,,"Bluetooth Car Adapter, LDNIO Bluetooth FM Tran...",https://m.media-amazon.com/images/I/41gqwN41Wp...
49,Computers,"Bluetooth Multi-Device Keyboard, Dual Channel ...",4.6,199,[【Easy-Switch to 2 Devices】 Simply press the F...,"[MULTI-DEVICE BLUETOOTH KEYBOARD, SMOOTH TYPIN...",26.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Don’t do it!', 'url': 'https://www...",TechGarden,"[Electronics, Computers & Accessories, Compute...",{'Package Dimensions': '12.28 x 8.54 x 1.22 in...,B09T2J1LSB,,,,"Bluetooth Multi-Device Keyboard, Dual Channel ...",https://m.media-amazon.com/images/I/41U69pTP3j...


### Load Amazon dataset (reviews)

In [11]:
df_reviews = pd.read_json("../data/Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

In [12]:
len(df_reviews)

105918

In [13]:
df_reviews_sample = df_reviews[df_reviews['parent_asin'].isin(df_items_sample['parent_asin'])]

In [14]:
df_reviews_sample.head(2)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
18,5,So comfortable you can sleep in them,So comfortable that I even sleep with them eve...,[],B082D8VGMQ,B098K6N6TX,AFCV25KA7XSAJGTZRDML5B7UYOVQ,2020-10-04 03:27:23.549,1,True
40,5,Less filter choices does not always mean less ...,So I have a few ND filter sets for my mini 3 f...,[],B0B58CW7FK,B0C5HKTYVC,AHR4BEXIRURKL3POXW5RJFJLJDQQ,2022-10-26 16:14:29.401,3,False


### Define functions to preprocess reviews data

In [15]:
def preprocess_reviews_data(row):
    return f"{row['title']} {row['text']}"

In [16]:
def token_count(row, model="text-embedding-3-small"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(row['preprocessed_data']))

In [17]:
df_reviews_sample["preprocessed_data"] = df_reviews_sample.apply(preprocess_reviews_data, axis=1)
df_reviews_sample["preprocessed_data_token_count"] = df_reviews_sample.apply(token_count, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_sample["preprocessed_data"] = df_reviews_sample.apply(preprocess_reviews_data, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_sample["preprocessed_data_token_count"] = df_reviews_sample.apply(token_count, axis=1)


In [18]:
df_reviews_sample = df_reviews_sample[df_reviews_sample["preprocessed_data_token_count"] < 8192]

In [19]:
df_reviews_sample.head(2)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,preprocessed_data,preprocessed_data_token_count
18,5,So comfortable you can sleep in them,So comfortable that I even sleep with them eve...,[],B082D8VGMQ,B098K6N6TX,AFCV25KA7XSAJGTZRDML5B7UYOVQ,2020-10-04 03:27:23.549,1,True,So comfortable you can sleep in them So comfor...,36
40,5,Less filter choices does not always mean less ...,So I have a few ND filter sets for my mini 3 f...,[],B0B58CW7FK,B0C5HKTYVC,AHR4BEXIRURKL3POXW5RJFJLJDQQ,2022-10-26 16:14:29.401,3,False,Less filter choices does not always mean less ...,161


### Create a new Qdrant collection for items

In [21]:
qdrant_client = QdrantClient(url="http://localhost:6333")

qdrant_client.create_collection(
    collection_name="Amazon-items-collection-02-items",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
)

True

In [22]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-items",
    field_name="text",
    field_schema=PayloadSchemaType.TEXT
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [23]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-items",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

### Create a new Qdrant collection for reviews

In [24]:
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-02-reviews",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
)

True

In [25]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-reviews",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

### Embedding funtcions

In [26]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [27]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    
    if len(text_list) <= batch_size:
        response = openai.embeddings.create(input=text_list, model=model)
        return [embedding.embedding for embedding in response.data]
    
    all_embeddings = []
    counter = 1
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend([embedding.embedding for embedding in response.data])
        print(f"Processed {counter * batch_size} of {len(text_list)}")
        counter += 1
        
    return all_embeddings

### Embed the text data and add additional fields to the payload of each vector (items)

In [28]:
data_to_embed_items = df_items_sample[["preprocessed_data", "first_large_image", "rating_number", "price", "average_rating", "parent_asin"]].to_dict(orient="records")


In [29]:
text_to_embed_items = [data["preprocessed_data"] for data in data_to_embed_items]


In [30]:
embeddings_items = get_embeddings_batch(text_to_embed_items)


In [32]:
len(embeddings_items)

50

In [33]:
pointstructs = []
i = 1
for embedding, data in zip(embeddings_items, data_to_embed_items):
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={
                "text": data["preprocessed_data"],
                "first_large_image": data["first_large_image"],
                "average_rating": data["average_rating"],
                "rating_number": data["rating_number"],
                "price": data["price"],
                "parent_asin": data["parent_asin"],
            }
        )
    )
    i += 1

In [34]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-02-items",
    wait=True,
    points=pointstructs
)

UpdateResult(operation_id=4, status=<UpdateStatus.COMPLETED: 'completed'>)

### Embed the text data and add additional fields to the payload of each vector (reviews)


In [35]:
data_to_embed_reviews = df_reviews_sample[["preprocessed_data", "parent_asin"]].to_dict(orient="records")


In [36]:
text_to_embed_reviews = [data["preprocessed_data"] for data in data_to_embed_reviews]


In [37]:
embeddings_reviews = get_embeddings_batch(text_to_embed_reviews)


Processed 100 of 6038
Processed 200 of 6038
Processed 300 of 6038
Processed 400 of 6038
Processed 500 of 6038
Processed 600 of 6038
Processed 700 of 6038
Processed 800 of 6038
Processed 900 of 6038
Processed 1000 of 6038
Processed 1100 of 6038
Processed 1200 of 6038
Processed 1300 of 6038
Processed 1400 of 6038
Processed 1500 of 6038
Processed 1600 of 6038
Processed 1700 of 6038
Processed 1800 of 6038
Processed 1900 of 6038
Processed 2000 of 6038
Processed 2100 of 6038
Processed 2200 of 6038
Processed 2300 of 6038
Processed 2400 of 6038
Processed 2500 of 6038
Processed 2600 of 6038
Processed 2700 of 6038
Processed 2800 of 6038
Processed 2900 of 6038
Processed 3000 of 6038
Processed 3100 of 6038
Processed 3200 of 6038
Processed 3300 of 6038
Processed 3400 of 6038
Processed 3500 of 6038
Processed 3600 of 6038
Processed 3700 of 6038
Processed 3800 of 6038
Processed 3900 of 6038
Processed 4000 of 6038
Processed 4100 of 6038
Processed 4200 of 6038
Processed 4300 of 6038
Processed 4400 of 60

In [38]:
len(embeddings_reviews)

6038

In [39]:
pointstructs = []
i = 1
for embedding, data in zip(embeddings_reviews, data_to_embed_reviews):
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={
                "text": data["preprocessed_data"],
                "parent_asin": data["parent_asin"],
            }
        )
    )
    i += 1

In [40]:
batch_size_qdrant = 100
counter = 1
for i in range(0, len(pointstructs), batch_size_qdrant):
    batch = pointstructs[i:i + batch_size_qdrant]
    qdrant_client.upsert(
        collection_name="Amazon-items-collection-02-reviews",
        wait=True,
        points=batch
    )
    print(f"Processed {counter * batch_size_qdrant} of {len(pointstructs)}")
    counter += 1

Processed 100 of 6038
Processed 200 of 6038
Processed 300 of 6038
Processed 400 of 6038
Processed 500 of 6038
Processed 600 of 6038
Processed 700 of 6038
Processed 800 of 6038
Processed 900 of 6038
Processed 1000 of 6038
Processed 1100 of 6038
Processed 1200 of 6038
Processed 1300 of 6038
Processed 1400 of 6038
Processed 1500 of 6038
Processed 1600 of 6038
Processed 1700 of 6038
Processed 1800 of 6038
Processed 1900 of 6038
Processed 2000 of 6038
Processed 2100 of 6038
Processed 2200 of 6038
Processed 2300 of 6038
Processed 2400 of 6038
Processed 2500 of 6038
Processed 2600 of 6038
Processed 2700 of 6038
Processed 2800 of 6038
Processed 2900 of 6038
Processed 3000 of 6038
Processed 3100 of 6038
Processed 3200 of 6038
Processed 3300 of 6038
Processed 3400 of 6038
Processed 3500 of 6038
Processed 3600 of 6038
Processed 3700 of 6038
Processed 3800 of 6038
Processed 3900 of 6038
Processed 4000 of 6038
Processed 4100 of 6038
Processed 4200 of 6038
Processed 4300 of 6038
Processed 4400 of 60

### Perform hybrid search and perform rrf rank fusion on the retrieved results


In [41]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

def retrieve_data(query, k=5):

    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-items",
        prefetch=[
            Prefetch(
                query=query_embedding,
                limit=20
            ),
            Prefetch(
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="text",
                            match=MatchText(text=query)
                        )
                    ]
                ),
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k
    )

    return results

In [42]:
result = retrieve_data("earphones")


In [43]:
result.points


[ScoredPoint(id=3, version=4, score=1.0, payload={'text': 'Active Noise Cancelling Wireless Earbuds, Bluetooth in-Ear Headphones Built-in 4 Mic ENC Call, Deep Bass Ear Buds,IPX6 Waterproof Stereo Earphones for iPhone,Samsung,Laptop (Blue) ♪【ANC Noise Cancelling&Game Mode】: These premium noise-canceling earbuds adopts professional audio active noise canceling technology, and the binaural microphone blocks environmental noise and enhances the voice experience. M48 wireless earbuds are built to provide the exceptional mobile gaming audio experience with an ultra-low 45ms latency.Taps 5 times to enter game mode,through quick sound effect and ideal gaming audio you never miss any game sound effects. ♪【HiFi Stereo Sound & Superior Clear Call 】: Heaphones equipped with a high-sensitivity diaphragm and dual 13mm drivers to restore audio authenticity and voice dynamics, crystal clarity and deep resonance bass characteristics enable you to enjoy immersive sound quality. Call noise canceling brin

In [44]:
parent_asins = []
for data in result.points:
    parent_asins.append(data.payload["parent_asin"])

In [45]:
parent_asins


['B09NLTDHQ6', 'B098K6N6TX', 'B0C6KBJMHP', 'B0B1DM4Y5C', 'B09Q5W9HPQ']

### A function to run search agains reviews on a prefiltered set of product IDs


In [46]:
def retrieve_prefiltered_reviews_data(query, parent_asins, k=5):

    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-reviews",
        prefetch=[
            Prefetch(
                query=query_embedding,
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="parent_asin",
                            match=MatchAny(
                                any=parent_asins
                            )
                        )
                    ]
                ),
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k
    )

    return results

In [47]:
reviews = retrieve_prefiltered_reviews_data("bad quality", ['B09NLTDHQ6', 'B098K6N6TX', 'B0C6KBJMHP', 'B0B1DM4Y5C', 'B09Q5W9HPQ'], k=5)


In [48]:
reviews.points


[ScoredPoint(id=2767, version=29, score=0.5, payload={'text': 'sound quality is bad skip this one, sound is bad.', 'parent_asin': 'B09Q5W9HPQ'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=1040, version=12, score=0.33333334, payload={'text': 'Poor sound quality Very poor low sound-returning', 'parent_asin': 'B098K6N6TX'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=5668, version=58, score=0.25, payload={'text': 'Sound quality is fuzzy Didn’t like how stuff sounds on these, also the mic isn’t that good either but it works I guess..', 'parent_asin': 'B0C6KBJMHP'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=4745, version=49, score=0.2, payload={'text': 'Do not buy You get what you pay for. Low quality. Trouble connecting. Piece fell off after one week of use. I do not recommend.', 'parent_asin': 'B09Q5W9HPQ'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=5850, version=60, score=0.16666667, payload={'text': 'No