### Import Packages

In [18]:
import os

# Sanity check: confirm the parent directory ("..") is reachable from the
# current working directory, since later cells rely on "../" relative paths.
print(os.path.exists(".."))

True


In [19]:
import sys
from pathlib import Path

# Make the parent directory importable so the `app.*` packages used further
# down can be resolved. The path is resolved to an absolute location so it
# keeps working if the working directory changes later, and it is appended
# only once so re-running this cell does not pile up duplicate entries.
PROJECT_ROOT = str(Path("..").resolve())
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [20]:
from dotenv import load_dotenv

# Load environment variables from a .env file (python-dotenv). Presumably these
# are the settings the `app` services need — TODO confirm which are required.
load_dotenv()

True

In [21]:
from app.model_vault.embedding_model import EmbeddingModel
from app.qdrant.qdrant_service import init_qdrant, insert_qdrant

In [22]:
import pandas as pd
import re

### Load the Dataset

In [23]:
# Read the training split from the local dataset directory (the path is
# relative to the notebook's working directory).
# NOTE(review): the preview below shows "Unnamed: 0.1" / "Unnamed: 0" columns,
# which look like index columns written out by earlier saves — confirm and
# drop them if they are not needed.
df = pd.read_csv("../dataset/train.csv")

In [24]:
len(df)

16028

In [25]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,id_right,category_right,cluster_id_right,brand_right,title_right,description_right,price_right,specTableContent_right
0,0,1,5931545,Camera_and_Photo,9309675,"""Veho""@en-US","""Veho VCC-005-MUVI-NPNG MUVI HD Mini Handsfre...","""Veho are pleased to announce the partnership ...",$0.00,
1,1,0,12577428,Camera_and_Photo,464821,,"""Sony Alpha a7 Mirrorless Digital Camera with...",,$0.00,
2,2,0,543627,Camera_and_Photo,481159,,""" Manfrotto BeFree Tripod with Ball Head ""@en...","""\n Free Advanced...",$0.00,
3,3,0,4118525,Camera_and_Photo,72951,"""UNIDEN""","""UNIDEN UDR444A 4.3"" LCD Wireless Video Surve...","""Expandable up to 4 cameras, easy to install, ...",$0.00,
4,4,0,2643276,Camera_and_Photo,801459,,"""Used GoPro Hero 4 Black w/ Batteries, Dual C...","""\n\t12 MP photos up to 30 fps\n\t4K30, 2.7K50...",$0.00,


In [26]:
def prepare_qdrant_payload(row):
    """Build the normalized text that is embedded for one product row.

    The category, brand, title, description and spec-table fields are
    concatenated in that order, lower-cased, stripped of single and double
    quote characters, and whitespace-normalized into a single line.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing the
            ``category_right``, ``brand_right``, ``title_right``,
            ``description_right`` and ``specTableContent_right`` keys.

    Returns:
        The cleaned text as one string; empty if every field is missing.

    Raises:
        KeyError: If one of the expected keys is absent from ``row``.
    """
    columns = (
        "category_right",
        "brand_right",
        "title_right",
        "description_right",
        "specTableContent_right",
    )

    # Skip missing fields: str(NaN) / str(None) would otherwise inject literal
    # "nan" / "none" tokens into the text that gets embedded.
    text_parts = [str(row[col]) for col in columns if not pd.isna(row[col])]

    combined_text = " ".join(text_parts).lower()

    # Remove quotes BEFORE collapsing whitespace; doing it afterwards leaves
    # double spaces wherever a quote character sat between two spaces.
    unquoted_text = combined_text.replace('"', "").replace("'", "")

    return re.sub(r"\s+", " ", unquoted_text).strip()

In [27]:
# Build one cleaned text string per DataFrame row (row-wise apply) and keep
# the results as a plain list, in DataFrame order.
payloads = df.apply(prepare_qdrant_payload, axis=1).tolist()

In [28]:
payloads[:5]

['camera_and_photo veho@en-us veho vcc-005-muvi-npng muvi hd mini handsfree actioncam with waterproof case and 8 gb memory - no proof glory edition@en-us sports & action video cameras page 7 | come as you arts@en-us veho are pleased to announce the partnership with new and exciting lifestyle and action sports media partner no proof no glory . as part of this partnership, veho have released the muvi hd special edition no proof no glory bundle. the special edition no proof no glory muvi hd includes a waterproof case that is capable of depths of 60 meters underwater for a full 60 minutes, a helmet front mount that allows you to create a true pov angle when filming hands free.the muvi hd has updated firmware to allow you to record at 960p at 30fps and 720p at 60 & 30fps giving you more versatility with your muvi hd action camera. the muvi hd also has a 16mbit/s in 1080p mode to make sure your footage is as smooth as possible. the 1400 mah battery has the biggest capacity on the market and 

In [29]:
len(payloads)

16028

### Embed the Product Text and Index It in Qdrant

In [30]:
# Identifier of the sentence-embedding model handed to EmbeddingModel below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

In [31]:
# Instantiate the project's embedding wrapper for the chosen model.
embedding = EmbeddingModel(model_name=model_name)

In [32]:
embedding

<app.model_vault.embedding_model.EmbeddingModel at 0x75979c6144f0>

In [33]:
# Embed every payload in a single call. Long-running: the recorded run below
# took about 11 minutes (63 batches) for the 16,028 texts.
embedding_text = embedding.embed(text=payloads)

Batches: 100%|██████████| 63/63 [11:16<00:00, 10.74s/it]


In [34]:
len(embedding_text)

16028

In [35]:
len(embedding_text[0])

384

In [36]:
# Initialise Qdrant with the vector size taken from the first embedding
# (384 in the recorded run); this requires embedding_text to be non-empty.
# NOTE(review): presumably this creates/configures the target collection —
# confirm in app.qdrant.qdrant_service.
init_qdrant(vector_size=len(embedding_text[0]))

In [37]:
# One metadata dict per row, in DataFrame order, carrying only the product id
# (e.g. {"id_right": 5931545}).
metadata = df[["id_right"]].to_dict(orient="records")

In [38]:
metadata[:5]

[{'id_right': 5931545},
 {'id_right': 12577428},
 {'id_right': 543627},
 {'id_right': 4118525},
 {'id_right': 2643276}]

In [39]:
# Number of vectors sent to Qdrant per insert call.
BATCH_SIZE = 500

# Embeddings and metadata are paired purely by position, so a length mismatch
# would silently attach the wrong product id to a vector.
assert len(embedding_text) == len(metadata), (
    "embedding_text and metadata must have the same length"
)

# Upload in fixed-size batches; the final batch may be shorter.
for start in range(0, len(embedding_text), BATCH_SIZE):
    stop = start + BATCH_SIZE
    insert_qdrant(
        embeddings=embedding_text[start:stop], metadata=metadata[start:stop]
    )