## Imports

In [None]:
from sentence_transformers import SentenceTransformer
from PIL import Image, ImageOps
import torch
import torch.nn.functional as F
from transformers import SiglipProcessor, SiglipModel
import requests
from io import BytesIO
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
import os
from dotenv import load_dotenv
import pandas as pd
import uuid
from tqdm import tqdm
from qdrant_client.models import Batch

load_dotenv()

## E5 Model

In [None]:

e5_model = SentenceTransformer("intfloat/e5-large-v2")


def e5_text_embeddings(text: str, *, prefix: str = "query") -> list[float]:
    """
    Generate an E5 embedding for a single text string.

    Args:
        text: The text to embed. ``None`` and float NaN (how pandas represents
            a missing CSV cell) are treated as empty text.
        prefix: E5 role prefix prepended as ``"<prefix>: "``. Defaults to
            ``"query"``; pass ``"passage"`` when embedding documents that
            will be searched against.

    Returns:
        The normalized embedding as a list of floats, or an all-zero vector
        of the model's embedding dimension when the text is empty or missing.
    """
    # A missing cell must not be stringified: str(float("nan")) is "nan" and
    # str(None) is "None", both of which would be embedded as real words.
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    text = str(text).strip()
    if not text:
        return [0.0] * e5_model.get_sentence_embedding_dimension()

    emb = e5_model.encode(f"{prefix}: {text}", normalize_embeddings=True)
    return emb.tolist()

## Function for Padding Images for SigLIP model

In [None]:

def pad_to_square(img: Image.Image, fill_color=(255, 255, 255)) -> Image.Image:
    """
    Return *img* padded to a square whose side is its longer dimension.

    The work is delegated to ``ImageOps.pad`` with centering ``(0.5, 0.5)``;
    any added border is filled with ``fill_color`` (white by default).
    """
    side = max(img.size)
    target = (side, side)
    return ImageOps.pad(img, target, color=fill_color, centering=(0.5, 0.5))


## SigLIP Model

In [None]:

# Checkpoint name and device shared by the SigLIP processor, the model, and
# the image-embedding functions in this file.
_siglip_name = "google/siglip-so400m-patch14-384" 
_siglip_device = "cpu"
# The processor prepares inputs for the model; the model is moved to the
# device and switched to eval mode once, at load time.
_siglip_processor = SiglipProcessor.from_pretrained(_siglip_name)
_siglip_model = SiglipModel.from_pretrained(_siglip_name).to(_siglip_device).eval()

# Run a dummy forward pass once to infer the correct embedding dimension.
# _siglip_dim is the length of the zero vector returned by the embedding
# functions when an image cannot be processed.
with torch.no_grad():
    # A blank white 384x384 image plus a placeholder text; only the shape of
    # the resulting image embedding is used, not its values.
    dummy = Image.new("RGB", (384, 384), color="white")
    dummy_inputs = _siglip_processor(images=dummy, text="dummy", return_tensors="pt").to(_siglip_device)
    dummy_output = _siglip_model(**dummy_inputs)
    _siglip_dim = dummy_output.image_embeds.shape[-1]

def _siglip_embed(img: "Image.Image") -> list[float]:
    """Embed an already-opened PIL image with SigLIP and L2-normalize the result."""
    # convert() forces the pixel data to be decoded, so the source file or
    # buffer can be closed by the caller as soon as this function returns.
    img = pad_to_square(img.convert("RGB"))
    inputs = _siglip_processor(images=img, return_tensors="pt").to(_siglip_device)

    with torch.no_grad():
        vec = _siglip_model.get_image_features(**inputs)
        vec = F.normalize(vec, p=2, dim=-1)

    return vec.squeeze(0).cpu().tolist()


def siglip_image_embedding(image_path: str) -> list[float]:
    """
    Compute a SigLIP image embedding from a local image path.

    Args:
        image_path: Path of the image file to open.

    Returns:
        An L2-normalized vector (list[float]). On any failure (missing file,
        unreadable image, model error) the error is printed and an all-zero
        vector of length ``_siglip_dim`` is returned instead of raising.
    """
    try:
        # The context manager closes the file handle, which a bare
        # Image.open(...) would otherwise leave open for every image.
        with Image.open(image_path) as src:
            return _siglip_embed(src)
    except Exception as e:
        # Deliberate best-effort: one bad image must not abort a bulk run.
        print(e)
        return [0.0] * _siglip_dim


def siglip_image_from_url(url: str) -> list[float]:
    """
    Compute a SigLIP image embedding for an image downloaded from *url*.

    Args:
        url: HTTP(S) address of the image; fetched with a 10 second timeout.

    Returns:
        An L2-normalized vector (list[float]). On any failure (network error,
        non-success HTTP status, undecodable image, model error) the error is
        printed and an all-zero vector of length ``_siglip_dim`` is returned.
    """
    try:
        response = requests.get(url, timeout=10)
        # Surface 4xx/5xx as an HTTP error rather than letting the error page
        # body reach the image decoder and fail with a confusing message.
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as src:
            return _siglip_embed(src)
    except Exception as e:
        # Deliberate best-effort: one bad URL must not abort a bulk run.
        print(e)
        return [0.0] * _siglip_dim


## Qdrant Setup

In [None]:

# Connection settings are read from the environment variables QDRANT_URL and
# QDRANT_API_KEY.
qdrant = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY"),
)

COLLECTION_NAME = "products"

# Start from an empty collection on every run. This is the explicit form of
# the deprecated recreate_collection(): drop the collection if present, then
# create it again.
if qdrant.collection_exists(COLLECTION_NAME):
    qdrant.delete_collection(COLLECTION_NAME)

# Vector sizes come from the loaded models rather than literals, so the
# collection schema cannot drift from what the embedding functions return.
qdrant.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        "image_cropped": VectorParams(size=_siglip_dim, distance=Distance.COSINE),
        "image_original": VectorParams(size=_siglip_dim, distance=Distance.COSINE),
        "text_gen_desc": VectorParams(
            size=e5_model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
        "text_raw_desc": VectorParams(
            size=e5_model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
    },
)


## Inserting Data

In [None]:

df = pd.read_csv("hedomak_products.csv")

# CSV columns that are copied into the payload under the same key name.
payload_columns = (
    "brand",
    "title",
    "product_url",
    "regular_price",
    "sale_price",
    "description",
    "sizes",
    "available",
    "image_url",
)

points = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    point_id = str(uuid.uuid4())

    # One embedding per named vector; the embedding functions return a zero
    # vector themselves when their input is empty or cannot be processed.
    vectors = {
        "text_raw_desc": e5_text_embeddings(row.get("description", "")),
        "text_gen_desc": e5_text_embeddings(row.get("generated_desc", "")),
        "image_cropped": siglip_image_embedding(row.get("cropped_image_path", "")),
        "image_original": siglip_image_from_url(row.get("image_url", "")),
    }

    # Missing CSV cells come back as NaN; store them as None in the payload.
    raw_fields = ((col, row.get(col, "")) for col in payload_columns)
    payload = {col: (None if pd.isna(val) else val) for col, val in raw_fields}

    points.append(PointStruct(id=point_id, vector=vectors, payload=payload))


print(f"{len(points)} to upload...")




In [None]:
# Upload the prepared points in fixed-size batches so that a single failing
# request only loses that batch instead of the whole upload.
chunk_size = 64
failed_batches = []

for i in range(0, len(points), chunk_size):
    try:
        qdrant.upsert(
            # Use the shared constant so the upload always targets the
            # collection that was created above, even if it is renamed.
            collection_name=COLLECTION_NAME,
            points=points[i:i + chunk_size],
        )
    except Exception as e:
        # Best-effort: report the failure and continue with the next batch.
        failed_batches.append(i // chunk_size + 1)
        print(f"Error Upserting Batch {i // chunk_size + 1}: {e}")

# Make partial uploads visible instead of leaving them buried in the log.
if failed_batches:
    print(f"{len(failed_batches)} batch(es) failed to upload: {failed_batches}")