In [1]:
import pandas as pd
from tqdm import tqdm
from pathlib import Path

import json
from PIL import Image

import requests
import torch
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
import mimetypes

In [2]:
url = "https://raw.githubusercontent.com/nice-bills/SnapValue/main/price-predictor/ebay_items_2023_present.csv"
df = pd.read_csv(url)
df.head(1)


Unnamed: 0,query,item_id,title,price,currency,condition,seller,feedback_score,image_url,item_url,item_creation_date
0,laptop,v1|396320750534|0,"CHUWI- 13.3"" Laptop - Intel Celeron 2.8GH- 8GB...",134.39,USD,New,chuwiofficial,413,https://i.ebayimg.com/images/g/ZysAAOSwXDpmIQc...,https://www.ebay.com/itm/396320750534?_skw=lap...,2025-03-15T01:18:10.000Z


In [3]:
CSV_FILE = "https://raw.githubusercontent.com/nice-bills/SnapValue/main/price-predictor/ebay_items_2023_present.csv"
OUTPUT_CSV = "ebay_items_with_text_embeddings.csv"
MODEL_NAME = 'all-MiniLM-L6-v2'

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", device)
model = SentenceTransformer(MODEL_NAME, device=device)
model


Running on: cpu


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [8]:
df = pd.read_csv(CSV_FILE)
if "text_embedding" not in df.columns:
    df["text_embedding"] = None

n_skipped, n_success = 0, 0
text_embeddings = []

In [9]:
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Generating text embeddings"):
    text = str(row.get("title", "")).strip()
    if (not text or text.lower() in ["nan", "none"]) or (pd.notna(row.get("text_embedding")) and row.get("text_embedding")):
        text_embeddings.append(row.get("text_embedding") if pd.notna(row.get("text_embedding")) else None)
        n_skipped += 1
        continue
    try:
        emb = model.encode(text)
        text_embeddings.append(json.dumps(emb.tolist()))
        n_success += 1
    except Exception as e:
        print(f"Error embedding text '{text[:30]}...': {e}")
        text_embeddings.append(None)
        n_skipped += 1

df["text_embedding"] = text_embeddings
df.to_csv(OUTPUT_CSV, index=False)
print(f"\nSaved CSV with text embeddings: {OUTPUT_CSV}")
print(f"Embeddings written: {n_success} | Skipped: {n_skipped} | Total: {len(df)}")


Generating text embeddings: 100%|██████████| 10000/10000 [01:07<00:00, 148.99it/s]



Saved CSV with text embeddings: ebay_items_with_text_embeddings.csv
Embeddings written: 10000 | Skipped: 0 | Total: 10000


In [5]:
df = pd.read_csv("ebay_items_with_text_embeddings.csv")
df.head(1)


Unnamed: 0,query,item_id,title,price,currency,condition,seller,feedback_score,image_url,item_url,item_creation_date,text_embedding
0,laptop,v1|396320750534|0,"CHUWI- 13.3"" Laptop - Intel Celeron 2.8GH- 8GB...",134.39,USD,New,chuwiofficial,413,https://i.ebayimg.com/images/g/ZysAAOSwXDpmIQc...,https://www.ebay.com/itm/396320750534?_skw=lap...,2025-03-15T01:18:10.000Z,"[-0.06598962843418121, -0.025627294555306435, ..."


In [20]:
CSV_FILE = "ebay_items_with_text_embeddings.csv"
IMAGE_FOLDER = Path("images")
IMAGE_FOLDER.mkdir(exist_ok=True)
OUTPUT_CSV = "ebay_items_with_image_paths.csv"

df = pd.read_csv(CSV_FILE)

def get_extension_from_url(url, response=None):
    ext = url.split(".")[-1].split("?")[0]
    if len(ext) > 5 or "/" in ext:
        if response is not None and "Content-Type" in response.headers:
            ext = mimetypes.guess_extension(response.headers["Content-Type"]) or ".jpg"
        else:
            ext = ".jpg"
    elif ext.lower() in ["jpeg", "jpg", "png", "webp"]:
        ext = "." + ext.lower()
    else:
        ext = ".jpg"
    return ext

def make_safe_filename(item_id, ext):
    safe_id = "".join(c if c.isalnum() else "_" for c in str(item_id))
    return safe_id + ext

def download_image(url, save_path):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            ext = get_extension_from_url(url, response)
            true_save_path = save_path.with_suffix(ext)
            with open(true_save_path, "wb") as f:
                f.write(response.content)
            return str(true_save_path)
    except Exception:
        pass
    return None

image_paths = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Downloading images"):
    url = str(row.get("image_url", ""))
    item_id = row.get("item_id", "missingid")

    if not url or pd.isna(url) or url.lower().strip() in ["none", "nan", ""]:
        image_paths.append(None)
        continue

    ext = get_extension_from_url(url)
    safe_filename = make_safe_filename(item_id, ext)
    save_path = IMAGE_FOLDER / safe_filename

    if save_path.exists():
        image_paths.append(str(save_path))
        continue

    result_path = download_image(url, save_path)
    image_paths.append(result_path)

df["image_path"] = image_paths
df.to_csv(OUTPUT_CSV, index=False)
print(f"Downloaded images and saved CSV to '{OUTPUT_CSV}'")


Downloading images: 100%|██████████| 10000/10000 [15:54<00:00, 10.47it/s]


✅ Downloaded images and saved CSV to 'ebay_items_with_image_paths.csv'


In [6]:
df = pd.read_csv("ebay_items_with_image_paths.csv")
df.head(1)

Unnamed: 0,query,item_id,title,price,currency,condition,seller,feedback_score,image_url,item_url,item_creation_date,text_embedding,image_path
0,laptop,v1|396320750534|0,"CHUWI- 13.3"" Laptop - Intel Celeron 2.8GH- 8GB...",134.39,USD,New,chuwiofficial,413,https://i.ebayimg.com/images/g/ZysAAOSwXDpmIQc...,https://www.ebay.com/itm/396320750534?_skw=lap...,2025-03-15T01:18:10.000Z,"[-0.06598962843418121, -0.025627294555306435, ...",images\v1_396320750534_0.jpg


In [7]:
CSV_FILE = "ebay_items_with_image_paths.csv"
OUTPUT_JSON = "full_embeddings.json"
BATCH_SIZE = 32

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [7]:
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name).to(device)
processor = CLIPProcessor.from_pretrained(model_name)
processor

CLIPProcessor:
- image_processor: CLIPImageProcessor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}

- tokenizer: CLIPTokenizerFast(name_or_path='openai/clip-vit-base-patch32', vocab_size=49408, model_max_length=77, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	49406: AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, spec

In [26]:
df = pd.read_csv(CSV_FILE)
image_paths = df["image_path"].dropna().unique().tolist()

# === Generate embeddings ===
embeddings_dict = {}

for i in tqdm(range(0, len(image_paths), BATCH_SIZE), desc="Generating image embeddings"):
    batch_paths = image_paths[i:i+BATCH_SIZE]
    images = []
    valid_paths = []

    # Load images with error handling
    for path in batch_paths:
        try:
            img = Image.open(path).convert("RGB")
            images.append(img)
            valid_paths.append(path)
        except Exception as e:
            print(f"Skipping {path}: {e}")

    if not images:
        continue

    # Preprocess images and move tensors to device
    inputs = processor(images=images, return_tensors="pt").to(device)

    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
        # Normalize embeddings to unit length
        image_features = torch.nn.functional.normalize(image_features, p=2, dim=1)

    # Save embeddings keyed by file path
    for path, emb in zip(valid_paths, image_features.cpu().numpy()):
        embeddings_dict[path] = emb.tolist()

# === Save embeddings as JSON ===
with open(OUTPUT_JSON, "w") as f:
    json.dump(embeddings_dict, f)

print(f"\nSaved {len(embeddings_dict)} image embeddings to '{OUTPUT_JSON}'")


Generating image embeddings: 100%|██████████| 308/308 [01:00<00:00,  5.08it/s]



Saved 9841 image embeddings to 'full_embeddings.json'


In [None]:

# === Config ===
CSV_FILE = "ebay_items_with_image_paths.csv"   # your CSV with 'image_path' column
EMBEDDINGS_FILE = "full_embeddings.json"       # JSON with image_path: embedding
OUTPUT_CSV = "ebay_items_with_full_embeddings.csv"

# === Load CSV and embeddings ===
df = pd.read_csv(CSV_FILE)

with open(EMBEDDINGS_FILE, "r") as f:
    embeddings_dict = json.load(f)

# === Map embeddings to CSV ===
image_embeddings = []

for path in df['image_path']:
    if pd.isna(path) or path not in embeddings_dict:
        image_embeddings.append(None)
    else:
        # Convert embedding list to comma-separated string
        emb_str = ",".join([str(x) for x in embeddings_dict[path]])
        image_embeddings.append(emb_str)

df['image_embedding'] = image_embeddings

# === Save updated CSV ===
df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ CSV saved with image embeddings: '{OUTPUT_CSV}'")


✅ CSV saved with image embeddings: 'ebay_items_with_full_embeddings.csv'


In [8]:
df = pd.read_csv("ebay_items_with_full_embeddings.csv")
df.head(1)

Unnamed: 0,query,item_id,title,price,currency,condition,seller,feedback_score,image_url,item_url,item_creation_date,text_embedding,image_path,image_embedding
0,laptop,v1|396320750534|0,"CHUWI- 13.3"" Laptop - Intel Celeron 2.8GH- 8GB...",134.39,USD,New,chuwiofficial,413,https://i.ebayimg.com/images/g/ZysAAOSwXDpmIQc...,https://www.ebay.com/itm/396320750534?_skw=lap...,2025-03-15T01:18:10.000Z,"[-0.06598962843418121, -0.025627294555306435, ...",images/v1_396320750534_0.jpg,"0.011063453741371632,0.05142253637313843,0.006..."
