In [1]:
import os
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import pathlib

In [2]:
INPUT_ROOT = "./success"          # your folder with ISBN subfolders
OUTPUT_DIR = "./image_embeddings" # where to save embeddings
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Use ResNet50 pretrained on ImageNet (you can change to resnet18, resnet101, etc.)
model = models.resnet50(pretrained=True)
# Remove the final classification layer to get 2048-dim embeddings
model = nn.Sequential(*list(model.children())[:-1])
model.eval()

# If you have a GPU, uncomment the next line
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")  # change to "cuda" if you want GPU
model = model.to(device)



In [4]:
# Standard ImageNet normalization
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# ====================== Extraction Function ======================
def get_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0).to(device)  # add batch dimension

    with torch.no_grad():
        embedding = model(input_batch)
        embedding = torch.flatten(embedding, 1)  # (1, 2048)
    return embedding.cpu()  # return as CPU tensor

In [5]:
# ====================== Main Loop ======================
root_path = pathlib.Path(INPUT_ROOT)

# Look for folders that look like ISBNs (13 digits, sometimes starting with 978/979)
for isbn_folder in root_path.iterdir():
    if not isbn_folder.is_dir():
        continue
    
    isbn = isbn_folder.name
    if not isbn.isdigit() or len(isbn) != 13:
        print(f"Skipping non-ISBN folder: {isbn}")
        continue

    jpg_path = isbn_folder / f"{isbn}.jpg"
    
    if not jpg_path.exists():
        print(f"Warning: {jpg_path} not found, skipping...")
        continue

    print(f"Processing {isbn} ...")
    try:
        embedding = get_embedding(jpg_path)                     # (1, 2048)
        output_path = pathlib.Path(OUTPUT_DIR) / f"{isbn}.pt"
        torch.save(embedding, output_path)
        print(f"  â†’ Saved embedding to {output_path}")
    except Exception as e:
        print(f"  Failed to process {isbn}: {e}")

print("\nAll done! Embeddings saved in:", OUTPUT_DIR)

FileNotFoundError: [Errno 2] No such file or directory: 'success'