# Image Feature Loader & DB-Saver
Dieses Notebook lädt Bilder, extrahiert Features und speichert sie in einer SQLite-Datenbank.

In [1]:
import os
import time
import sqlite3
import numpy as np
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import cv2
from PIL import Image
from loader import image_generator
from features.hash import calc_hash
from features.color_vec import calc_histogram
from features.embedding_vec import extract_embeddings
from image_load import fast_load
import traceback

In [3]:
# ========== CONFIG ==========
DB_PATH = r"Z:\fork_repo_local\data\database.db"
PHOTO_FOLDER = r"D:\Test_bilder_verschachtelt"
EMBEDDING_DIR = r"Z:\embeddings"
BATCH_SIZE = 128
MAX_WORKERS = os.cpu_count() - 1
TABLE_NAME = "image_features_test"
LOG_FILE = "verarbeitung_log.txt"

Path(EMBEDDING_DIR).mkdir(parents=True, exist_ok=True)

# ========== DB ==========
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("PRAGMA journal_mode=OFF;")
cursor.execute("PRAGMA synchronous=OFF;")

def save_batch_to_db(entries):
    cursor.executemany(f"""
        INSERT OR REPLACE INTO {TABLE_NAME}
        (filename, path, color_hist, embedding_path, image_hash, resolution, file_size,
         category, photographer, umap_x, umap_y)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, entries)
    conn.commit()

# ========== Feature-Extractor Threaded ==========
def prepare_image_features(filename, path):
    try:
        img = fast_load(path)
        color_hist = calc_histogram(img)
        img_hash = calc_hash(img)
        resolution = f"{img.shape[1]}x{img.shape[0]}"
        file_size = os.path.getsize(path)
        return (filename, path, img, color_hist, img_hash, resolution, file_size)
    except Exception as e:
        tb = traceback.format_exc()
        return (filename, path, None, None, None, None, None, f"{e} | Traceback:\n{tb}")

# ========== MAIN ==========
def main():
    image_paths = list(image_generator(PHOTO_FOLDER))
    logfile = open(LOG_FILE, "a", encoding="utf-8")

    batch_meta = []
    batch_images = []
    batch_entries = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(prepare_image_features, fname, path) for fname, path in image_paths]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Verarbeitung", unit="Bild"):
            result = future.result()
            if len(result) == 8:  # Fehlerfall
                fname, path, *_ , err = result
                logfile.write(f"{fname} ❌ {err}\n")
                continue

            filename, path, img, color_hist, img_hash, resolution, size = result
            batch_meta.append((filename, path, color_hist, img_hash, resolution, size))
            batch_images.append(img)

            if len(batch_images) >= BATCH_SIZE:
                try:
                    embs = extract_embeddings(batch_images)
                    for meta, emb in zip(batch_meta, embs):
                        filename, path, hist, img_hash, resolution, size = meta
                        hist_str = ",".join([str(round(v, 6)) for v in hist])
                        emb_path = os.path.join(EMBEDDING_DIR, f"{filename}.npy")
                        np.save(emb_path, emb)
                        batch_entries.append((filename, path, hist_str, emb_path, img_hash, resolution, size, None, None, None, None))
                    save_batch_to_db(batch_entries)
                    logfile.write(f"{len(batch_entries)} ✓ gespeichert\n")
                except Exception as e:
                    logfile.write(f"❌ Fehler im Batch: {e}\n")

                batch_meta.clear()
                batch_images.clear()
                batch_entries.clear()

    # letzter Batch
    if batch_images:
        try:
            start = time.time()
            embs = extract_embeddings(batch_images)
            duration = time.time() - start
            logfile.write(f"🕒 Letzter Batch-Embedding dauerte: {duration:.2f} Sekunden\n")
            for meta, emb in zip(batch_meta, embs):
                filename, path, hist, img_hash, resolution, size = meta
                hist_str = ",".join([str(round(v, 6)) for v in hist])
                emb_path = os.path.join(EMBEDDING_DIR, f"{filename}.npy")
                np.save(emb_path, emb)
                batch_entries.append((filename, path, hist_str, emb_path, img_hash, resolution, size, None, None, None, None))
            save_batch_to_db(batch_entries)
            logfile.write(f"{len(batch_entries)} ✓ letzter Batch gespeichert\n")
        except Exception as e:
            logfile.write(f"❌ Fehler im letzten Batch: {e}\n")

    logfile.write(f"[{datetime.now()}] ✓ Fertig\n")
    logfile.close()
    conn.close()

if __name__ == "__main__":
    main()

Verarbeitung: 100%|██████████| 158/158 [00:10<00:00, 15.72Bild/s]


In [3]:
import cProfile

cProfile.run('main()')


Verarbeitung: 100%|██████████| 158/158 [00:09<00:00, 16.60Bild/s]


         54476 function calls (54160 primitive calls) in 10.458 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.000    0.000 1107912199.py:18(save_batch_to_db)
        1    0.021    0.021   10.128   10.128 1107912199.py:41(main)
        1    0.000    0.000    0.056    0.056 1107912199.py:50(<listcomp>)
      128    0.016    0.000    0.105    0.001 1107912199.py:68(<listcomp>)
       30    0.003    0.000    0.019    0.001 1107912199.py:90(<listcomp>)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1053(_handle_fromlist)
        1    0.330    0.330   10.458   10.458 <string>:1(<module>)
       40    0.000    0.000    0.000    0.000 __init__.py:38(__get__)
        1    0.000    0.000    0.000    0.000 __init__.py:48(create_string_buffer)
        1    0.000    0.000    0.000    0.000 _base.py:146(__init__)
        1    0.000    0.000    0.000    0.000 _base.py:149(__e