# Image Feature Loader & DB-Saver
Dieses Notebook lädt Bilder, extrahiert Features und speichert sie in einer SQLite-Datenbank.

In [15]:
import os
import time
import sqlite3
import numpy as np
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import cv2
from PIL import Image
from loader import image_generator
from features.hash import calc_hash
from features.color_vec import calc_histogram
from features.embedding_vec import extract_embeddings
from image_load import fast_load
import traceback
from sklearn.decomposition import PCA
import umap
import cProfile
import pstats


In [16]:

# ========== CONFIG ==========
# Machine-specific locations: the SQLite database file, the image folder
# that main() scans, and the two output folders for the .npy embeddings.
DB_PATH = r"Z:\CODING\UNI\BIG_DATA\data\database.db"
PHOTO_FOLDER = r"D:\Test_bilder_verschachtelt"
EMBEDDING_DIR = r"Z:\CODING\UNI\BIG_DATA\embeddings"
EMBEDDING_PCA_DIR = r"Z:\CODING\UNI\BIG_DATA\embeddings_pca"  # <<< ADDED
# Number of images collected before one embedding/PCA/UMAP batch is processed.
BATCH_SIZE = 1024
# Leave one core free, but never go below one worker: os.cpu_count() may
# return None, and "count - 1" is 0 on a single-core machine, which
# ThreadPoolExecutor rejects with ValueError.
MAX_WORKERS = max(1, (os.cpu_count() or 2) - 1)
TABLE_NAME = "image_features_test"
LOG_FILE = "verarbeitung_log.txt"

# Make sure both embedding output folders exist before anything is written.
Path(EMBEDDING_DIR).mkdir(parents=True, exist_ok=True)
Path(EMBEDDING_PCA_DIR).mkdir(parents=True, exist_ok=True)  # <<< ADDED

# ========== DB ==========
# One module-wide connection and cursor, shared by the functions below.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Switch the rollback journal and synchronous writes off for this connection.
# NOTE(review): presumably a deliberate speed-over-crash-safety trade-off for
# the bulk import — confirm before reusing this connection elsewhere.
for _pragma in ("PRAGMA journal_mode=OFF;", "PRAGMA synchronous=OFF;"):
    cursor.execute(_pragma)

def save_batch_to_db(entries):
    """Insert-or-replace a batch of rows into ``TABLE_NAME`` and commit.

    Args:
        entries: Sequence of 12-item tuples in column order: filename, path,
            color_hist, embedding_path, image_hash, resolution, file_size,
            category, photographer, pca_embedding, umap_x, umap_y.
    """
    # The table name comes from the module constant, not from user input;
    # all row values are bound through "?" placeholders.
    insert_sql = f"""
        INSERT OR REPLACE INTO {TABLE_NAME}
        (filename, path, color_hist, embedding_path, image_hash, resolution, file_size,
         category, photographer, pca_embedding, umap_x, umap_y)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """
    cursor.executemany(insert_sql, entries)
    conn.commit()

# ========== Feature-Extractor Threaded ==========
def prepare_image_features(filename, path):
    """Load one image and compute its per-image features.

    main() submits this function to a thread pool, so it never raises;
    failures are reported through the shape of the returned tuple.

    Args:
        filename: Name of the image, passed through unchanged.
        path: Location of the image file to load.

    Returns:
        On success a 7-tuple ``(filename, path, img, color_hist, img_hash,
        resolution, file_size)`` where ``resolution`` is formatted from
        ``img.shape[1]`` and ``img.shape[0]`` joined by ``x``.
        On failure an 8-tuple ``(filename, path, None, None, None, None,
        None, message)`` whose message holds the exception text and the
        formatted traceback.
    """
    try:
        image = fast_load(path)
        histogram = calc_histogram(image)
        digest = calc_hash(image)
        dimensions = f"{image.shape[1]}x{image.shape[0]}"
        size_bytes = os.path.getsize(path)
    except Exception as exc:
        # Worker boundary: hand the failure back as data so the caller can
        # log it and continue with the remaining images.
        details = traceback.format_exc()
        failure = f"{exc} | Traceback:\n{details}"
        return (filename, path, None, None, None, None, None, failure)
    else:
        return (filename, path, image, histogram, digest, dimensions, size_bytes)

# ========== MAIN ==========
def main():
    """Extract features for every image under ``PHOTO_FOLDER`` and store them.

    Per-image work (loading, histogram, hash) runs in a thread pool via
    ``prepare_image_features``. Successful results are collected until
    ``BATCH_SIZE`` images are available and then handed to
    ``process_batch``; a final, possibly smaller batch is flushed at the end.
    Failed images are written to ``LOG_FILE`` and skipped.

    The log file is closed and the module-level DB connection ``conn`` is
    closed when the function ends, including when an exception escapes.
    """
    image_paths = list(image_generator(PHOTO_FOLDER))

    batch_meta = []
    batch_images = []
    batch_entries = []

    try:
        with open(LOG_FILE, "a", encoding="utf-8") as logfile:
            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
                # The futures are handed straight to as_completed() instead of
                # being kept in a list of our own: every finished future holds
                # its decoded image, and a retained list would keep all images
                # of the whole run in memory until the pool shuts down.
                completed = as_completed(
                    executor.submit(prepare_image_features, fname, path)
                    for fname, path in image_paths
                )

                for future in tqdm(completed, total=len(image_paths), desc="Verarbeitung", unit="Bild"):
                    result = future.result()
                    if len(result) == 8:  # error case: 8-tuple ending in the message
                        fname, path, *_, err = result
                        logfile.write(f"{fname} ❌ {err}\n")
                        continue

                    filename, path, img, color_hist, img_hash, resolution, size = result
                    batch_meta.append((filename, path, color_hist, img_hash, resolution, size))
                    batch_images.append(img)

                    if len(batch_images) >= BATCH_SIZE:
                        process_batch(batch_meta, batch_images, batch_entries, logfile)
                        batch_meta.clear()
                        batch_images.clear()
                        batch_entries.clear()

            # Last batch: whatever is left over after the loop.
            if batch_images:
                process_batch(batch_meta, batch_images, batch_entries, logfile, final=True)

            logfile.write(f"[{datetime.now()}] ✓ Fertig\n")
    finally:
        # Always release the shared connection, even after a failure.
        conn.close()

# ========== Batch Processing mit PCA + UMAP ==========
def process_batch(batch_meta, batch_images, batch_entries, logfile, final=False, pca_components=100):
    """Compute embeddings, PCA and 2-D UMAP coordinates for one batch and store them.

    For every image the original embedding and its PCA projection are saved
    as ``.npy`` files (in ``EMBEDDING_DIR`` / ``EMBEDDING_PCA_DIR``) and one
    row is appended to ``batch_entries``; the rows are then written with
    ``save_batch_to_db``. Any exception is caught and written to ``logfile``
    together with its traceback, so one failing batch does not stop the run.

    Note that a new PCA and a new UMAP model are fitted on every call, so
    projections and coordinates are only consistent within a single batch.

    Args:
        batch_meta: List of ``(filename, path, hist, img_hash, resolution,
            size)`` tuples, aligned with ``batch_images``.
        batch_images: Loaded images passed to ``extract_embeddings``.
        batch_entries: List that receives the DB rows (mutated in place).
        logfile: Writable text stream for status and error messages.
        final: Selects the "last batch" wording of the success message.
        pca_components: Upper bound for the number of PCA components.
    """
    if len(batch_images) == 0:
        # Nothing to embed; fitting PCA/UMAP on an empty batch would only fail.
        return

    try:
        embs = np.asarray(extract_embeddings(batch_images))

        # PCA requires n_components <= min(n_samples, n_features). A fixed 100
        # made every batch with fewer than 100 images (typically the last one)
        # fail, so clamp to what the data allows.
        n_components = min(pca_components, embs.shape[0], embs.shape[1])
        pca = PCA(n_components=n_components)
        embs_pca = pca.fit_transform(embs)

        # NOTE(review): UMAP may still reject very small batches; such a
        # failure ends up in the except branch below — confirm acceptable.
        reducer = umap.UMAP(n_components=2, n_jobs=-1, random_state=None)
        coords = reducer.fit_transform(embs_pca)

        for meta, emb, emb_pca, (x, y) in zip(batch_meta, embs, embs_pca, coords):
            filename, path, hist, img_hash, resolution, size = meta
            hist_str = ",".join(str(round(v, 6)) for v in hist)

            # NOTE(review): output files are keyed by filename only; images
            # with the same name in different folders would overwrite each
            # other — verify that filenames are unique.

            # Save the original embedding.
            emb_path = os.path.join(EMBEDDING_DIR, f"{filename}.npy")
            np.save(emb_path, emb)

            # Save the PCA-reduced embedding.
            pca_path = os.path.join(EMBEDDING_PCA_DIR, f"{filename}_pca.npy")
            np.save(pca_path, emb_pca)

            # Queue the DB row; category and photographer are stored as NULL.
            batch_entries.append((
                filename, path, hist_str, emb_path, img_hash, resolution, size,
                None, None, pca_path, float(x), float(y)
            ))

        save_batch_to_db(batch_entries)
        msg = "letzter Batch gespeichert" if final else f"{len(batch_entries)} ✓ gespeichert"
        logfile.write(f"{msg}\n")
    except Exception as e:
        # Best-effort boundary: log with traceback (same style as
        # prepare_image_features) and let the caller continue.
        logfile.write(f"❌ Fehler im Batch: {e}\n{traceback.format_exc()}\n")









# ========== Profiling + Ausführung ==========
if __name__ == "__main__":
    # Profile the complete run of main().
    pr = cProfile.Profile()
    pr.enable()
    try:
        main()
    finally:
        pr.disable()

    # Text report: all functions, sorted by cumulative time.
    with open("profiling_results.txt", "w") as report_file:
        stats = pstats.Stats(pr, stream=report_file)
        stats.sort_stats("cumtime")
        stats.print_stats()

    # Binary dump for later inspection with profiling tools.
    stats.dump_stats("profiling_results.prof")

Verarbeitung: 100%|██████████| 258/258 [00:10<00:00, 25.19Bild/s] 
