In [24]:

import imagehash, PIL.Image as Image, pathlib
import shutil
import os

# Destination folder for the de-duplicated tiles; created up front so a later
# copy step can write into it. Safe to re-run: an existing directory is kept.
output_dir = "ham20"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

def phash(path, hash_size=8):
    """Return the perceptual hash of the image stored at ``path``.

    Args:
        path: Location of the image file; passed straight to ``Image.open``.
        hash_size: Forwarded to ``imagehash.phash``. The default of 8 matches
            the library default, so ``phash(path)`` behaves as before.

    Returns:
        Whatever ``imagehash.phash`` returns for the opened image.
    """
    # The context manager releases the file handle as soon as hashing is done
    # instead of leaving it to the garbage collector.
    with Image.open(path) as img:
        return imagehash.phash(img, hash_size=hash_size)

# Source tiles: every *.jpg directly inside img_dir, in sorted (deterministic) order.
img_dir = pathlib.Path('/home/mpgetz/nas/bee_cam_slicing/tiled_prune_bg_050725/train/background')
img_paths = sorted(img_dir.glob('*.jpg'))

hash_func    = imagehash.phash
hash_size    = 16

# Build a list of (path, hash) pairs, one per readable image.
hashes = []
for path in img_paths:
    try:
        # Open inside a context manager so the file handle is closed right
        # after hashing; the image is converted to greyscale ('L') first.
        with Image.open(path) as img:
            h = hash_func(img.convert('L'), hash_size=hash_size)
    except Exception as e:
        # One unreadable/corrupt tile should not abort the whole pass:
        # report it and move on. KeyboardInterrupt is not an Exception
        # subclass, so interrupting the cell still works.
        print(f"Error hashing {path}: {e}")
        continue
    hashes.append((path, h))

HAMMING_THRESH = 20

# Greedy first-fit grouping: each tile joins the first existing cluster whose
# representative hash differs from its own by at most HAMMING_THRESH;
# otherwise it founds a new cluster and becomes that cluster's representative.
clusters = []
for tile_path, tile_hash in hashes:
    for group in clusters:
        if (tile_hash - group['rep_hash']) <= HAMMING_THRESH:
            group['paths'].append(tile_path)
            break
    else:
        # No break happened, i.e. no cluster was close enough.
        clusters.append({'rep_hash': tile_hash, 'paths': [tile_path]})

MAX_PER_CLUSTER = 1

# From every cluster keep the first MAX_PER_CLUSTER paths (in insertion order)
# and mark the remainder for removal.
to_keep = [
    member
    for group in clusters
    for member in group['paths'][:MAX_PER_CLUSTER]
]
to_discard = [
    member
    for group in clusters
    for member in group['paths'][MAX_PER_CLUSTER:]
]

print(f"Kept {len(to_keep)} tiles; pruned {len(to_discard)} duplicates.")

# for path in to_keep:
#     filename = os.path.basename(path)
#     shutil.copy2(path, os.path.join(output_dir, filename))

KeyboardInterrupt: 

In [25]:
import imagehash, PIL.Image as Image, pathlib
import shutil
import os
from concurrent.futures import ProcessPoolExecutor

# Destination folder for the de-duplicated tiles; created up front so a later
# copy step can write into it. Safe to re-run: an existing directory is kept.
output_dir = "ham20"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

def compute_hash(path, hash_size=16):
    """Hash a single image file, never raising on a bad file.

    Written as a top-level function taking one positional argument so it can
    be handed to ``executor.map``.

    Args:
        path: Location of the image file; passed to ``Image.open``.
        hash_size: Forwarded to ``imagehash.phash``. Defaults to 16, the value
            that was previously hard-coded.

    Returns:
        ``(path, hash)`` on success, or ``(path, None)`` if anything went
        wrong while opening, converting or hashing the image (the error is
        printed).
    """
    try:
        # Context manager closes the file handle as soon as hashing is done.
        with Image.open(path) as src:
            # Hash the greyscale ('L') version of the image.
            return (path, imagehash.phash(src.convert('L'), hash_size=hash_size))
    except Exception as e:
        # Deliberately broad: a single broken tile must not kill the batch.
        print(f"Error hashing {path}: {e}")
        return (path, None)

# Source tiles: every *.jpg directly inside img_dir, in sorted (deterministic) order.
img_dir = pathlib.Path('/home/mpgetz/nas/bee_cam_slicing/tiled_prune_bg_050725/train/background')
img_paths = sorted(img_dir.glob('*.jpg'))

# Each task (hashing one tile) is small, so with the default chunksize=1 the
# per-task inter-process overhead is paid once per image. Hand the workers
# batches instead: roughly four batches per CPU, and never less than 1
# (chunksize must be positive, also when img_paths is empty).
hash_chunksize = max(1, len(img_paths) // (4 * (os.cpu_count() or 1)))

with ProcessPoolExecutor() as executor:
    # executor.map yields results in the same order as img_paths.
    results = list(executor.map(compute_hash, img_paths, chunksize=hash_chunksize))

# compute_hash signals a failed image with a None hash; drop those entries.
hashes = [(p, h) for p, h in results if h is not None]

HAMMING_THRESH = 20
clusters = []

# Greedy first-fit grouping: look for the first cluster whose representative
# hash differs from the tile's hash by at most HAMMING_THRESH. The tile joins
# that cluster if one exists; otherwise it starts a new cluster of its own.
for img_path, img_hash in hashes:
    match = next(
        (c for c in clusters if (img_hash - c['rep_hash']) <= HAMMING_THRESH),
        None,
    )
    if match is None:
        clusters.append({'rep_hash': img_hash, 'paths': [img_path]})
    else:
        match['paths'].append(img_path)

MAX_PER_CLUSTER = 1
to_keep, to_discard = [], []

# Split every cluster at MAX_PER_CLUSTER: the leading members survive,
# everything after them is treated as a duplicate.
for group in clusters:
    members = group['paths']
    to_keep += members[:MAX_PER_CLUSTER]
    to_discard += members[MAX_PER_CLUSTER:]

print(f"Kept {len(to_keep)} tiles; pruned {len(to_discard)} duplicates.")

# Optional: copy the kept tiles into output_dir (uncomment the loop below to enable).
# for path in to_keep:
#     filename = os.path.basename(path)
#     shutil.copy2(path, os.path.join(output_dir, filename))


KeyboardInterrupt: 