In [4]:
import numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
import shutil

# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code -- only load archives that come from a trusted source.
data = np.load("/workspace/yolo_dangerous_weapons/embeddings.npz", allow_pickle=True)

# Pull the four arrays this notebook works with out of the archive.
embeddings, paths, labels, splits = (
    data[key] for key in ('embeddings', 'paths', 'labels', 'splits')
)

print(f"Embeddings shape: {embeddings.shape}")
print(f"Total images: {len(paths)}")
print(f"Example path: {paths[0]}")


Embeddings shape: (17411, 1024)
Total images: 17411
Example path: /workspace/yolo_dataset_4_dec/images/test/cellphone_test_002379.png


In [5]:
problematic_md = Path("/workspace/yolo_dangerous_weapons/outliers/problematic.md")

# One entry per line; blank lines and lines starting with '#' are skipped.
with open(problematic_md, 'r') as f:
    problematic_images = [
        entry
        for entry in map(str.strip, f)
        if entry and not entry.startswith('#')
    ]

# Show the total plus a short preview of the first five entries.
print(f"Found {len(problematic_images)} problematic images:")
for img in problematic_images[:5]:
    print(f"  - {img}")


Found 18 problematic images:
  - voc_train_005576.jpg
  - voc_train_007141.jpg
  - voc_valid_001682.jpg
  - voc_train_007968.jpg
  - voc_valid_000964.jpg


In [6]:
def extract_image_name(path_str):
    """Return the final component (file name with extension) of ``path_str``."""
    image_path = Path(path_str)
    return image_path.name

# Map each image file name to its row index in the embedding arrays.
image_names = [extract_image_name(p) for p in paths]
path_to_idx = {name: i for i, name in enumerate(image_names)}

# The index is keyed on the bare file name, so two images that share a name in
# different directories would collapse into one entry (the later row wins).
# Report that instead of letting lookups silently resolve to the wrong image.
n_collisions = len(image_names) - len(path_to_idx)
if n_collisions:
    print(f"WARNING: {n_collisions} duplicate file names; later entries override earlier ones")

print(f"Created path index with {len(path_to_idx)} entries")
print(f"Example: {list(path_to_idx.keys())[0]}")


Created path index with 17411 entries
Example: cellphone_test_002379.png


In [7]:
def find_top_similar(query_idx, embeddings, top_k=20):
    """Find top K most similar images to query image.

    Ranks every row of ``embeddings`` by cosine similarity to the row at
    ``query_idx`` and returns the best matches, excluding the query itself.

    Args:
        query_idx: Row index of the query image in ``embeddings``. Negative
            indices count from the end and are normalized before use.
        embeddings: NumPy array with one embedding per row.
        top_k: Maximum number of neighbours to return; must be >= 0.

    Returns:
        A list of dicts ordered by decreasing similarity, each with the keys
        'idx', 'path', 'similarity', 'label' and 'split'. The metadata values
        are read from the module-level ``paths``, ``labels`` and ``splits``
        arrays at the neighbour's row index.

    Raises:
        IndexError: If ``query_idx`` is outside the range of ``embeddings``.
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative slice bound would silently drop the *last* |top_k| rows.
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    n_rows = len(embeddings)
    if not -n_rows <= query_idx < n_rows:
        raise IndexError(f"query_idx {query_idx} is out of range for {n_rows} embeddings")
    if query_idx < 0:
        # Normalize so the self-exclusion filter below compares against the
        # same non-negative index that argsort produces.
        query_idx += n_rows

    query_emb = embeddings[query_idx].reshape(1, -1)
    similarities = cosine_similarity(query_emb, embeddings)[0]

    # Sort descending, then drop the query itself before truncating.
    similar_indices = np.argsort(similarities)[::-1]
    similar_indices = similar_indices[similar_indices != query_idx][:top_k]

    results = []
    for idx in similar_indices:
        results.append({
            'idx': idx,
            'path': paths[idx],
            'similarity': similarities[idx],
            'label': labels[idx],
            'split': splits[idx]
        })
    return results

# Smoke test: look up the nearest neighbours of the first image in the archive.
test_idx = 0
test_results = find_top_similar(query_idx=test_idx, embeddings=embeddings, top_k=5)

print(f"Test: Top 5 similar to {paths[test_idx]}:")
for r in test_results:
    print(f"  {r['similarity']:.4f} - {Path(r['path']).name}")


Test: Top 5 similar to /workspace/yolo_dataset_4_dec/images/test/cellphone_test_002379.png:
  0.9951 - cellphone_train_013499.png
  0.9698 - cellphone_train_013222.png
  0.9650 - cellphone_train_013161.png
  0.9594 - cellphone_train_013263.png
  0.9576 - cellphone_valid_002718.png


In [8]:
OUTPUT_BASE = Path("/workspace/problematic_similar_images")

# Start from an empty output directory: anything left by an earlier run is deleted.
if OUTPUT_BASE.exists():
    shutil.rmtree(OUTPUT_BASE)
OUTPUT_BASE.mkdir(parents=True)

all_results = {}   # file name -> list of neighbour dicts from find_top_similar
not_found = []     # listed file names that have no entry in path_to_idx

for prob_img in problematic_images:
    # Guard clause: record names we cannot resolve and move on.
    if prob_img not in path_to_idx:
        not_found.append(prob_img)
        print(f"NOT FOUND: {prob_img}")
        continue

    idx = path_to_idx[prob_img]
    similar = find_top_similar(idx, embeddings, top_k=20)
    all_results[prob_img] = similar
    print(f"Found {len(similar)} similar images for {prob_img}")

print(f"\nProcessed: {len(all_results)} images")
print(f"Not found: {len(not_found)} images")


Found 20 similar images for voc_train_005576.jpg
Found 20 similar images for voc_train_007141.jpg
Found 20 similar images for voc_valid_001682.jpg
Found 20 similar images for voc_train_007968.jpg
Found 20 similar images for voc_valid_000964.jpg


Found 20 similar images for youtube_train_008637.jpg
Found 20 similar images for dangerous_train_004347.jpg
Found 20 similar images for voc_train_007874.jpg
Found 20 similar images for crowdhuman_train_012069.jpg
Found 20 similar images for voc_train_007896.jpg
Found 20 similar images for voc_train_004498.jpg
Found 20 similar images for voc_train_005823.jpg
Found 20 similar images for voc_valid_000978.jpg
Found 20 similar images for voc_test_001110.jpg
Found 20 similar images for voc_train_005801.jpg
Found 20 similar images for voc_train_008012.jpg
Found 20 similar images for voc_train_006651.jpg
Found 20 similar images for voc_test_001499.jpg

Processed: 18 images
Not found: 0 images


In [9]:
# NOTE(review): yolo_dataset is not referenced anywhere in this cell.
yolo_dataset = Path("/workspace/yolo_dataset_4_dec")
copied_count = 0
missing_count = 0

for prob_img, similar_list in all_results.items():
    # One folder per query image, named after the file stem.
    prob_name = Path(prob_img).stem
    folder = OUTPUT_BASE / prob_name
    folder.mkdir(exist_ok=True)

    # Copy the query image itself; the 000_ prefix makes it sort first.
    prob_idx = path_to_idx[prob_img]
    prob_source = Path(paths[prob_idx])
    if prob_source.exists():
        shutil.copy2(prob_source, folder / f"000_QUERY_{prob_img}")
        copied_count += 1
    else:
        # Count a missing query image like a missing neighbour, so the
        # "Missing" total reflects every file that could not be copied.
        missing_count += 1

    # Copy the neighbours, numbered by rank with the similarity in the name.
    for i, sim_item in enumerate(similar_list, 1):
        source_path = Path(sim_item['path'])
        if source_path.exists():
            sim_score = sim_item['similarity']
            dest_name = f"{i:03d}_sim{sim_score:.3f}_{source_path.name}"
            shutil.copy2(source_path, folder / dest_name)
            copied_count += 1
        else:
            missing_count += 1

print(f"Copied {copied_count} images")
print(f"Missing {missing_count} images")
print(f"Created {len(all_results)} folders in {OUTPUT_BASE}")


Copied 378 images
Missing 0 images
Created 18 folders in /workspace/problematic_similar_images


In [10]:
# Banner.
print("=" * 60, "SUMMARY", "=" * 60, sep="\n")

# Counts gathered by the lookup cell.
print(f"\nOutput directory: {OUTPUT_BASE}")
print(f"\nProcessed {len(problematic_images)} problematic images")
print(f"  - Found in embeddings: {len(all_results)}")
print(f"  - Not found: {len(not_found)}")

# Static description of the on-disk layout.
# NOTE(review): "top 20" and the ".jpg" suffix are hard-coded text here; they
# are not derived from the top_k value or from the copied files' extensions.
print(f"\nFor each problematic image:")
print(f"  - Created folder with image stem name")
print(f"  - Copied query image as 000_QUERY_*")
print(f"  - Copied top 20 similar images as 001_sim*.jpg, 002_sim*.jpg, etc.")

# Preview the layout for the first three query images only.
print(f"\nFolder structure:")
for prob_img in list(all_results)[:3]:
    folder_name = Path(prob_img).stem
    print(f"  {OUTPUT_BASE / folder_name}/")
    print(f"    - 000_QUERY_{prob_img}")
    print(f"    - 001_sim*.jpg to 020_sim*.jpg")
print("  ...")

# Only print this section when something could not be resolved.
if len(not_found) > 0:
    print(f"\nNot found in embeddings:")
    for img in not_found:
        print(f"  - {img}")


SUMMARY

Output directory: /workspace/problematic_similar_images

Processed 18 problematic images
  - Found in embeddings: 18
  - Not found: 0

For each problematic image:
  - Created folder with image stem name
  - Copied query image as 000_QUERY_*
  - Copied top 20 similar images as 001_sim*.jpg, 002_sim*.jpg, etc.

Folder structure:
  /workspace/problematic_similar_images/voc_train_005576/
    - 000_QUERY_voc_train_005576.jpg
    - 001_sim*.jpg to 020_sim*.jpg
  /workspace/problematic_similar_images/voc_train_007141/
    - 000_QUERY_voc_train_007141.jpg
    - 001_sim*.jpg to 020_sim*.jpg
  /workspace/problematic_similar_images/voc_valid_001682/
    - 000_QUERY_voc_valid_001682.jpg
    - 001_sim*.jpg to 020_sim*.jpg
  ...


In [11]:
import pandas as pd

# Flatten {query -> neighbours} into one record per (query, neighbour) pair.
similarity_stats = [
    {
        'query': prob_img,
        'similar': Path(sim_item['path']).name,
        'similarity': sim_item['similarity'],
        'label': sim_item['label'],
        'split': sim_item['split']
    }
    for prob_img, similar_list in all_results.items()
    for sim_item in similar_list
]

df = pd.DataFrame(similarity_stats)

print(f"\nSimilarity statistics:")
similarity_col = df['similarity']
print(f"  Mean similarity: {similarity_col.mean():.4f}")
print(f"  Min similarity: {similarity_col.min():.4f}")
print(f"  Max similarity: {similarity_col.max():.4f}")

# NOTE(review): the recorded output of this cell lists train/valid/test under
# 'label', i.e. split names -- verify what the archive's 'labels' array holds.
print(f"\nLabel distribution of similar images:")
print(df['label'].value_counts())

# Persist the flat table next to the copied image folders.
csv_path = OUTPUT_BASE / "similarity_results.csv"
df.to_csv(csv_path, index=False)
print(f"\nSaved detailed results to: {csv_path}")



Similarity statistics:
  Mean similarity: 0.9351
  Min similarity: 0.9012
  Max similarity: 0.9767

Label distribution of similar images:
label
train    256
valid     54
test      50
Name: count, dtype: int64

Saved detailed results to: /workspace/problematic_similar_images/similarity_results.csv
