<a href="https://colab.research.google.com/github/pcashman21/feral-cat-census/blob/main/src/notebooks/cluster_by_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook performs the following steps:


1.   Run YOLO v8 on all raw images to get cropped-image files based on detecting either entire cats or just cat faces/heads.
2.   If necessary, run a step to check that all the cropped-image files are usable.  In the case of cat faces, we assume they all are.  (Actually, we can filter out those that aren't based on file name.) In the case of full cat bodies, run each image through a model that can determine usability.
3.   Run a transformer to compute a dense vector representation of all the cropped images, and compute a similarity score for each pair of images.
4.   Assign cluster IDs first to all the images that are above a given similarity score threshold, and then to all remaining unclustered images (each of which becomes a singleton cluster).  The number of clusters is the estimated number of distinct cats.



In [7]:
# for loading/processing the images
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input

# models
from keras.models import Model, load_model
from keras import layers

# for everything else
from google.colab import drive
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imread
import pandas as pd
import tensorflow as tf
import cv2
from PIL import Image
import glob

# YOLO
!pip install ultralytics
from ultralytics import YOLO

# transformer
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util


Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 k

In [8]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
# Path to trained YOLO model for image detection (cats or cat faces)
path_to_best_cat_detector = '/content/gdrive/My Drive/Cat images/feral-cat-faces.v1i.yolov8/runs/detect/train/weights/best.pt'
cat_detector_model = YOLO(path_to_best_cat_detector)  # load the cat (or cat face) detector model

# Path to trained model for detecting usable images.  If set to '', then we are simply accepting all images
path_to_usable_image_selector_model = '' # '/content/gdrive/My Drive/Cat images/models/select_usable_images.keras'
if path_to_usable_image_selector_model == '':
  usable_image_selector_model = None
else:
  usable_image_selector_model = load_model(path_to_usable_image_selector_model) # load usable/unsuable image selector model

# Name of object class for YOLO to detect ('cat' for entire cat in any pose, 'cat-face' for just cat faces/heads)
class_to_detect = 'cat-face' # 'cat' for whole cat, 'cat-face' for just faces

Load the zip file of cat images into the working storage of the notebook by clicking the folder in the left nav, and then the upload symbol in the menu bar.  Adjust the `path_to_images` variable to point to the unzipped folder.

In [2]:
# Path to raw images of cats (BE SURE TO UPLOAD AND UNZIP)
path_to_images = '/content/nano'
zip_file_name = path_to_images +'.zip'


import zipfile
!unzip $zip_file_name


Archive:  /content/nano.zip
   creating: nano/
  inflating: __MACOSX/._nano         
  inflating: nano/275-Marcel-508-222-Marcel-432-161002_DSC_0388.jpg.jpg.jpg  
  inflating: __MACOSX/nano/._275-Marcel-508-222-Marcel-432-161002_DSC_0388.jpg.jpg.jpg  
  inflating: nano/758-Sahara-IMG_4792.JPG.jpg  
  inflating: __MACOSX/nano/._758-Sahara-IMG_4792.JPG.jpg  
  inflating: nano/236-Flyers-446-160928_DSC_0029.jpg.jpg  
  inflating: __MACOSX/nano/._236-Flyers-446-160928_DSC_0029.jpg.jpg  
  inflating: nano/758-Sahara-IMG_4954.JPG.jpg  
  inflating: __MACOSX/nano/._758-Sahara-IMG_4954.JPG.jpg  
  inflating: nano/258-Farley-494-170201_DSC_2321-2.jpg.jpg  
  inflating: __MACOSX/nano/._258-Farley-494-170201_DSC_2321-2.jpg.jpg  
  inflating: nano/232-Zetta_-photo-output_152.JPG.jpg  
  inflating: __MACOSX/nano/._232-Zetta_-photo-output_152.JPG.jpg  
  inflating: nano/238-Flyers-448-160928_DSC_0048.jpg.jpg  
  inflating: __MACOSX/nano/._238-Flyers-448-160928_DSC_0048.jpg.jpg  
  inflating: nano/25

In [25]:
path_to_cropped_images = '/content/runs/detect/cropped_images_2'
path_to_cropped_and_classified_images = os.path.join(path_to_cropped_images, class_to_detect)

In [26]:
test_case_images = []
df = pd.DataFrame(columns=['orig_image_filename', 'cropped_image_filename', 'cluster_id'])

# Read all image files
with os.scandir(path_to_images) as files:
  for file in files:
    last_component = file.name.split('.')[-1]
    if last_component == 'jpg' or last_component == 'jpeg':
      test_case_images.append(file.name)

# Run the YOLOv8 model (the cat detector) and get a YOLO Results object for
# each file named in the df
# https://docs.ultralytics.com/modes/predict/#working-with-results
root_filename = 'img'
root_filename_count = 0
for test_case in test_case_images:
  results = cat_detector_model.predict(source=os.path.join(path_to_images, test_case))
  # Extract the image sub areas and save them as files
  for r in results:
    cropped_image_file_name = root_filename + str(root_filename_count) + '.jpg'
    df = pd.concat([df, pd.DataFrame([{'orig_image_filename': test_case, 'cropped_image_filename': cropped_image_file_name, 'cluster_id': -1 }], index=[len(df)])],  axis=0)
    r.save_crop(save_dir=path_to_cropped_images, file_name=cropped_image_file_name)
    root_filename_count += 1

if usable_image_selector_model is not None:

  # The cat detector isn't perfect.  It finds cats where there are none, and
  # doesn't always split up multiple cats into individual images.  So run the
  # YOLOv8 resulting cropped image files through the usable/unusable model
  # and delete any files that don't have a single cat.

  # We're reading raw images from the cropped image files, so transform them
  # by normalizing the pixels and resizing the image
  transformer_nn = tf.keras.Sequential([
        layers.Resizing(244,244),
        layers.Rescaling(1./255)])

  # Compile a list of all the cropped images
  cropped_image_files = []
  unusable_images_count = 0
  with os.scandir(path_to_cropped_and_classified_images) as files:
    for file in files:
      # YOLO will save cropped images which are unusable.  These have YOLO-generated names
      # that don't appear in the df.  If we are given such a file to process, ignore it.
      if file.name not in df['cropped_image_filename'].values:
        continue

      image = imread(os.path.join(path_to_cropped_and_classified_images, file.name)) # Get the cropped image
      # Transform the image by resizing and rescaling
      image_resized = transformer_nn(image)
      # Convert tensor to numpy array
      image_resized = image_resized.numpy()
      # Turn the image into a batch of 1
      image_batch = np.expand_dims(image_resized, axis=0)
      # Predict the batch outcome
      is_usable = usable_image_selector_model.predict(image_batch)[0] > .5
      if is_usable:
        cropped_image_files.append(file.name)
      else:
        print('Unusable image: {}'.format(file.name))
        unusable_images_count += 1
        df = df.drop(df.index[df['cropped_image_filename'] == file.name], axis=0)
  print('Total of {} unusable images and {} usable images'.format(unusable_images_count, len(cropped_image_files)))
else:
  print('Total of {} usable images'.format(len(df)))



image 1/1 /content/nano/232-Zetta_-IMG_1025.jpeg.jpg: 640x576 1 cat-face, 283.7ms
Speed: 15.6ms preprocess, 283.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 576)

image 1/1 /content/nano/246-Flynn-459-161004_DSC_0496.jpg.jpg: 640x640 1 cat-face, 380.0ms
Speed: 11.5ms preprocess, 380.0ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /content/nano/382-Kestre-12792317_1018119834929547_2579333533442259492_o.jpg.jpg: 640x640 1 cat-face, 384.2ms
Speed: 7.4ms preprocess, 384.2ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /content/nano/226-Nat-456-161004_DSC_0565.jpg.jpg: 640x640 1 cat-face, 352.1ms
Speed: 7.1ms preprocess, 352.1ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /content/nano/213-Floyd-423-161002_DSC_0365.jpg.jpg: 640x640 1 cat-face, 406.9ms
Speed: 6.8ms preprocess, 406.9ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /content/nano/245-M

img72.jpg was written


image 1/1 /content/nano/263-Chanel-15440475_1287461311328730_3703842762570776666_o.jpg.jpg: 640x640 1 cat-face, 399.7ms
Speed: 8.5ms preprocess, 399.7ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /content/nano/277-Addiso-510-202-Addiso-412-160929_DSC_0269.jpg.jpg.jpg: 640x640 1 cat-face, 350.3ms
Speed: 5.3ms preprocess, 350.3ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /content/nano/207-Arya-417-161002_DSC_0456.jpg.jpg: 640x640 1 cat-face, 349.4ms
Speed: 6.5ms preprocess, 349.4ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /content/nano/230-Beatri-440-161002_DSC_0442.jpg.jpg: 640x640 1 cat-face, 368.0ms
Speed: 6.8ms preprocess, 368.0ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /content/nano/304-Esmere-542-FullSizeRender.jpg-2.jpeg.jpg: 640x640 (no detections), 233.3ms
Speed: 5.2ms preprocess, 233.3ms inference, 0.7ms postprocess per image at shape (1, 3, 

Total of 95 usable images


In [34]:
# TODO: Why can files be missing?
for file in df['cropped_image_filename'].values:
    if not os.path.isfile(os.path.join(path_to_cropped_and_classified_images, file)):
      print('Missing file: {}'.format(file))
      df = df.drop(df.index[df['cropped_image_filename'] == file], axis=0)






Missing file: img72.jpg
Missing file: img77.jpg


In [35]:
len(df)

93

In [36]:
# Load the OpenAI CLIP Model
model = SentenceTransformer('clip-ViT-B-32')

# Next we compute the embeddings
image_names = df['cropped_image_filename'].values # All the names of YOLO-produced cropped images
encoded_image = model.encode([Image.open(os.path.join(path_to_cropped_and_classified_images, filepath)) for filepath in image_names], batch_size=128, convert_to_tensor=True, show_progress_bar=True)

# Now we run the clustering algorithm. This function compares images against
# all other images and returns a list with the pairs that have the highest
# cosine similarity score
processed_images = util.paraphrase_mining_embeddings(encoded_image)
df_processed_images = pd.DataFrame(processed_images, columns=['score', 'image_id1', 'image_id2'])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
CLUSTER_THRESHOLD = 0.95
next_cluster_id = 1
df['cluster_id'] = -1


def does_pair_form_cluster(row):
    if row['score'] >= CLUSTER_THRESHOLD:
        return True
    else:
        return False


def get_cluster_image_names(row):
    return [image_names[row['image_id1']], image_names[row['image_id2']]]


def assign_cluster_id(row):
  global next_cluster_id
  if row['is_cluster']:
      file1, file2 = get_cluster_image_names(row)
      idx1 = df[df['cropped_image_filename'] == file1].index
      idx2 = df[df['cropped_image_filename'] == file2].index
      if len(idx1) == 1 and len(idx2) == 1:
        if int(df['cluster_id'].loc[idx1]) > -1:
          df['cluster_id'].loc[idx2] = int(df['cluster_id'].loc[idx1])
        elif int(df['cluster_id'].loc[idx2]) > -1:
          df['cluster_id'].loc[idx1] = int(df['cluster_id'].loc[idx2])
        else:
          df['cluster_id'].loc[idx1] = int(next_cluster_id)
          df['cluster_id'].loc[idx2] = int(next_cluster_id)
          next_cluster_id += 1

def assign_unclustered_images(row):
  global next_cluster_id
  if row['cluster_id'] == -1:
      # row['cluster_id'] = next_cluster_id
      next_cluster_id += 1
      return next_cluster_id - 1
  else:
      return row['cluster_id']

In [37]:
# Determine which image pairs are close enough to belong to the same cluster.  If a row in
# df_processed_images can be clustered, mark it as true.
df_processed_images['is_cluster'] = df_processed_images.apply(does_pair_form_cluster, axis=1)

# For all the clusterable pairs, assign cluster IDs in df (NOT df_processed_images)
df_processed_images.apply(assign_cluster_id, axis=1)

# Finally, assign cluster IDs to all images which have not been put into a cluster already.
# These are singleton images.
df['cluster_id'] = df.apply(assign_unclustered_images, axis=1)

# How many distinct clusters (cats) are there?
df['cluster_id'].max()

70