In [0]:
import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [0]:
def angular_distance(im1, im2):
  """Return the angle, in radians, between two images' histogram vectors.

  Each image's `histogram()` is turned into a vector and scaled to unit
  length; the result is the arccosine of the dot product of the two unit
  vectors. Both histograms must have the same length for the dot product
  to be defined.
  """
  def unit_histogram(im):
    counts = np.array(im.histogram())
    return counts / np.linalg.norm(counts)

  cosine = np.dot(unit_histogram(im1), unit_histogram(im2))

  # Rounding can push the cosine slightly outside [-1, 1], where arccos is
  # undefined, so clamp it first.
  return np.arccos(np.clip(cosine, -1, 1))

def averaged_distance(im1, im2):
  """Return the arccosine of the mean per-pixel cosine between two images.

  The pixel data of each image (`getdata()`) is read as one row per pixel.
  For every pixel position the cosine between the two pixel vectors is
  computed, the cosines are averaged, and the arccosine of that average is
  returned in radians. Both images must yield the same number of pixels
  and channels.
  """
  pixels_a = np.array(im1.getdata())
  pixels_b = np.array(im2.getdata())

  # Row-wise dot products divided by the product of the row-wise norms.
  norm_products = np.linalg.norm(pixels_a, axis = 1) * np.linalg.norm(pixels_b, axis = 1)
  cosines = np.einsum('ij, ij -> i', pixels_a, pixels_b) / norm_products

  # A pixel whose vector is all zeros gives 0/0 = NaN; nanmean leaves those
  # positions out of the average.
  mean_cosine = np.nanmean(cosines)

  return np.arccos(np.clip(mean_cosine, -1, 1))

We assume that the images are ordered such that every cluster is a contiguous run within that order, i.e. clusters only grow left and right (a sort of spatial locality). This is a reasonable assumption, as similar-looking images were most likely taken one after another and so their file names will also be in sequence.

The assumption is necessary so I don't have to wait for this to run for hours. :)

In [0]:
def _load_rgb_thumbnail(path):
  """Open the image at `path` and return a 224x224 RGB copy of it."""
  # The context manager closes the file; resize/convert return new images.
  with Image.open(path) as raw:
    return raw.resize((224, 224), resample = Image.LANCZOS).convert('RGB')

def simplified_clusters(img_paths, thresh, distftn):
  """Group consecutive images into clusters in a single pass.

  The first image of a cluster is its base image. Each following image is
  compared against the current base: if `distftn(base, image) <= thresh`
  it joins the current cluster, otherwise it starts a new cluster and
  becomes the new base image.

  Args:
    img_paths: sequence of image file paths, in the order to scan them.
      It is only read, never modified.
    thresh: largest distance at which an image still joins the cluster.
    distftn: callable taking (base_image, image) and returning a distance.

  Returns:
    A list of clusters, each a list of paths in input order. An empty
    input yields an empty list.
  """
  groups = []
  base_image = None
  total = len(img_paths)
  for index, path in enumerate(img_paths):
    print('{}/{}'.format(index + 1, total))
    curr_image = _load_rgb_thumbnail(path)
    if base_image is not None and distftn(base_image, curr_image) <= thresh:
      groups[-1].append(path)
    else:
      # New cluster: later images must be compared against this image,
      # not against the base of an earlier cluster.
      groups.append([path])
      base_image = curr_image

  return groups

In [0]:
ICELAND_IMAGES_ROOT = './drive/My Drive/Iceland'

# os.listdir returns entries in arbitrary order, but simplified_clusters
# relies on neighbouring files being adjacent, so sort by file name.
# NOTE(review): 'heic' is presumably a sub-folder of the image root that must
# be skipped; confirm against the Drive folder.
image_file_paths = sorted(
  os.path.join(ICELAND_IMAGES_ROOT, item)
  for item in os.listdir(ICELAND_IMAGES_ROOT)
  if item != 'heic'
)

# Each run gets its own copy of the list, so a callee that consumes its
# argument cannot leave the second run with an empty list.
cltrs_angular = simplified_clusters(list(image_file_paths), 1, angular_distance)
cltrs_averaged = simplified_clusters(list(image_file_paths), 0.1, averaged_distance)

In [0]:
import random
import string

def show_cluster(cluster):
  """Draw every image of a cluster on a grid and save the figure as a PNG.

  The number of rows is the largest divisor of the cluster size that is at
  least 2 and not above its square root; if there is none, two rows are
  used (one row for a single image). The number of columns is rounded up
  so that no image is left out; grid cells without an image are blanked.

  The figure is shown and then written to
  './drive/My Drive/Colab Notebooks/' under a random 32-character
  alphanumeric file name.

  Args:
    cluster: list of image file paths. An empty list draws nothing.
  """
  n_images = len(cluster)
  if n_images == 0:
    return

  n_rows = min(2, n_images)
  for i in reversed(range(2, int(np.sqrt(n_images)) + 1)):
    if n_images % i == 0:
      n_rows = i
      break

  # Ceiling division, so a size not divisible by n_rows still fits.
  n_cols = -(-n_images // n_rows)

  # squeeze = False keeps axs two-dimensional even for a single row/column.
  fig, axs = plt.subplots(n_rows, n_cols, squeeze = False)
  axes = list(axs.flat)  # row-major order
  for ax, path in zip(axes, cluster):
    ax.imshow(mpimg.imread(path))
  for ax in axes[n_images:]:
    ax.axis('off')

  fig.show()
  fig.savefig('./drive/My Drive/Colab Notebooks/' + ''.join([random.choice(string.ascii_letters + string.digits) for n in range(32)]) + '.png')

In [0]:
# Show one randomly chosen angular-distance cluster that has more than 4 images.
# Choosing from the filtered list cannot loop forever when no such cluster exists.
large_angular_clusters = [cluster for cluster in cltrs_angular if len(cluster) > 4]
if large_angular_clusters:
  show_cluster(random.choice(large_angular_clusters))
else:
  print('No angular-distance cluster has more than 4 images.')

In [0]:
# Show one randomly chosen averaged-distance cluster that has more than 4 images.
# Choosing from the filtered list cannot loop forever when no such cluster exists.
large_averaged_clusters = [cluster for cluster in cltrs_averaged if len(cluster) > 4]
if large_averaged_clusters:
  show_cluster(random.choice(large_averaged_clusters))
else:
  print('No averaged-distance cluster has more than 4 images.')

Below is a multi-process version of our clustering function (it uses a `multiprocessing.Pool`), without the spatial-locality assumption made in `simplified_clusters` above.


In [0]:
from multiprocessing import Pool
from functools import partial

def return_dist(base_image, distftn, path):
  """Return the distance from `base_image` to one not-yet-clustered image.

  Args:
    base_image: the image the candidate is compared against.
    distftn: callable taking (base_image, image) and returning a distance.
    path: dict with keys 'cluster' (-1 while unassigned) and 'path' (the
      image file path).

  Returns:
    None if the entry already belongs to a cluster; otherwise
    `distftn(base_image, image)`, where image is the file resized to
    224x224 and converted to RGB.
  """
  if path['cluster'] != -1:
    return None

  # Convert to RGB so every image handed to distftn has the same channels,
  # whatever mode the file was stored in. The context manager closes the
  # file; resize/convert return new images.
  with Image.open(path['path']) as raw:
    image = raw.resize((224, 224), resample = Image.LANCZOS).convert('RGB')
  return distftn(base_image, image)

def clusters(img_paths, thresh, distftn, n_workers = 150):
  """Greedily cluster images without assuming any ordering.

  Images are scanned in order. Each image that has no cluster yet starts a
  new one and becomes its base image; every later unassigned image whose
  distance to that base is at most `thresh` joins the cluster. The
  distances for one base image are computed in parallel by a process pool.

  Args:
    img_paths: iterable of image file paths.
    thresh: largest distance at which an image joins a cluster.
    distftn: callable taking (base_image, image) and returning a distance.
      It is sent to worker processes, so it must be picklable.
    n_workers: number of worker processes in the pool.

  Returns:
    A list of clusters, each a list of paths in input order. An empty
    input yields an empty list.
  """
  augmented_img_paths = [{'cluster': -1 , 'path': path } for path in img_paths]
  if not augmented_img_paths:
    return []

  num_clusters = 0
  # One pool for the whole run instead of a fresh pool per cluster.
  with Pool(n_workers) as pool:
    for index, entry in enumerate(augmented_img_paths):
      print('{}/{}'.format(index + 1, len(augmented_img_paths)))
      if entry['cluster'] != -1:
        continue

      cluster_id = num_clusters
      num_clusters += 1
      entry['cluster'] = cluster_id
      with Image.open(entry['path']) as raw:
        base_image = raw.resize((224, 224), resample = Image.LANCZOS).convert('RGB')

      # Workers receive copies of these dicts; assignments are applied
      # here, in the parent, to the original entries.
      remaining = augmented_img_paths[index + 1:]
      dists = pool.map(partial(return_dist, base_image, distftn), remaining)

      for candidate, dist in zip(remaining, dists):
        if dist is not None and dist <= thresh:
          candidate['cluster'] = cluster_id

  # num_clusters is a count here, so every cluster id has a slot.
  grouped = [[] for _ in range(num_clusters)]
  for entry in augmented_img_paths:
    grouped[entry['cluster']].append(entry['path'])

  return grouped