<a href="https://colab.research.google.com/github/lowlypalace/StyleGAN2/blob/main/Data_Collection_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Run Setup
!apt-get install libmagic-dev
!pip install python-magic
!pip install simplejson

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libmagic-dev is already the newest version (1:5.32-2ubuntu0.4).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [None]:
#@title Search Query
#@markdown Set the search query. Use underscore instead of spaces.
# `key` identifies the dataset: later cells use it for directory and file names
# and replace its underscores with spaces to build the actual search query.
key = 'abstract_art' #@param {type:"string"}

In [None]:
#@markdown Set base directory
# `os` must be imported in this cell: it runs before any other cell imports it.
import os

# Root folder under which the later cells create their "datasets", "clustering",
# "out", "tmp" and "zips" sub-folders.
base = "/content/drive/MyDrive/Data" #@param {type:"string"}

# Create the folder if needed; an already existing folder is fine, but real
# failures (e.g. missing permissions) are no longer silently swallowed.
os.makedirs(base, exist_ok=True)

# Data Collection
The Bing Search API allows us to collect images for the lyrics search terms from Bing Image Search.

- To run, give the script an image search query `search_query`. It searches for the query and downloads the images into the `out_dir` directory, a subdirectory named after `search_query` with the spaces replaced by underscores.
- Set `num_images` to the total number of pictures to download
- Set `group_size` to how many photos to search per page
- Set `api_key` to your Bing Search API key

In [None]:
#@title ## Set Search Options
from easydict import EasyDict as edict
from requests import exceptions
import argparse
import requests
from PIL import Image
import os
import magic

def setup_search_options(key, **kwargs):
  """Build the options used by `get_images` for one search key.

  The form values below are defaults. Any entry found in the optional
  `search_options` keyword argument (a dict) takes precedence over them.

  Args:
    key: Dataset key; underscores are replaced by spaces to form the query.
    **kwargs: May contain `search_options`, a dict with any of
      `search_query`, `num_images`, `group_size` and `api_args`
      (`api_key`, `url`, `headers`, `params`).

  Returns:
    A tuple `(args, out_dir)`: the resolved options as an EasyDict and the
    directory the images are saved to.
  """

  search_query = key.replace("_", " ") #@param {type:"raw"}

  #@markdown Set output directory
  out_dir = os.path.join(base, "datasets", key) #@param {type:"raw"} 
  #@markdown Number of images to search for
  num_images =  500#@param {type:"integer"}
  #@markdown How many images to search per page
  group_size = 50 #@param {type:"integer"}
  #@markdown ---

  #@markdown # Set API Settings
  #@markdown Set the API key
  api_key = "" #@param {type:"string"}
  #@markdown Set the API url
  url = "https://api.bing.microsoft.com/v7.0/images/search" #@param {type:"string"}

  # `or {}` also covers an explicit `search_options=None`
  args = edict(kwargs.get('search_options') or {})
  args.api_args = getattr(args, 'api_args', edict())

  args.search_query = getattr(args, 'search_query', search_query)
  args.num_images = getattr(args, 'num_images', num_images)
  args.group_size = getattr(args, 'group_size', group_size)
  args.api_args.api_key = getattr(args.api_args, 'api_key', api_key)
  args.api_args.url = getattr(args.api_args, 'url', url)

  # Build the request headers and parameters from the *resolved* values so
  # that an overridden query, page size or API key is actually sent to the API.
  headers = {"Ocp-Apim-Subscription-Key": args.api_args.api_key} # Set API headers
  params = {"q": args.search_query, "offset": 0, "count": args.group_size} # Set API parameters
  args.api_args.headers = getattr(args.api_args, 'headers', headers)
  args.api_args.params = getattr(args.api_args, 'params', params)

  return args, out_dir

def get_images(key, search_options, out_dir):
  """Query the image search API page by page and save JPEG/PNG results.

  Every downloaded file is written to `out_dir` as
  `<5-digit counter>-<key>`; its MIME type is then detected with `magic`
  and the file is renamed with a `.jpg`/`.png` extension or deleted when
  it is neither. Failed downloads are reported and skipped.

  Args:
    key: Dataset key, used in the saved file names.
    search_options: Options as returned by `setup_search_options`.
    out_dir: Existing directory the images are written to.

  Raises:
    requests.HTTPError: If one of the search requests itself fails.
  """
  search_query = search_options.search_query
  num_images = search_options.num_images
  group_size = search_options.group_size
  url = search_options.api_args.url
  headers = search_options.api_args.headers
  # Work on a copy: the offset is updated per page and must not leak back
  # into the caller's options.
  params = dict(search_options.api_args.params)

  # Initialize the search
  print("Searching Bing API for '{}'".format(search_query))
  search = requests.get(url, headers=headers, params=params)
  search.raise_for_status()

  results = search.json()
  estNumResults = min(results["totalEstimatedMatches"], num_images)
  print("Found {} results for '{}'".format(estNumResults, search_query))
  print()

  # Initialize the total number of images downloaded
  images_count = 0

  # Loop over the estimated number of results in group_size
  for offset in range(0, estNumResults, group_size):
      # Stop paging once the requested number of images has been saved
      if images_count >= estNumResults:
          break

      # Update the search parameters using the current offset
      print("Making request for group {}-{} of {}...".format(
          offset, offset + group_size, estNumResults))
      params["offset"] = offset
      search = requests.get(url, headers=headers, params=params)
      search.raise_for_status()
      results = search.json()
      print("Saving images for group {}-{} of {}...".format(
          offset, offset + group_size, estNumResults))

      # Loop over the results
      for v in results["value"]:
          # A page may hold more results than are still needed
          if images_count >= estNumResults:
              break

          try:
              # Make a request to download the image
              r = requests.get(v["contentUrl"], timeout=30)

              # Save image (the `with` block closes the file)
              out_img = os.path.join(out_dir, f"{str(images_count).zfill(5)}-{key}")
              with open(out_img, 'wb') as f:
                  f.write(r.content)

              # Check if an image is an actual image file
              img_type = magic.from_file(out_img, mime=True)
              if img_type == 'image/jpeg':
                  os.rename(out_img, f"{out_img}.jpg")
              elif img_type == 'image/png':
                  os.rename(out_img, f"{out_img}.png")
              else:
                  os.remove(out_img)
                  continue

          # Best effort: report the problem and move on to the next result.
          # Skipping on *any* exception keeps the counter equal to the number
          # of images that were really saved.
          except Exception as e:
              print(e)
              continue

          # Update the counter
          images_count += 1

  print()
  print(f"Number of saved images: {images_count}")

def run_search(key, **kwargs):
  """Resolve the search options for `key`, print them and download the images.

  Args:
    key: Dataset key (search query with underscores instead of spaces).
    **kwargs: Forwarded to `setup_search_options`; may contain
      `search_options` to override the form defaults.
  """

  search_options, out_dir = setup_search_options(key, **kwargs)

  # Print options
  print()
  print('Search options:')
  print(f'Search query:      {search_options.search_query}')
  print(f'Number of images:  {search_options.num_images}')
  print(f'Output directory:  {out_dir}')
  print()

  # Create output directory; an existing one is reused, other errors surface
  os.makedirs(out_dir, exist_ok=True)

  # Kick off image search
  get_images(key, search_options, out_dir)

In [None]:
#@title ## Run Search
# Download the images for the query configured in `key`, using the form defaults.
run_search(key)

# Image Clustering 
Once the images have been collected, a clustering algorithm is used to detect the classes of pictures. The algorithm uses K-Means for clustering and TensorFlow Keras applications with weights pre-trained on ImageNet to vectorize the images.

- Set `data_dir` to the directory with images for clustering.
- Set `result_dir` as the output directory. The resulting clusters will be added to it as subfolders named `cluster0`, `cluster1`, and so on.
- Set the number of clusters `num_clusters` to be created by the clustering algorithm.
- Select ImageNets (choose from: `"Xception"`, `"VGG16"`, `"VGG19"`, `"ResNet50"`, `"InceptionV3"`, `"InceptionResNetV2"`, `"DenseNet"`, `"MobileNetV2"`). You can also set it to `False` to not use any. Link: [https://keras.io/api/applications/](https://keras.io/api/applications/)


- Set the `num_images` to the number of images to cluster. When set to `None`, all of the images in `data_dir` will be clustered.

In [None]:
#@title ## Set Clustering Options
from easydict import EasyDict as edict

def setup_clustering_options(key, **kwargs):
  """Build the options used by `run_clustering` for one dataset key.

  The form values below are defaults. Any entry found in the optional
  `clustering_options` keyword argument (a dict) takes precedence.

  Args:
    key: Dataset key; selects the `datasets/<key>` input folder and the
      `clustering/<key>` output folder below `base`.
    **kwargs: May contain `clustering_options`, a dict with any of
      `num_clusters`, `num_images`, `use_imagenets`, `use_pca`, `shape`.

  Returns:
    A tuple `(args, data_dir, result_dir)`.
  """

  #@markdown Path with images for clustering
  data_dir = os.path.join(base, "datasets", key) #@param {type:"raw"}
  #@markdown Path of the output folder
  result_dir = os.path.join(base, "clustering", key) #@param {type:"raw"}
  #@markdown Number of clusters
  num_clusters = 10 #@param {type:"slider", min:1, max:20, step:1} 
  #@markdown Number of examples to use, if "None" all of the images will be used
  num_images = None #@param {type:"raw"}
  #@markdown Set shape
  shape = (224, 224) #@param {type:"raw"}
  #@markdown Select ImageNet 
  use_imagenets = 'Xception' #@param ["False", "'Xception'", "'VGG16'", "'ResNet50'", "'InceptionV3'", "'InceptionResNetV2'", "'DenseNet'"] {type:"raw", allow-input: true}

  if not use_imagenets:
    use_pca = False
  else:
    #@markdown Use PCA for dimensionality reduction
    use_pca = False #@param {type:"boolean"}

  # `or {}` also covers an explicit `clustering_options=None`
  args = edict(kwargs.get('clustering_options') or {})

  args.num_clusters = getattr(args, 'num_clusters', num_clusters)
  args.use_imagenets = getattr(args, 'use_imagenets', use_imagenets)
  args.use_pca = getattr(args, 'use_pca', use_pca)
  args.shape = getattr(args, 'shape', shape)

  # Resolve the image count last and clamp it to what is actually available,
  # so that an overridden value can never exceed the number of files.
  available = len(os.listdir(data_dir))
  requested = getattr(args, 'num_images', num_images)
  if requested is None or requested > available:
    requested = available
  args.num_images = requested

  return args, data_dir, result_dir

import random
import cv2
import os
import sys
import shutil
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import tensorflow

def load_images(num_images, shape, data_dir, result_dir):
  """Load a random selection of images from `data_dir` as RGB arrays.

  Args:
    num_images: Maximum number of files to load.
    shape: Target size passed to `cv2.resize`.
    data_dir: Directory whose entries are loaded.
    result_dir: Unused; kept so existing callers keep working.

  Returns:
    A tuple `(images, image_paths)`: the stacked images, resized, converted
    from BGR to RGB and divided by 255, and the matching file names
    (relative to `data_dir`) in the same order.

  Raises:
    ValueError: If a file cannot be decoded as an image.
  """

  paths = os.listdir(data_dir)
  random.shuffle(paths)
  image_paths = paths[:num_images]

  images = []
  for image in image_paths:
    full_path = os.path.join(data_dir, image)
    raw = cv2.imread(full_path)
    # cv2.imread signals an unreadable file by returning None; fail with a
    # message naming the file instead of an obscure error inside cv2.resize.
    if raw is None:
      raise ValueError(f"Could not read image file: {full_path}")
    resized = cv2.resize(raw, tuple(shape))
    images.append(cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) / 255)
  images = np.array(images)

  return images, image_paths

def imagenet_classify(use_imagenets, use_pca, images):
  """Turn a batch of images into one flat feature vector per image.

  Args:
    use_imagenets: Name of the Keras application to use (case-insensitive:
      "vgg16", "vgg19", "resnet50", "xception", "inceptionv3",
      "inceptionresnetv2", "densenet", "mobilenetv2"), or False to use the
      raw pixels.
    use_pca: When true and a network is used, the network features are
      additionally transformed with PCA.
    images: Array whose first axis indexes the images; the remaining axes
      are passed to the network as its input shape.

  Returns:
    A 2-D array with one row per image.

  Raises:
    ValueError: If `use_imagenets` is not a supported name.
  """
  # Without a network the pixels themselves are the features; they are
  # flattened so that the result is 2-D like the network features.
  if not use_imagenets:
    return images.reshape(images.shape[0], -1)

  # Accepted names -> class names exposed by tensorflow.keras.applications
  architectures = {
      "vgg16": "VGG16",
      "vgg19": "VGG19",
      "resnet50": "ResNet50",
      "xception": "Xception",
      "inceptionv3": "InceptionV3",
      "inceptionresnetv2": "InceptionResNetV2",
      "densenet": "DenseNet201",
      "mobilenetv2": "MobileNetV2",
  }
  class_name = architectures.get(str(use_imagenets).lower())
  if class_name is None:
    # Raise instead of sys.exit(): exiting would kill the notebook kernel or
    # a worker process without a usable error.
    raise ValueError(
        "Please use one of the following keras applications only "
        f"{sorted(architectures)} or False")

  # Headless network with ImageNet weights; the input shape follows the
  # loaded images instead of being fixed to 224x224.
  network = getattr(tensorflow.keras.applications, class_name)
  model1 = network(include_top=False, weights="imagenet",
                   input_shape=tuple(images.shape[1:]))

  pred = model1.predict(images)
  images_temp = pred.reshape(images.shape[0], -1)
  if not use_pca:
    return images_temp

  # Return the transformed features, not the fitted PCA object
  model2 = PCA(n_components=None, random_state=40)
  return model2.fit_transform(images_temp)

def cluster_images(num_clusters, num_images, data_dir, result_dir, image_paths, images_out):
  """Cluster the feature vectors with K-Means and copy each image to its cluster folder.

  Args:
    num_clusters: Number of K-Means clusters.
    num_images: Maximum number of images to copy.
    data_dir: Directory containing the source images.
    result_dir: Directory receiving the `cluster<N>` sub-folders.
    image_paths: File names (relative to `data_dir`), in the same order as
      the rows of `images_out`.
    images_out: 2-D feature array with one row per image.
  """
  # `n_jobs` is no longer accepted by current scikit-learn releases, so it is
  # not passed.
  model = KMeans(n_clusters=num_clusters, random_state=40)
  predictions = model.fit_predict(images_out)

  # Copy images to result_dir. Pairing names with labels (instead of indexing
  # up to num_images) cannot run past the end of either sequence.
  for image_name, cluster_id in zip(image_paths[:num_images], predictions):
    name, ext = os.path.splitext(image_name)
    cluster_dir = os.path.join(result_dir, f"cluster{cluster_id}")
    os.makedirs(cluster_dir, exist_ok=True)
    src = os.path.join(data_dir, image_name)
    dst = os.path.join(cluster_dir, f"{name}-cluster{cluster_id}{ext}")
    shutil.copy2(src, dst)

def run_clustering(key, **kwargs):
  """Run the whole clustering step for `key`.

  Resolves the options, recreates the output directory with one empty
  `cluster<N>` sub-folder per cluster, loads the images, computes their
  features and copies every image into the folder of its cluster.

  Args:
    key: Dataset key.
    **kwargs: Forwarded to `setup_clustering_options`.
  """
  opts, data_dir, result_dir = setup_clustering_options(key, **kwargs)
  n_clusters = opts.num_clusters
  n_images = opts.num_images

  # Print options
  report = [
      '',
      'Clustering options:',
      f'Number of clusters:     {n_clusters}',
      f'Number of images:       {n_images}',
      f'Selected ImageNet:      {opts.use_imagenets}',
      f'Data directory:         {data_dir}',
      f'Output directory:       {result_dir}',
      '',
  ]
  for line in report:
    print(line)

  # Start from an empty output directory: drop a previous result if present
  try:
    shutil.rmtree(result_dir)
  except FileNotFoundError:
    pass
  os.makedirs(result_dir)

  # One sub-folder per cluster
  for cluster_index in range(n_clusters):
    os.makedirs(os.path.join(result_dir, "cluster" + str(cluster_index)))

  # Load images
  print(f"Loading images from {data_dir}")
  images, image_paths = load_images(n_images, opts.shape, data_dir, result_dir)
  print(f"{n_images} images have been loaded in a random order")
  print()

  # Compute the feature vectors
  print("Classifying images with ImageNet")
  print()
  images_out = imagenet_classify(opts.use_imagenets, opts.use_pca, images)

  # Cluster and copy
  print("Starting clustering.")
  cluster_images(n_clusters, n_images, data_dir, result_dir, image_paths, images_out)
  print(f"Successfully added images to {n_clusters} clusters")

In [None]:
#@title ## Run Clustering
# Cluster the images previously downloaded for `key`, using the form defaults.
run_clustering(key)

# Merge Clusters
This step merges the selected clusters into one directory.

- Set `clusters` to the list of clusters. For example, `clusters = [0, 1, 7]`.
- Set `data_dir` to the data directory with clusters subfolders.
- Set `out_dir` to the output directory.


In [None]:
from easydict import EasyDict as edict
import os

def setup_merging_options(key, **kwargs):
  """Resolve which clusters to merge and the folders involved.

  Args:
    key: Dataset key; selects `clustering/<key>` as input and `out/<key>`
      as output folder below `base`.
    **kwargs: May contain `merging_options`, a dict whose `clusters` entry
      replaces the form default.

  Returns:
    A tuple `(args, data_dir, out_dir)`.
  """

  #@markdown Set list of clusters
  clusters = [0, 2, 3, 6, 7] #@param {type:"raw"}
  #@markdown Set data directory
  data_dir = os.path.join(base, "clustering", key) #@param {type:"raw"}
  #@markdown Set output directory
  out_dir = os.path.join(base, "out", key) #@param {type:"raw"}

  # Caller-supplied options win over the form default
  overrides = kwargs.get('merging_options', None)
  args = edict(overrides)
  args.clusters = getattr(args, 'clusters', clusters)

  return args, data_dir, out_dir

def merge_clusters(key, **kwargs):
  """Copy the files of the selected cluster folders into one output directory.

  Args:
    key: Dataset key.
    **kwargs: Forwarded to `setup_merging_options`; may contain
      `merging_options` with a `clusters` list.
  """
  # This cell does not import shutil itself; import it here so the function
  # does not depend on another cell having been run first.
  import shutil

  merging_options, data_dir, out_dir = setup_merging_options(key, **kwargs)
  clusters = merging_options.clusters

  print()
  print('Merging options:')
  print(f'Selected clusters:      {clusters}')
  print(f'Data directory:         {data_dir}')
  print(f'Output directory:       {out_dir}')
  print()

  if not clusters:
    print("No clusters selected.")
    return

  # Create output directory; an existing one is reused, other errors surface
  os.makedirs(out_dir, exist_ok=True)

  # Copy files from each selected cluster
  for cluster_num in clusters:
    src = os.path.join(data_dir, f"cluster{cluster_num}")
    # A wrong cluster number should not abort the remaining clusters
    if not os.path.isdir(src):
      print(f"Skipping missing cluster directory: {src}")
      continue

    for file_name in os.listdir(src):
      full_file_name = os.path.join(src, file_name)
      if os.path.isfile(full_file_name):
        shutil.copy(full_file_name, out_dir)

  # Count the files directly inside the output directory
  _, _, files = next(os.walk(out_dir))
  file_count = len(files)
  print(f"Completed. The merged clusters have {file_count} images")

In [None]:
#@title ## Merge Clusters
# Merge the clusters selected in the form above for `key`.
merge_clusters(key)

# Run Multiple Search Queries

- Set `data` to the json with search queries and clusters to merge. 
- To search images, set `search = True`.
- To cluster images, set `cluster = True`.
- To merge clusters, set `merge = True`.

In [None]:
#@title Edit JSON

from easydict import EasyDict as edict
import json

def create_json():
  """Parse the search configuration below and save it as `<base>/tmp/keys.json`.

  Each top-level key of the JSON is a search key; its value is a list of
  entries with `search_options`, `clustering_options` and `merging_options`.

  Returns:
    The parsed configuration as an EasyDict.
  """

  out_dir = os.path.join(base, "tmp") # Set output directory

  # An existing directory is reused; other errors are no longer swallowed
  os.makedirs(out_dir, exist_ok=True)

  # Set the search queries
  data = """
{
    "car": [
        {
            "merging_options": {
                "clusters": [1,2]
            },
            "clustering_options" : {},
            "search_options": {
                "num_images": 50
            }
        }
    ],
    "phone": [
        {
            "merging_options": {
                "clusters": [] 
            },
            "clustering_options" : {},
            "search_options": {
                "num_images": 50
            }
        }
    ]
}

  """

  keys = edict(json.loads(data))

  with open(os.path.join(out_dir, 'keys.json'), 'wt') as f:
      json.dump(keys, f, sort_keys=True, indent=4)
    
  return keys

# Parsed configuration; the "Run Tasks" cells below read it as a global.
keys = create_json()

The code snippet below runs multiple tasks in parallel, making it faster to search, cluster and merge datasets.


In [None]:
#@title Run Tasks
#@markdown Run multiple tasks in parallel
from multiprocessing import Pool

def log(key):
  """Redirect `sys.stdout` of the current process to `<base>/tmp/logs/<key>.txt`.

  Args:
    key: Search key; used as the log file name.
  """
  # This cell does not import sys itself
  import sys

  out_dir = os.path.join(base, "tmp", "logs")

  # An existing directory is reused; other errors are no longer swallowed
  os.makedirs(out_dir, exist_ok=True)

  # Line buffering writes every printed line to disk immediately, so the log
  # is complete even if the process ends without closing the file.
  sys.stdout = open(os.path.join(out_dir, f'{key}.txt'), 'w', buffering=1)

def main(key):
  """Run the selected tasks (search, cluster, merge) for one search key.

  Intended as the worker function of the process pool: with `log_out`
  enabled, everything printed while the tasks run goes to the key's log file.

  Args:
    key: A key of the global `keys` configuration.
  """
  # This cell does not import sys itself
  import sys

  #@markdown ### Save logs
  log_out = True #@param ["False", "True"] {type:"raw"}
  #@markdown ---

  #@markdown ### Select tasks
  search = False #@param {type:"boolean"}
  cluster = False #@param {type:"boolean"}
  merge = False #@param {type:"boolean"}
  #@markdown ---

  original_stdout = sys.stdout
  if log_out:
    log(key)

  try:
    for i in keys[key]:

      # `.get` lets an entry omit a section; the task then uses its defaults
      if search:
        run_search(key, search_options = i.get('search_options'))

      if cluster:
        run_clustering(key, clustering_options = i.get('clustering_options'))

      if merge:
        merge_clusters(key, merging_options = i.get('merging_options'))
  finally:
    # Close the log (flushing buffered output) and undo the redirection, so
    # nothing is lost and a reused worker does not keep writing to this file.
    if sys.stdout is not original_stdout:
      sys.stdout.close()
      sys.stdout = original_stdout

if __name__ == "__main__":
  # One task per search key of the configuration
  x = list(keys)

  # Three worker processes share the keys; map() waits until all have finished
  with Pool(processes=3) as p:
      p.map(main, x)

This code snippet runs the tasks sequentially. It is useful when there are only a few search queries.

In [None]:
#@title Run Tasks

def main():
  """Run the selected tasks (search, cluster, merge) for every key, one after another.

  The options of each entry in the global `keys` configuration are handed
  to the task functions through the keyword they actually read
  (`search_options`, `clustering_options`, `merging_options`); settings
  missing from an entry fall back to the form defaults of the task.
  """

  search = False #@param {type:"boolean"}
  cluster = False #@param {type:"boolean"}
  merge = False #@param {type:"boolean"}

  for key in keys:
    for i in keys[key]:

      # `.get` lets an entry omit a section; the task then uses its defaults
      if search:
        run_search(key, search_options = i.get('search_options'))

      if cluster:
        run_clustering(key, clustering_options = i.get('clustering_options'))

      if merge:
        merge_clusters(key, merging_options = i.get('merging_options'))
        
# Run the tasks selected in the form above for all configured keys.
if __name__ == "__main__":
    main()

# Zip Dataset
Don't forget to clean data before zipping

In [None]:
import os
import zipfile

def zipit():
    """Zip the selected folders below `<base>/out` into `<base>/zips/<zip_name>.zip`.

    Files are stored flat, under their bare file name.
    NOTE(review): files with the same name in different folders therefore end
    up as duplicate archive entries - confirm names are unique across folders.
    """

    base_dir = os.path.join(base, "out")

    zip_name = "dataset" #@param

    dir_names = ['dataset1', 'dataset2'] #@param {type:"raw"}

    folders = [os.path.join(base_dir, dir_name) for dir_name in dir_names]

    # The archive folder is not created anywhere else
    zip_dir = os.path.join(base, "zips")
    os.makedirs(zip_dir, exist_ok=True)
    zip_filename = os.path.join(zip_dir, f"{zip_name}.zip")

    count_images = 0

    # The `with` block closes the archive even if adding a file fails
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for folder in folders:
            # os.walk yields nothing for a missing folder; say so explicitly
            if not os.path.isdir(folder):
                print(f"Skipping missing folder: {folder}")
                continue

            for dirpath, _dirnames, filenames in os.walk(folder):
                for filename in filenames:
                    zip_file.write(os.path.join(dirpath, filename), filename)
                    count_images += 1

    print(f"{count_images} images have been zipped as {zip_name}.zip")

In [None]:
# Create the archive from the folders selected in the form above.
zipit()