Installing required libraries

In [None]:
!pip install icrawler

In [None]:
from google.colab import drive

import re
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from icrawler.builtin import GoogleImageCrawler


%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive (Colab only). force_remount=True is
# passed so the mount is re-established even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Work from the Drive root so the relative ./feather-in-focus paths used in
# this notebook resolve.
%cd /content/drive/MyDrive/

Crawler Configuration

In [None]:
# Google Images crawler: one feeder thread, two parser threads and four
# downloader threads. Everything it downloads is stored under root_dir.
google_crawler = GoogleImageCrawler(
    storage={'root_dir': './feather-in-focus/data_augmentation/test_full_data'},
    feeder_threads=1,
    parser_threads=2,
    downloader_threads=4,
)

# Search filters handed to every crawl. Only the image size is restricted.
# Other options kept for reference (currently disabled):
#   color='orange'
#   license='commercial,modify'
#   date=((2017, 1, 1), (2017, 11, 30))
filters = {'size': 'medium'}


In [None]:
# Dataset root (relative to the current working directory) and the two image
# folders inside it.
ROOT = "./feather-in-focus"
TRAINING_PATH = ROOT + "/train_images"
TEST_PATH = ROOT + "/test_images"

In [None]:
# Attribute array, wrapped in a DataFrame for inspection.
attributes = np.load(f"{ROOT}/attributes.npy")
df_attributes = pd.DataFrame(attributes)

# class_names.npy stores a pickled dict, hence allow_pickle=True and .item().
# NOTE: unpickling can execute arbitrary code - only load this file from a
# trusted source.
class_names = np.load(f"{ROOT}/class_names.npy", allow_pickle=True).item()

# Dict keys go into the 'Class' column; dict values become the 'Description'
# index, which is what the label lookups in this notebook are keyed on.
class_names_df = pd.DataFrame(
    list(class_names.items()),
    columns=['Class', 'Description'],
).set_index('Description')

train_images = pd.read_csv(f"{ROOT}/train_images.csv")
# NOTE(review): test_images is read from train_images.csv, the same file as
# train_images - this looks like a copy-paste slip. Point it at the test-set
# CSV once its file name is confirmed.
test_images = pd.read_csv(f"{ROOT}/train_images.csv")

In [None]:
# Attach the class name to every training row: the 'label' column of
# train_images is matched against the 'Description' index of class_names_df.
final_df = pd.merge(
    train_images,
    class_names_df,
    left_on='label',
    right_on='Description',
)

In [None]:
# Number of training rows per class name.
final_df["Class"].value_counts()

# Crawler

In [None]:
def get_last_image(df):
  """Return the next free image number after the last row of ``df``.

  The path is taken from the first column of the last row *by position*, so
  the result is correct even when the index labels are not 0..n-1 (for
  example a frame produced by ``pd.concat`` without ``ignore_index``, whose
  labels restart at 0 for every concatenated piece).

  Args:
    df: DataFrame whose first column holds image paths containing exactly one
      run of digits, e.g. ``/train_images/123.jpg``.

  Returns:
    int: the number found in the last path, plus one.

  Raises:
    ValueError: if ``df`` has no rows, or if the last path does not contain
      exactly one number.
  """
  if len(df) == 0:
    raise ValueError("Cannot determine the last image of an empty DataFrame.")

  # iloc[-1, 0] is purely positional; the previous df.iloc[df.index[-1], 0]
  # used an index *label* as a row *position*.
  last_path = df.iloc[-1, 0]
  file_number = re.findall(r'\d+', last_path)

  if len(file_number) != 1:
     raise ValueError("Expected EXACTLY one number in the last path.")

  return int(file_number[0]) + 1

def get_available_images(y_label):
  """Return how many rows of the global ``final_df`` carry ``y_label``.

  The count is looked up with ``.loc`` on the per-label value counts, so a
  label that never occurs in ``final_df`` raises ``KeyError``.
  """
  per_label_counts = final_df["label"].value_counts()
  return per_label_counts.loc[y_label]

def get_search_term(label, extra_text=''):
  """Build the image-search query for ``label``.

  The 'Class' entry for ``label`` is read from the global ``class_names_df``.
  The text after the first "<digits>." in it is kept, underscores are
  replaced with spaces, and a space plus ``extra_text`` is appended.

  When the entry contains no "<digits>." prefix the fixed fallback query
  "Twitter logo" is returned and ``extra_text`` is not appended.
  """
  class_entry = class_names_df.loc[label, "Class"]

  found = re.search(r'\d+\.(.+)$', class_entry)
  if not found:
    return "Twitter logo"

  readable_name = found.group(1).replace('_', ' ')
  return readable_name + ' ' + extra_text

def crawl_for_birds(label, aditional_info, number_img, file_idx_offset,filters=filters, google_crawler=google_crawler):
  """Run one crawl for ``label`` and log what is being requested.

  Args:
    label: label whose search term is built with ``get_search_term``.
    aditional_info: extra text appended to the search term.
    number_img: passed to the crawler as ``max_num``.
    file_idx_offset: passed to the crawler as ``file_idx_offset``.
    filters: search filters; defaults to the module-level ``filters`` object
      that existed when this function was defined.
    google_crawler: crawler to use; defaults to the module-level
      ``google_crawler`` that existed when this function was defined.
  """
  search_term = get_search_term(label, aditional_info)
  print(f"Downloading label {label} ({search_term}), a numer of {number_img} with a starting index of {file_idx_offset}")
  google_crawler.crawl(
      keyword=search_term,
      max_num=number_img,
      file_idx_offset=file_idx_offset,
      filters=filters,
  )


# Running the crawler over the whole dataset

In [None]:
# Target number of images per label after augmentation.
img_per_class = 50
folder = "/train_images/"
extension = ".jpg"

# First free file number: one past the last image listed in train_images.
# It is advanced as a running counter after every batch instead of being
# re-derived from augmented_df, which returned already-used numbers and
# failed while augmented_df was still empty.
next_image = get_last_image(train_images)

# Computed once; iterating it yields (label, available image count) pairs in
# the same order as value_counts().index.
label_counts = final_df["label"].value_counts()
new_batches = []

for label, available in label_counts.items():
  images_needed = int(img_per_class - available)
  if images_needed <= 0:
    # This label already has enough images; nothing to download.
    continue

  ## Call the crawler
  crawl_for_birds(label, "alone", images_needed, next_image)

  # NOTE(review): these rows assume every requested image was downloaded and
  # saved as "<number>.jpg" - confirm against the crawler's actual file names
  # and the number of files it really fetched.
  new_batches.append(pd.DataFrame({
    'image_path': [folder + str(value) + extension
                   for value in range(next_image,
                                      next_image + images_needed)],
    'label': [label] * images_needed
  }))

  next_image += images_needed

# Concatenate once; ignore_index gives the result a unique 0..n-1 index
# instead of labels that restart at 0 for every batch.
if new_batches:
  augmented_df = pd.concat(new_batches, ignore_index=True)
else:
  augmented_df = pd.DataFrame(columns=['image_path', 'label'])

Saving the augmented DataFrame to CSV

In [None]:
# Write the generated (image_path, label) rows to disk in the current working
# directory. With default to_csv settings the index is written as the first
# column.
# NOTE(review): the file name has no ".csv" extension - confirm that whatever
# reads this file expects that name.
augmented_df.to_csv('augmented_dataframe')

In [None]:
# Total number of augmented rows that have a non-null label.
augmented_df["label"].value_counts().sum()

In [None]:
# List what is actually present in the crawler's storage directory.
os.listdir("./feather-in-focus/data_augmentation/test_full_data")