# Crawl Google search Images
by picking a keyword + get its hyponyms

In [1]:
! pip install icrawler==0.6.6

Collecting icrawler==0.6.6
  Downloading icrawler-0.6.6-py2.py3-none-any.whl (35 kB)
Installing collected packages: icrawler
Successfully installed icrawler-0.6.6


In [2]:
import numpy as np 
import pandas as pd 
import os
from nltk.corpus import wordnet as wn
from icrawler.builtin import GoogleImageCrawler
import shutil
from datetime import date

In [3]:
imagenet_classes_path='/kaggle/input/imagenet-synset-classes/' # insert your own here
with open(imagenet_classes_path+'imagenet_synsets_classes.txt') as f:
    lines = f.readlines()

In [4]:
print(lines[0:10])

['n02119789 1 kit_fox\n', 'n02100735 2 English_setter\n', 'n02110185 3 Siberian_husky\n', 'n02096294 4 Australian_terrier\n', 'n02102040 5 English_springer\n', 'n02066245 6 grey_whale\n', 'n02509815 7 lesser_panda\n', 'n02124075 8 Egyptian_cat\n', 'n02417914 9 ibex\n', 'n02123394 10 Persian_cat\n']


### Select keyword and find hyponyms

In [5]:
pick = 'cat'                                 # you can add another keyword here
wn_pick = wn.synsets(pick, pos=wn.NOUN)[0]   # find the first WordNet synset of the picked keyword
print(wn_pick)
    
# get all hyponyms of selected keyword (hyponyms of any depth)
pick_hypo = list(set([w for s in wn_pick.closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))
c = [i.replace("_", " ").lower() for i in pick_hypo]

# add the picked keyword (e.g. cat) to disambiguate classes
wordnet_classes = list(set([i+' '+pick if pick not in i else i for i in c]))
print(wordnet_classes)
print(len(wordnet_classes))

Synset('cat.n.01')
['puma cat', "pallas's cat", 'serval cat', 'egyptian cat', 'painter cat', 'siamese cat', 'felis serval cat', 'tabby cat', 'felis chaus cat', 'bay lynx cat', 'mouser cat', 'calico cat', 'eyra cat', 'persian cat', 'mountain lion cat', 'lynx lynx cat', 'gib cat', 'catamountain', 'felis domesticus cat', 'margay cat', 'puss cat', 'maltese cat', 'tom cat', 'wildcat', 'felis bengalensis cat', 'jaguarundi cat', 'leopard cat', 'felis tigrina cat', 'burmese cat', 'felis wiedi cat', 'canada lynx cat', 'manx cat', 'tortoiseshell cat', 'bobcat', 'queen cat', 'felis catus', 'felis manul cat', 'blue point siamese cat', 'abyssinian cat', 'pussy cat', 'panther cat', 'manul cat', 'domestic cat', 'tortoiseshell-cat', 'angora cat', 'common lynx cat', 'tomcat', 'lynx cat', 'european wildcat', 'jaguarondi cat', 'caracal cat', 'felis pardalis cat', 'alley cat', 'felis ocreata cat', 'kitty cat', 'sand cat', 'ocelot cat', 'felis yagouaroundi cat', 'desert lynx cat', 'lynx canadensis cat', 'f

### Filter out hyponyms not belonging to ImageNet class labels

In [6]:
wordnet_imagenet_classes = []
all_imagenet_classes = []
for item in lines:
    imagenet_class = item.split(' ')[2]   # according to the .txt format, the last entry is the class name
    imagenet_class = imagenet_class.replace('_', ' ').lower().rstrip()
    all_imagenet_classes.append(imagenet_class)    

# let's keep the intersection between ImageNet and WordNet hyponyms
for imagenet_class in all_imagenet_classes:
    # is ImageNet class also a WordNet hyponym of the selected keyword?
    if (imagenet_class+' cat' in wordnet_classes):
        wordnet_imagenet_classes.append(imagenet_class+' cat')
    elif (imagenet_class in wordnet_classes):
        wordnet_imagenet_classes.append(imagenet_class)
wordnet_imagenet_classes.append(pick)

# those are the final classes for classification
print(len(wordnet_imagenet_classes))
print(wordnet_imagenet_classes)

11
['egyptian cat', 'persian cat', 'cougar cat', 'tiger cat', 'tiger cat', 'leopard cat', 'siamese cat', 'angora cat', 'tabby cat', 'lynx cat', 'cat']


### Gather web images

In [7]:
def crawl_images(class_name, dir_name):
    filters = dict(date=((2005, 1, 1), (2023, 1, 1)))
    google_crawler = GoogleImageCrawler(storage={'root_dir': dir_name})
    google_crawler.crawl(class_name, max_num=50)

    return

In [8]:
# crawl images for keyword+hyponyms
for class_name in wordnet_imagenet_classes:
    print(class_name)
    crawl_images(class_name, class_name)
    #print(len([name for name in os.listdir('./'+class_name)]))
    
# store images in .zip
shutil.make_archive('crawl_data_'+pick, 'zip', './')

egyptian cat
persian cat
cougar cat
tiger cat
tiger cat
leopard cat
siamese cat
angora cat
tabby cat
lynx cat
cat


'/kaggle/working/crawl_data_cat.zip'