# Datasets Downloader


Download images from three different sources: Bing, DuckDuckGo, and Google.


## Setup


In [None]:
import os

In [None]:
# Utils import
#
# Make the project root (the parent of this notebook's folder) importable so
# that the shared `utils` package can be loaded.

import sys

# `__file__` only exists when this code runs as a script exported from the
# notebook; inside a notebook kernel it is undefined and raises NameError.
# In that case fall back to the working directory, which is normally the
# folder containing the notebook.
try:
    currentDir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    currentDir = os.getcwd()

rootFolder = os.path.abspath(os.path.dirname(currentDir))

# Re-running the cell must not keep appending the same entry to sys.path.
if rootFolder not in sys.path:
    sys.path.append(rootFolder)

from utils.tasks import *

### Settings


In [None]:
# Names of the image sources; each one gets its own sub-folder below the
# temporary folder of the current task.
datasets = ["bing", "ddg", "google"]
datasetBaseFolder = "../tmp/" + currentTask

# One download folder per source, in the same order as `datasets`.
bingFolder, ddgFolder, googleFolder = (
    os.path.join(datasetBaseFolder, source) for source in datasets
)

#### Cats and Dogs


In [None]:
if currentTask == CAT_DOG:
    # Search queries used to collect images of cats.
    catQueries = [
        "cat", "little cat", "small cat", "calico cat", "cute cat",
        "norwegian forest cat", "black cat", "orange cat", "grey cat", "white cat",
        "fluffy cat", "siamese cat", "tabby cat", "munchkin cat", "persian cat",
        "angora cat", "bengal cat", "chartreux cat", "savannah cat", "ragdoll cat",
    ]

    # Search queries used to collect images of dogs.
    dogQueries = [
        "dog", "little dog", "small dog", "brown dog", "cute dog",
        "big dog", "black dog", "boxer dog", "grey dog", "white dog",
        "german shepherd dog", "golden retriever dog", "labrador dog", "samoyed dog", "siberian husky dog",
        "chihuahua dog", "bulldog", "doberman dog", "pug dog", "rottweiler dog",
    ]

    # `queries[i]` holds the search queries for `classes[i]`.
    classes = ["cat", "dog"]
    queries = [catQueries, dogQueries]


#### Men and Women


In [None]:
if currentTask == MAN_WOMAN:
    # Search queries used to collect images of men.
    manQueries = [
        "uomo", "man", "male", "blond man", "red hair man",
        "brunette man", "black hair man", "tall man", "short male", "asian man",
        "caucasian man", "black man", "fat man", "thin man", "fit man",
        "italian man", "japanese man", "american man", "old man", "young man",
    ]

    # Search queries used to collect images of women.
    womanQueries = [
        "donna", "woman", "female", "blond female", "red hair woman",
        "brunette woman", "black hair woman", "tall female", "short female", "asian woman",
        "caucasian woman", "black woman", "fat woman", "thin woman", "fit woman",
        "italian woman", "japanese woman", "american woman", "old woman", "young woman",
    ]

    # `queries[i]` holds the search queries for `classes[i]`.
    classes = ["man", "woman"]
    queries = [manQueries, womanQueries]

#### Bikes and Motorbikes

In [None]:
if currentTask == BIKE_MOTORBIKE:
    # `queries[i]` holds the search queries for `classes[i]`.
    classes = ["bike", "motorbike"]

    queries = [
        # Bicycle queries. The colour query is "red bike": the previous
        # "red motorbike" duplicated a motorbike query and stored motorbike
        # images under the "bike" class. "cannondale" fixes a brand-name typo.
        ["bicicletta", "bike", "city bike", "electric bike", "bianchi bike", "pinarello bike", "giant bike", "trek bike", "specialized bike", "cannondale bike", "scott bike",
            "kona bike", "black bike", "white bike", "red bike", "mountain bike", "graziella bike", "blue bike", "bmx bike", "green bike", "bici con cestino"],

        # Motorbike queries. "harley davidson" fixes a brand-name typo.
        # NOTE(review): "copper motorbike" may have been meant as
        # "chopper motorbike" - left unchanged, confirm with the author.
        ["moto", "motorbike", "yamaha motorbike", "aprilia motorbike", "bmw motorbike", "benelli motorbike", "beta motorbike", "ducati motorbike", "harley davidson motorbike", "honda motorbike",
        "black motorbike", "white motorbike", "red motorbike", "motocross", "copper motorbike", "old motorbike", "blue motorbike", "green motorbike", "yellow motorbike", "orange motorbike"]
    ]

### Create temporary folders for the downloaded images


In [None]:
# Make sure a <base>/<dataset>/<class> folder exists for every dataset/class
# pair, reporting only the folders that actually have to be created.
for dataset in datasets:
    dataFolder = os.path.join(datasetBaseFolder, dataset)

    for cls in classes:
        clsFolder = os.path.join(dataFolder, cls)

        # Nothing to do for folders left over from a previous run.
        if os.path.exists(clsFolder):
            continue

        print("[📂 CREATED FOLDER] {}".format(clsFolder))
        os.makedirs(clsFolder)


## Downloader


### Bing


In [None]:
from bbid import bing_downloader

print("[💾 BING DOWNLOADER]")

# Walk the classes together with their query lists; every query is downloaded
# into <bingFolder>/<class>/<query>.
for classQueries, className in zip(queries, classes):
    print("\n[🗃️ INFO] Downloading images for class {}".format(className))

    for searchTerm in classQueries:
        print("[🔍 INFO] Downloading images for query {}".format(searchTerm))

        # `type(...)` builds a throwaway class whose class attributes carry the
        # options. Presumably bing_downloader reads them like parsed
        # command-line arguments - confirm against the bbid module.
        bingOptions = type("obj", (object,), {
            "search_string": searchTerm,
            "output": os.path.join(bingFolder, className, searchTerm),
            "limit": 400,
            "inline": False,
        })
        bing_downloader(bingOptions)


### DuckDuckGo


In [None]:
import DuckDuckGoImages as ddg

print("[💾 DDG DOWNLOADER]")

# Walk the classes together with their query lists; every query is downloaded
# into <ddgFolder>/<class>/<query>.
for classQueries, className in zip(queries, classes):
    print("\n[🗃️ INFO] Downloading images for class {}".format(className))

    for searchTerm in classQueries:
        print("[🔍 INFO] Downloading images for query {}".format(searchTerm))

        # The target folder is created here, before the download starts.
        targetDir = os.path.join(ddgFolder, className, searchTerm)
        if not os.path.exists(targetDir):
            os.makedirs(targetDir)

        ddg.download(searchTerm, folder=targetDir, parallel=True, max_urls=400)


### Google


**Disclaimer:** To download images from Google with this tool, you must first download [chromedriver](https://chromedriver.chromium.org/downloads) and place it in the folder specified below.


In [None]:
from simple_image_download import Downloader

In [None]:
print("[💾 GOOGLE DOWNLOADER]")

# A fresh Downloader is configured for every class and receives all of the
# class's queries at once as a single comma-separated string.
for classQueries, className in zip(queries, classes):
    print("\n[🗃️ INFO] Downloading images for class {}".format(className))

    googleDownloader = Downloader()
    # The directory is passed with a trailing path separator.
    googleDownloader.directory = os.path.join(googleFolder, className) + os.sep

    joinedQueries = ",".join(classQueries)
    googleDownloader.download(joinedQueries, 400)

## Duplicate removal for queries


In [None]:
from difPy import dif

print("[🗑️ DUPLICATE REMOVAL]")

# For every dataset / class / query folder, let difPy look for duplicated
# images and delete every file it reports as a duplicate.
for dataset in datasets:
    print("\n" + "-" * 15)
    print("[🗃️ DATASET] {}".format(dataset))
    datasetFolder = os.path.join(datasetBaseFolder, dataset)

    for query, folder in zip(queries, classes):
        classDir = os.path.join(datasetFolder, folder)
        print("[🔍 INFO] Removing duplicates in the dataset {}".format(classDir))

        for q in query:
            print("[🔍 INFO] Removing duplicates for the query {}".format(q))

            queryDir = os.path.join(classDir, q)

            # A query folder may be missing (e.g. a download that produced
            # nothing); skip it so one missing folder does not abort the
            # clean-up of all the remaining ones.
            if not os.path.isdir(queryDir):
                print("[⚠️ INFO] Skipping missing folder {}".format(queryDir))
                continue

            search = dif(queryDir, similarity="normal")

            # Each entry of `search.result` is read for its "duplicates" list,
            # which holds the paths of the files to delete.
            for imgKey, match in search.result.items():
                print("[✔️ INFO] Found duplicates for the image {}".format(imgKey))

                for duplicate in match["duplicates"]:
                    # Report the file that was removed (not the image it
                    # duplicates), and only after it has really been deleted.
                    if os.path.exists(duplicate):
                        os.remove(duplicate)
                        print("[🗑️ INFO] Deleted duplicate {}".format(duplicate))
