# Datasets Downloader

Download images from three different sources: Bing, DuckDuckGo, and Google.

## Setup

In [1]:
import os
import DuckDuckGoImages as ddg
from google_images_download import google_images_download
from difPy import dif

### Settings

In [None]:
# Root directory under which every per-source dataset folder is placed.
datasetBaseFolder = "../tmp"

# Image classes to collect, and the sources they are collected from.
classes = ["cat", "dog"]
datasets = ["bing", "ddg", "google"]

# One folder per source, each sitting directly under the base folder.
bingFolder, ddgFolder, googleFolder = (
    os.path.join(datasetBaseFolder, source)
    for source in ("bing", "ddg", "google")
)

In [None]:
# Search terms grouped per class: the first group holds the cat queries and
# the second group the dog queries, in the same order as `classes`.
queries = [
    # cat
    [
        "cat", "little cat", "small cat", "calico cat", "cute cat",
        "norwegian forest cat", "black cat", "orange cat", "grey cat", "white cat",
        "fluffy cat", "siamese cat", "tabby cat", "munchkin cat", "persian cat",
        "angora cat", "bengal cat", "chartreux cat", "savannah cat", "ragdoll cat",
    ],
    # dog
    [
        "dog", "little dog", "small dog", "brown dog", "cute dog",
        "big dog", "black dog", "boxer dog", "grey dog", "white dog",
        "german shepherd dog", "golden retriever dog", "labrador dog", "samoyed dog",
        "siberian husky dog", "chihuahua dog", "bulldog", "doberman dog", "pug dog",
        "rottweiler dog",
    ],
]


### Create temporary folders for the downloaded images

In [None]:
# Ensure a <base>/<dataset>/<class> folder exists for every combination,
# reporting only the folders that actually had to be created.
for dataset in datasets:
    for cls in classes:
        clsFolder = os.path.join(datasetBaseFolder, dataset, cls)

        if os.path.exists(clsFolder):
            continue  # already present: nothing to create or report

        print("[📂 CREATED FOLDER] {}".format(clsFolder))
        os.makedirs(clsFolder)


## Downloader

### Bing

In [None]:
print("[💾 BING DOWNLOADER]")

# Each entry of `queries` is the list of search terms for the class at the
# same position in `classes`, so the two are walked in lockstep.
for query, folder in zip(queries, classes):
    print("\n[🗃️ INFO] Downloading images for class {}".format(folder))

    classDir = os.path.join(bingFolder, folder)

    for q in query:
        print("[🔍 INFO] Downloading images for query {}".format(q))

        # Every search term gets its own sub-folder inside the class folder.
        downloadDir = os.path.join(classDir, q)

        # Pass the single term `q` to bbid.py. Formatting the whole `query`
        # list here would put the list's repr on the command line, so every
        # call would search for the same malformed string.
        os.system(
            "python3 bbid.py -s \"{}\" -o \"{}\" --limit 400".format(q, downloadDir))

### DuckDuckGo

In [None]:
print("[💾 DDG DOWNLOADER]")

# Pair every class name with its own list of search terms.
for folder, query in zip(classes, queries):
    print("\n[🗃️ INFO] Downloading images for class {}".format(folder))
    classDir = os.path.join(ddgFolder, folder)

    for q in query:
        print("[🔍 INFO] Downloading images for query {}".format(q))

        # One sub-folder per search term, created on demand before the
        # download call is handed the path.
        downloadDir = os.path.join(classDir, q)
        if not os.path.exists(downloadDir):
            os.makedirs(downloadDir)

        ddg.download(q, folder=downloadDir, parallel=True, max_urls=2000)


### Google

**Disclaimer** To download images from Google with this tool, you must first download [chromedriver](https://chromedriver.chromium.org/downloads) and place it at the path specified below.

In [None]:
# Downloader object provided by the google_images_download package.
response = google_images_download.googleimagesdownload()

# Path of the chromedriver executable (despite the "_dir" name, the value
# points at the .exe itself). A raw string keeps the backslash literal:
# "\c" is not a valid escape sequence and triggers a warning on recent
# Python versions, while the resulting value stays exactly the same.
chromedriver_dir = r"C:\chromedriver.exe"

In [None]:
print("[💾 GOOGLE DOWNLOADER]")

# Pair every class name with its own list of search terms.
for folder, query in zip(classes, queries):
    print("\n[🗃️ INFO] Downloading images for class {}".format(folder))
    classDir = os.path.join(googleFolder, folder)

    # All search terms of the class are sent in a single call, joined into
    # one comma-separated "keywords" value.
    arguments = dict(
        keywords=",".join(query),
        limit=400,
        chromedriver=chromedriver_dir,
        output_directory=classDir,
    )

    paths = response.download(arguments)

## Duplicate removal for queries

In [None]:
print("[🗑️ DUPLICATE REMOVAL]")

# Visit every <base>/<dataset>/<class>/<query> folder and delete the files
# that difPy reports as duplicates of another image in the same folder.
for dataset in datasets:
    print("\n[🗃️ DATASET] {}".format(dataset))
    datasetFolder = os.path.join(datasetBaseFolder, dataset)

    for query, folder in zip(queries, classes):
        classDir = os.path.join(datasetFolder, folder)
        print("[🔍 INFO] Removing duplicates in the dataset {}".format(classDir))

        for q in query:
            print("[🔍 INFO] Removing duplicates for the query {}".format(q))

            queryDir = os.path.join(classDir, q)

            # A query folder may be missing when its download step produced
            # nothing. NOTE(review): dif() presumably cannot scan a folder
            # that does not exist, so skip it rather than abort the whole run.
            if not os.path.isdir(queryDir):
                print("[⚠️ INFO] Skipping missing folder {}".format(queryDir))
                continue

            search = dif(queryDir, similarity="normal")

            for imgKey in search.result:
                print("[✔️ INFO] Found duplicates for the image {}".format(imgKey))

                duplicates = search.result[imgKey]["duplicates"]

                for duplicate in duplicates:
                    # The same file can be listed more than once; only report
                    # a deletion that really happened, and report the deleted
                    # file itself rather than the key of the image it matches.
                    if os.path.exists(duplicate):
                        os.remove(duplicate)
                        print("[🗑️ INFO] Deleted duplicate {}".format(duplicate))
