# Checking the overlap between the datasets

The datasets we created may overlap with one another: an image used for training in one dataset can also be present in the test set of another dataset. This can affect the performance of the model, and therefore we want to be able to quantify this overlap.

### Importing Libraries

In [None]:
from difPy import dif
import os

### Setting up the folder structure used

In this repo the datasets are organised as follows: inside the `datasets` folder, under the subfolder of the current task, there is one subfolder for each of the three sources used (Bing, DuckDuckGo and Google). Inside each of them we can find the training, validation and test set folders, which contain the post-processed images split into one subfolder per class.

In [None]:
from utils.tasks import currentTask

# Root of the datasets for the current task: <working dir>/datasets/<currentTask>
cwd = os.path.join(os.getcwd(), "datasets", currentTask)

# Names used throughout the notebook for the splits, the sources and the classes.
sets = ["test", "train", "val"]
datasets = ["bing", "ddg", "google"]
classes = ["cat", "dog"]

# One root folder per image source.
bingFolder, ddgFolder, googleFolder = (
    os.path.join(cwd, source) for source in ("bing", "ddg", "google")
)

In [None]:
# Defining folders

# Split folders of each source, always in the order test, train, val.
bing_test, bing_train, bing_val = (
    os.path.join(bingFolder, split) for split in ("test", "train", "val")
)
ddg_test, ddg_train, ddg_val = (
    os.path.join(ddgFolder, split) for split in ("test", "train", "val")
)
google_test, google_train, google_val = (
    os.path.join(googleFolder, split) for split in ("test", "train", "val")
)

# Flat list of every split folder, grouped by source (Bing, DuckDuckGo, Google).
all_folders = [
    bing_test, bing_train, bing_val,
    ddg_test, ddg_train, ddg_val,
    google_test, google_train, google_val,
]

In [None]:
# Defining for dog and cat

# "cat" class folder inside every split folder of every source.
cat_bing_test, cat_bing_train, cat_bing_val = (
    os.path.join(split_dir, "cat") for split_dir in (bing_test, bing_train, bing_val)
)
cat_ddg_test, cat_ddg_train, cat_ddg_val = (
    os.path.join(split_dir, "cat") for split_dir in (ddg_test, ddg_train, ddg_val)
)
cat_google_test, cat_google_train, cat_google_val = (
    os.path.join(split_dir, "cat") for split_dir in (google_test, google_train, google_val)
)

# Cat folders grouped by source, each group ordered test, train, val.
bing_cat = [cat_bing_test, cat_bing_train, cat_bing_val]
ddg_cat = [cat_ddg_test, cat_ddg_train, cat_ddg_val]
google_cat = [cat_google_test, cat_google_train, cat_google_val]

# Flat list of the nine cat folders (Bing first, then DuckDuckGo, then Google).
cat_folders = bing_cat + ddg_cat + google_cat

# Same folders kept as one list per source, used to compare sources against each other.
all_cat = [bing_cat, ddg_cat, google_cat]

# "dog" class folder inside every split folder of every source.
dog_bing_test, dog_bing_train, dog_bing_val = (
    os.path.join(split_dir, "dog") for split_dir in (bing_test, bing_train, bing_val)
)
dog_ddg_test, dog_ddg_train, dog_ddg_val = (
    os.path.join(split_dir, "dog") for split_dir in (ddg_test, ddg_train, ddg_val)
)
dog_google_test, dog_google_train, dog_google_val = (
    os.path.join(split_dir, "dog") for split_dir in (google_test, google_train, google_val)
)

# Dog folders grouped by source, each group ordered test, train, val.
bing_dog = [dog_bing_test, dog_bing_train, dog_bing_val]
ddg_dog = [dog_ddg_test, dog_ddg_train, dog_ddg_val]
google_dog = [dog_google_test, dog_google_train, dog_google_val]

# Flat list of the nine dog folders (Bing first, then DuckDuckGo, then Google).
dog_folders = bing_dog + ddg_dog + google_dog

# Same folders kept as one list per source, used to compare sources against each other.
all_dog = [bing_dog, ddg_dog, google_dog]

## Iteratively check every folder

WARNING! Computationally demanding action ahead, handle with care. In order to speed up the computation, we skip the check between folders coming from the same dataset (e.g. we don't check between 'cat_bing_val' and 'cat_bing_train', because we assume that this check has already been performed).

In [None]:
# Check for cats

checked_cat = []    # Keys (folder2 + folder1) of every pair already compared
results_cat = []    # One difPy result per compared pair, in comparison order

for dataset1 in all_cat:
    # Compare only against folders of the *other* sources: folders of the
    # same source are assumed to have been de-duplicated already.
    cat_tmp = all_cat.copy()
    cat_tmp.remove(dataset1)

    for folder1 in dataset1:
        for dataset2 in cat_tmp:
            for folder2 in dataset2:
                check = folder2 + folder1
                # Key that was stored if this pair has already been visited in
                # the opposite direction, i.e. as (folder2, folder1).
                mirrored = folder1 + folder2
                # The comparison is expensive, so each unordered pair must run
                # only once. Testing `check` alone never matches the mirrored
                # visit, which made every pair run twice.
                if check in checked_cat or mirrored in checked_cat:
                    continue
                checked_cat.append(check)
                print("[👁️] Checking between:", folder1, "and", folder2)
                search = dif(folder1, folder2, similarity="normal", px_size=50, sort_output=False, show_output=False, delete=False)
                results_cat.append(search.result)

In [None]:
# Check for dogs

checked_dog = []    # Keys (folder2 + folder1) of every pair already compared
results_dog = []    # One difPy result per compared pair, in comparison order

for dataset1 in all_dog:
    # Compare only against folders of the *other* sources: folders of the
    # same source are assumed to have been de-duplicated already.
    dog_tmp = all_dog.copy()
    dog_tmp.remove(dataset1)

    for folder1 in dataset1:
        for dataset2 in dog_tmp:
            for folder2 in dataset2:
                check = folder2 + folder1
                # Key that was stored if this pair has already been visited in
                # the opposite direction, i.e. as (folder2, folder1).
                mirrored = folder1 + folder2
                # The comparison is expensive, so each unordered pair must run
                # only once. Testing `check` alone never matches the mirrored
                # visit, which made every pair run twice.
                if check in checked_dog or mirrored in checked_dog:
                    continue
                checked_dog.append(check)
                print("[👁️] Checking between:", folder1, "and", folder2)
                search = dif(folder1, folder2, similarity="normal", px_size=50, sort_output=False, show_output=False, delete=False)
                results_dog.append(search.result)