# Preprocessing

In [None]:
from PIL import Image
import os
import shutil
from difPy import dif

## Settings

In [None]:
# Identifiers of the available raw datasets; each one names a sub-folder of "../tmp".
CAT_DOG = "catDog"
MAN_WOMAN = "manWoman"
BIKE_MOTORBIKE = "bikeMotorbike"

# Raw dataset folder processed by the cells below (swap the identifier to switch).
datasetFolder = "../tmp/{}".format(MAN_WOMAN)

# Size handed to Image.resize for every image.
resizeOutput = (300, 300)

# Extensions accepted by the resizer; compared against the lower-cased file extension.
IMG_EXTENSIONS = (
    '.jpg', '.jpeg', '.png', '.ppm', '.bmp',
    '.pgm', '.tif', '.tiff', '.webp',
)

# Root folder that receives the organised datasets.
finalDatasetFolder = "../datasets"

# Split name -> maximum number of files per class moved into that split.
folderOrganization = dict(train=3500, val=500, test=1000)

## Resizer

In [None]:
# Normalise every file under datasetFolder in place: files whose extension is
# not listed in IMG_EXTENSIONS, or that PIL cannot decode, are deleted; every
# other file is converted to RGB, resized to resizeOutput and stored as JPEG.
for root, _, fnames in sorted(os.walk(datasetFolder, followlinks=True)):
    for fname in sorted(fnames):
        path = os.path.join(root, fname)
        fileName, extension = os.path.splitext(path)

        if extension.lower() not in IMG_EXTENSIONS:
            print("[🗑️ INVALID EXTENSION FOUND] {}".format(path))
            os.remove(path)
            continue

        # Only decoding is guarded: a file PIL cannot read is discarded, but a
        # failure while writing the result must not delete a valid source image.
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt propagate, so interrupting the cell no longer
        # deletes the file that was being processed.
        try:
            # The context manager releases the source handle before the file
            # is overwritten or removed below.
            with Image.open(path) as source:
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
                # filter under its current name.
                img = source.convert("RGB").resize(resizeOutput, Image.LANCZOS)
        except Exception:
            print("[🗑️ INVALID FILE FOUND] {}".format(path))
            os.remove(path)
            continue

        # Choose a target name that does not overwrite a *different* image
        # sharing the same stem (e.g. "a.png" next to "a.jpg").
        target = fileName + ".jpg"
        attempt = 0
        while os.path.exists(target) and not os.path.samefile(path, target):
            attempt += 1
            target = "{}_{}.jpg".format(fileName, attempt)

        print("[💾 SAVING NEW FILE IN JPEG] {}".format(path))
        img.save(target, "JPEG")
        img.close()

        # Remove the original only when it is a different file from the JPEG
        # just written. Comparing files instead of extension strings keeps
        # "x.JPG" safe on case-insensitive file systems, where it is the very
        # file that was just saved as "x.jpg".
        if not os.path.samefile(path, target):
            os.remove(path)


## Folder organization

In [None]:
# Names of the immediate sub-directories of datasetFolder; each is handled as one dataset.
datasets = [
    name
    for name in os.listdir(datasetFolder)
    if os.path.isdir(os.path.join(datasetFolder, name))
]

### Create folders

In [None]:
# Make sure <finalDatasetFolder>/<dataset>/<split> exists for every dataset and
# for every split named in folderOrganization.
for dataset in datasets:
    dataFolder = os.path.join(finalDatasetFolder, dataset)
    for folder in folderOrganization:
        orgFolder = os.path.join(dataFolder, folder)
        # exist_ok=True keeps the cell re-runnable without a separate
        # check-then-create step.
        os.makedirs(orgFolder, exist_ok=True)

### Extract files

In [None]:
# Flatten <datasetFolder>/<dataset>/<class>/<query>/* into <class>/ and delete
# the emptied query folders.
print("[🛩️ EXTRACTING FILES FROM QUERIES FOLDERS]")

for dataset in datasets:
    print("\n" + "-" * 15)
    print("[🗃️ DATASET] {}".format(dataset))
    dataFolder = os.path.join(datasetFolder, dataset)

    for cls in [x for x in os.listdir(dataFolder) if os.path.isdir(os.path.join(dataFolder, x))]:
        print("\n[🧮 CLASS] {}".format(cls))
        clsFolder = os.path.join(dataFolder, cls)

        for query in [x for x in os.listdir(clsFolder) if os.path.isdir(os.path.join(clsFolder, x))]:
            queryFolder = os.path.join(clsFolder, query)

            for file in os.listdir(queryFolder):
                path = os.path.join(queryFolder, file)

                # Different query folders may contain files with the same name.
                # Pick a free name in the class folder instead of letting the
                # move fail and the file be deleted.
                stem, suffix = os.path.splitext(file)
                destination = os.path.join(clsFolder, file)
                attempt = 0
                while os.path.lexists(destination):
                    attempt += 1
                    destination = os.path.join(
                        clsFolder, "{}_{}{}".format(stem, attempt, suffix))

                try:
                    shutil.move(path, destination)
                except OSError as error:
                    # Best effort, as before: a file that cannot be moved is
                    # dropped so the query folder can be removed - but the
                    # loss is now reported. shutil.Error is an OSError subclass.
                    print("[🗑️ UNMOVABLE FILE REMOVED] {} ({})".format(path, error))
                    os.remove(path)

            # os.rmdir removes only the query folder; os.removedirs would also
            # prune now-empty parent folders that later cells still list.
            os.rmdir(queryFolder)

### Duplicate removal

In [None]:
# Run difPy on every class folder and delete the files it reports as duplicates.
print("[🗑️ REMOVING DUPLICATES]")

for dataset in datasets:
    print("\n" + "-" * 15)
    print("[🗃️ DATASET] {}".format(dataset))
    dataFolder = os.path.join(datasetFolder, dataset)

    # Only directories are classes (same filter as the extraction cell); a
    # stray file in the dataset folder must not be handed to dif().
    for cls in [x for x in os.listdir(dataFolder) if os.path.isdir(os.path.join(dataFolder, x))]:
        print("\n[🧮 CLASS] {}".format(cls))
        clsFolder = os.path.join(dataFolder, cls)

        print("[🔍 INFO] Removing duplicates in the dataset {}".format(clsFolder))

        # NOTE(review): search.result is used below as a mapping whose entries
        # carry a "duplicates" list of file paths - confirm against the
        # installed difPy version.
        search = dif(clsFolder, similarity="normal")

        for imgKey in search.result:
            print("[✔️ INFO] Found duplicates for the image {}".format(imgKey))

            duplicates = search.result[imgKey]["duplicates"]

            for duplicate in duplicates:
                # Presumably the same path can be listed under several keys;
                # only report a deletion that actually happened.
                if os.path.exists(duplicate):
                    os.remove(duplicate)
                    print("[🗑️ INFO] Deleted duplicate {}".format(duplicate))

### Move files in folders

In [None]:
# Distribute the cleaned images of every class over the splits named in
# folderOrganization: <finalDatasetFolder>/<dataset>/<split>/<class>/.
print("[🛩️ CREATING FINAL DATASETS]")

for dataset in datasets:
    dataFolder = os.path.join(datasetFolder, dataset)
    finalDataFolder = os.path.join(finalDatasetFolder, dataset)

    print("\n" + "-" * 15)
    print("[🗃️ DATASET] {}".format(dataset))

    # Only directories are classes (same filter as the extraction cell). The
    # listing is computed once per dataset since moving files out of the class
    # folders does not change it.
    classes = [x for x in os.listdir(dataFolder)
                  if os.path.isdir(os.path.join(dataFolder, x))]

    for folder in folderOrganization:
        print("\n[📂 FOLDER] {}".format(folder))
        folderFolder = os.path.join(finalDataFolder, folder)
        quota = folderOrganization[folder]

        for cls in classes:
            print("[🧮 CLASS] {}".format(cls))
            clsFolder = os.path.join(dataFolder, cls)
            finalClsFolder = os.path.join(folderFolder, cls)

            os.makedirs(finalClsFolder, exist_ok=True)

            # Files still left in the class folder; earlier splits have already
            # moved their share out. NOTE: os.listdir order is arbitrary, so
            # which files end up in which split is not reproducible.
            files = [x for x in os.listdir(clsFolder)
                        if os.path.isfile(os.path.join(clsFolder, x))]
            filesToMove = files[:quota]

            # Make a shortfall visible instead of silently producing a smaller
            # (or empty) split.
            if len(filesToMove) < quota:
                print("[⚠️ WARNING] Only {} of {} files available for {}/{}".format(
                    len(filesToMove), quota, folder, cls))

            for fileToMove in filesToMove:
                shutil.move(os.path.join(clsFolder, fileToMove), os.path.join(finalClsFolder, fileToMove))
