In [1]:
import os
import cv2
from PIL import Image
import json
import shutil

In [1]:
# Dataset locations (relative paths).
# inputPath:  raw sample folders that are listed and regrouped further down.
# labelsPath: label (mask) folders; their names define the target folder layout.
# outputPath: destination for the regrouped samples, one sub-folder per label folder.
inputPath = r"../../../headsegmentation_dataset_ccncsa/samples/"
labelsPath = r"../../../headsegmentation_dataset_ccncsa/labels"
outputPath = r"../../../headsegmentation_dataset_ccncsa/samplesArranged"

In [22]:
# Sorted names of the entries under labelsPath; reused below as the target folder names.
labelsDist = sorted(os.listdir(labelsPath))

In [23]:
# Show the label folder names collected in labelsDist.
print(labelsDist)

['female03', 'female10', 'female23', 'femalealison1', 'femalealison2', 'femalebarbera', 'femalebarbera2', 'femalecarla', 'femalecarla2', 'femalefelice', 'femalejoyce', 'femalejoyce2', 'femalelaura', 'femalelaura2', 'femaleroberta', 'male01', 'male06_1', 'male06_2', 'male09', 'male23', 'maleandrew', 'maleandrew2', 'malebruce', 'malebruce2', 'malecarlos', 'malecarlos2', 'malecorry', 'malecorry2', 'maleelias', 'maleelias2', 'malegaberial', 'malegaberial2', 'malekumar', 'maleshawn', 'multiperson', 'real']


In [24]:
# Create one output folder per label folder so the samples can be regrouped to
# mirror the label layout. os.makedirs(..., exist_ok=True) also creates
# outputPath itself when it is missing and keeps the cell safe to re-run
# (os.mkdir raised FileExistsError on the second execution).
for folder in labelsDist:
    os.makedirs(os.path.join(outputPath, folder), exist_ok=True)

### Explore and Organize Dataset

In [46]:
# Regroup the raw sample folders under inputPath into the per-label folders
# under outputPath. The target folder is the part of the source folder name
# before the first "_", except for "male06*" (split into "male06_1" and
# "male06_2", the latter for folders containing "nolight") and "real*" (all
# merged into "real"). Files are only copied when the target folder exists.
folderTypes = {"female": [], "male": [], "multiperson": [], "real": []}


def _copyFolderFiles(sourceDir, targetDir):
    """Copy every regular file of sourceDir into targetDir.

    Does nothing when targetDir does not exist. Sub-directories of sourceDir
    are skipped because shutil.copy cannot copy them.
    """
    if not os.path.isdir(targetDir):
        return
    for file in os.listdir(sourceDir):
        sourceFile = os.path.join(sourceDir, file)
        if os.path.isfile(sourceFile):
            shutil.copy(sourceFile, os.path.join(targetDir, file))


for folder in os.listdir(inputPath):
    sourceDir = os.path.join(inputPath, folder)
    # Stray files next to the sample folders cannot be listed; ignore them.
    if not os.path.isdir(sourceDir):
        continue

    baseName = folder.split("_")[0]

    if folder.startswith("female"):
        folderTypes["female"].append(baseName)
        outFolder = baseName

    elif folder.startswith("multiperson"):
        folderTypes["multiperson"].append(baseName)
        outFolder = baseName

    elif folder.startswith("male"):
        folderTypes["male"].append(baseName)
        if folder.startswith("male06"):
            # male06 is stored as two label folders; "nolight" selects the second.
            outFolder = "male06_2" if "nolight" in folder else "male06_1"
        else:
            outFolder = baseName

    elif folder.startswith("real"):
        outFolder = "real"

    else:
        # Not one of the known sample families: nothing to copy.
        continue

    _copyFolderFiles(sourceDir, os.path.join(outputPath, outFolder))


In [49]:
# Report every label folder whose number of entries differs from the number of
# entries in the matching output folder: folder name, label count, sample count.
for folder in labelsDist:
    labelCount = len(os.listdir(os.path.join(labelsPath, folder)))
    sampleCount = len(os.listdir(os.path.join(outputPath, folder)))
    if labelCount != sampleCount:
        print(folder, labelCount, sampleCount)

female23 501 334
femalealison1 501 167
femalealison2 501 167
femalebarbera 501 334
femalecarla 501 334
femalefelice 501 251
femalejoyce 501 334
femalelaura 501 334
male06_1 351 284
male06_2 501 393
male09 351 117
male23 501 167
maleandrew 501 334
malebruce 501 334
malecarlos 501 334
malecarlos2 501 251
malecorry 501 334
maleelias 501 333
malegaberial 501 334
maleshawn 501 334
real 2505 2500


In [None]:
import matplotlib.pyplot as plt

# Build the index of usable samples: every file under outputPath/<folder> that
# has a same-named file under labelsPath/<folder> and whose image size equals
# the mask size. Pairs with differing sizes are printed and left out.
#
# Folder and file names are sorted so the assigned ids are reproducible, label
# listings are read once per folder instead of once per file, and images are
# opened in `with` blocks so their file handles are released immediately.
allSamples = []
_id = 0
for folder in sorted(os.listdir(outputPath)):
    sampleDir = os.path.join(outputPath, folder)
    labelDir = os.path.join(labelsPath, folder)
    # Skip stray files and output folders without a matching label folder.
    if not (os.path.isdir(sampleDir) and os.path.isdir(labelDir)):
        continue

    labelFiles = set(os.listdir(labelDir))
    for file in sorted(os.listdir(sampleDir)):
        if file not in labelFiles:
            continue

        with Image.open(os.path.join(sampleDir, file)) as image:
            with Image.open(os.path.join(labelDir, file)) as mask:
                imageSize, maskSize = image.size, mask.size

        # A plain comparison instead of assert: asserts are stripped under
        # `python -O`, which would silently accept mismatched pairs.
        if imageSize == maskSize:
            allSamples.append({"id": _id, "folder": folder, "Filename": file, "size": imageSize})
            _id += 1
        else:
            print(folder, file, imageSize, maskSize)
    
#     f, axarr = plt.subplots(2,2)
#     axarr[0,0].imshow(image)
#     axarr[0,1].imshow(mask)
#     plt.show()

In [62]:
import pandas as pd

# Persist the validated sample index as FinalSamples.csv. The column names are
# spelled out (same names and order as the dict keys of each sample) rather
# than read from allSamples[0], which raised IndexError when the list was empty.
dataSamples = pd.DataFrame(allSamples, columns=["id", "folder", "Filename", "size"])
dataSamples.to_csv("FinalSamples.csv", index=False)

### Move to dataset folder

In [67]:
# Flatten the validated samples into finalPath/images and finalPath/masks.
# Each file is renamed to "<folder>_<Filename>" so files from different source
# folders cannot collide and the image and its mask share the same name.
finalPath = r"../../faceSegmentation/dataset/"

# shutil.copy does not create missing directories, so make sure both exist.
for subDir in ("images", "masks"):
    os.makedirs(os.path.join(finalPath, subDir), exist_ok=True)

for row, sample in dataSamples.iterrows():
    imagePath = os.path.join(outputPath, sample["folder"], sample["Filename"])
    maskPath = os.path.join(labelsPath, sample["folder"], sample["Filename"])
    flatName = sample["folder"] + "_" + sample["Filename"]

    shutil.copy(imagePath, os.path.join(finalPath, "images", flatName))
    shutil.copy(maskPath, os.path.join(finalPath, "masks", flatName))
    

### Remove near-duplicate images

In [3]:
from imutils import paths
import numpy as np
import argparse
import cv2
import os

In [4]:
def dhash(image, hashSize=8):
    """Return the difference hash of a BGR image as a Python integer.

    The image is converted to grayscale and resized to
    (hashSize + 1) x hashSize pixels (width x height); the extra column
    allows hashSize horizontal comparisons per row. Bit i of the result is
    set when, in row-major order, the i-th pixel is brighter than its left
    neighbour.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    shrunk = cv2.resize(grayscale, (hashSize + 1, hashSize))

    # True where a pixel is brighter than the pixel immediately to its left.
    brighterThanLeft = shrunk[:, 1:] > shrunk[:, :-1]

    # Pack the boolean grid into one integer, one bit per comparison.
    fingerprint = 0
    for bit, isSet in enumerate(brighterThanLeft.flatten()):
        if isSet:
            fingerprint += 2 ** bit
    return fingerprint

In [5]:
# Group the dataset images by their difference hash: hashes maps each hash
# value to the list of image paths that produced it.
print("[INFO] computing image hashes...")
# Sorted so that the first path of every group (the one that is kept when
# duplicates are removed) does not depend on directory listing order.
imagePaths = sorted(paths.list_images("../../dataset/images"))
hashes = {}
# loop over our image paths
for imagePath in imagePaths:
    # load the input image; cv2.imread returns None instead of raising when
    # the file cannot be decoded, which would crash dhash
    image = cv2.imread(imagePath)
    if image is None:
        print("[WARN] could not read {}, skipping".format(imagePath))
        continue
    # compute the hash and add the current image path to the list of paths
    # sharing that hash
    h = dhash(image)
    hashes.setdefault(h, []).append(imagePath)

[INFO] computing image hashes...


In [6]:
# When True, duplicates are deleted from disk; when False (dry run), each group
# of same-hash images is shown side by side instead.
remove = True


def _maskPathFor(imagePath):
    """Return the mask path mirroring imagePath ("images" directory -> "masks").

    Works on path components rather than on the "/images/" substring so it
    also handles Windows separators; the file name itself is never altered.
    """
    parts = os.path.normpath(imagePath).split(os.sep)
    directories = ["masks" if part == "images" else part for part in parts[:-1]]
    return os.sep.join(directories + parts[-1:])


# loop over the image hashes
for (h, hashedPaths) in hashes.items():
    # only hashes shared by more than one image indicate duplicates
    if len(hashedPaths) <= 1:
        continue

    # check to see if this is a dry run
    if not remove:
        # initialize a montage to store all images with the same hash
        montage = None
        # loop over all image paths with the same hash
        for p in hashedPaths:
            # load the input image and resize it to a fixed width and height
            image = cv2.imread(p)
            if image is None:
                # unreadable file: nothing to show for it
                continue
            image = cv2.resize(image, (150, 150))
            # if our montage is None, initialize it
            if montage is None:
                montage = image
            # otherwise, horizontally stack the images
            else:
                montage = np.hstack([montage, image])
        # show the montage for the hash
        print("[INFO] hash: {}".format(h))
        if montage is not None:
            cv2.imshow("Montage", montage)
            cv2.waitKey(0)
    # otherwise, we'll be removing the duplicate images
    else:
        # loop over all image paths with the same hash *except* for the first
        # image in the list (since we want to keep one, and only one, of the
        # duplicate images)
        for p in hashedPaths[1:]:
            maskPath = _maskPathFor(p)
            os.remove(p)
            # A missing mask must not abort the clean-up half-way through.
            if maskPath != p and os.path.exists(maskPath):
                os.remove(maskPath)
            else:
                print("[WARN] no mask found for {}".format(p))

In [7]:
# Number of entries currently in the images folder that was hashed above.
print(len(os.listdir(os.path.join("../../dataset/images"))))

14464


In [27]:
# Root of the full dataset; its "images" and "masks" sub-folders are read below.
allFilesPath = r"../../faceSegmentation/dataset/"

# Root of the reduced subset; files are copied into its "images" and "masks" sub-folders below.
subSamplesPath = r"../../dataset/subsamples/"

### Create a subsample of the dataset for training

In [12]:
# Count, per source folder (the part of the file name before the first "_"),
# the images under allFilesPath that have a mask with the same file name.
# Images without a matching mask are printed.
#
# Masks are looked up by name: zipping two os.listdir() results paired files
# by position, but listing order is arbitrary and differs between folders.
oldFolders = {}

maskFiles = set(os.listdir(os.path.join(allFilesPath, "masks")))
for imageFile in os.listdir(os.path.join(allFilesPath, "images")):
    if imageFile in maskFiles:
        prefix = imageFile.split("_")[0]
        oldFolders[prefix] = oldFolders.get(prefix, 0) + 1
    else:
        print(imageFile)

In [13]:
# Display the per-prefix image counts collected in oldFolders.
oldFolders

{'femalelaura': 334,
 'malecarlos': 334,
 'malecorry2': 501,
 'malebruce': 334,
 'femalecarla': 334,
 'femalebarbera': 334,
 'female03': 501,
 'malecorry': 334,
 'femalejoyce2': 501,
 'femalecarla2': 501,
 'maleelias2': 501,
 'malebruce2': 501,
 'maleandrew': 334,
 'malecarlos2': 251,
 'male01': 501,
 'femaleroberta': 501,
 'maleelias': 333,
 'multiperson': 501,
 'malegaberial2': 250,
 'femalebarbera2': 501,
 'male06': 608,
 'real': 1723,
 'malegaberial': 334,
 'maleshawn': 334,
 'femalelaura2': 501,
 'malekumar': 501,
 'femalejoyce': 334,
 'female23': 334,
 'femalefelice': 251,
 'female10': 501,
 'femalealison2': 167,
 'maleandrew2': 251,
 'femalealison1': 167,
 'male23': 167,
 'male09': 109}

### Add all Real images, up to 50 multiperson images, and up to 10 images from each other type

In [39]:
import random

# Draw the training subset: every "real" image, at most multipersonLimit
# "multiperson" images and at most perTypeLimit images of every other prefix.
# The prefix is the part of the file name before the first "_".
perTypeLimit = 10
multipersonLimit = 50

# Dedicated, seeded generator plus a sorted listing make the draw reproducible
# without touching the global random state.
subsampleRng = random.Random(42)

addedFiles = {k: 0 for k in oldFolders.keys()}

allImages = sorted(os.listdir(os.path.join(allFilesPath, "images")))
subsampleRng.shuffle(allImages)

# shutil.copy does not create missing directories, so make sure both exist.
for subDir in ("images", "masks"):
    os.makedirs(os.path.join(subSamplesPath, subDir), exist_ok=True)

for image in allImages:
    prefix = image.split("_")[0]
    if prefix == "real":
        limit = None  # no cap
    elif prefix == "multiperson":
        limit = multipersonLimit
    else:
        limit = perTypeLimit

    # .get() tolerates prefixes that were not counted in oldFolders.
    taken = addedFiles.get(prefix, 0)
    if limit is not None and taken >= limit:
        continue

    # Never copy an image whose mask is missing: it would be unusable for
    # training and the mask copy would fail after the image was already copied.
    maskSource = os.path.join(allFilesPath, "masks", image)
    if not os.path.isfile(maskSource):
        continue

    addedFiles[prefix] = taken + 1
    shutil.copy(os.path.join(allFilesPath, "images", image), os.path.join(subSamplesPath, "images", image))
    shutil.copy(maskSource, os.path.join(subSamplesPath, "masks", image))

In [40]:
# Display how many images of each prefix were copied into the subset.
addedFiles

{'femalelaura': 10,
 'malecarlos': 10,
 'malecorry2': 10,
 'malebruce': 10,
 'femalecarla': 10,
 'femalebarbera': 10,
 'female03': 10,
 'malecorry': 10,
 'femalejoyce2': 10,
 'femalecarla2': 10,
 'maleelias2': 10,
 'malebruce2': 10,
 'maleandrew': 10,
 'malecarlos2': 10,
 'male01': 10,
 'femaleroberta': 10,
 'maleelias': 10,
 'multiperson': 50,
 'malegaberial2': 10,
 'femalebarbera2': 10,
 'male06': 10,
 'real': 1723,
 'malegaberial': 10,
 'maleshawn': 10,
 'femalelaura2': 10,
 'malekumar': 10,
 'femalejoyce': 10,
 'female23': 10,
 'femalefelice': 10,
 'female10': 10,
 'femalealison2': 10,
 'maleandrew2': 10,
 'femalealison1': 10,
 'male23': 10,
 'male09': 10}

In [43]:
# Count, per source folder (the part of the file name before the first "_"),
# the images under subSamplesPath that have a mask with the same file name.
# Images without a matching mask are printed.
#
# Masks are looked up by name: zipping two os.listdir() results paired files
# by position, but listing order is arbitrary and differs between folders.
newFolders = {}

subsampleImages = os.listdir(os.path.join(subSamplesPath, "images"))
subsampleMasks = set(os.listdir(os.path.join(subSamplesPath, "masks")))
for imageFile in subsampleImages:
    if imageFile in subsampleMasks:
        prefix = imageFile.split("_")[0]
        newFolders[prefix] = newFolders.get(prefix, 0) + 1
    else:
        print(imageFile)


print("NEW DISTRIBUTION OF SAMPLES: ", len(subsampleImages))
print(newFolders)

NEW DISTRIBUTION OF SAMPLES:  2103
{'real': 1723, 'femalecarla2': 10, 'femalecarla': 10, 'male06': 10, 'maleelias': 10, 'multiperson': 50, 'malegaberial2': 10, 'maleandrew': 10, 'malecarlos': 10, 'female03': 10, 'femalebarbera2': 10, 'femalebarbera': 10, 'malekumar': 10, 'malecorry2': 10, 'male23': 10, 'malegaberial': 10, 'femalelaura': 10, 'maleshawn': 10, 'female10': 10, 'male09': 10, 'femalefelice': 10, 'malecorry': 10, 'malecarlos2': 10, 'femalejoyce': 10, 'femalejoyce2': 10, 'malebruce2': 10, 'femalealison1': 10, 'femaleroberta': 10, 'femalelaura2': 10, 'female23': 10, 'male01': 10, 'femalealison2': 10, 'maleandrew2': 10, 'malebruce': 10, 'maleelias2': 10}
