In [1]:
import utils
import os
import numpy as np
import shutil
import math
import random

In [2]:
def create_dataset(in_paths, labels, out_paths, ratios, seed):
    """
    Split labelled folders of files into training, validation and test sets.

    Every input folder is one class. The files directly inside it are shuffled
    with `seed` and copied (the originals stay in place) into
    <out_path>/<label>/ for each of the three output folders.

        inputs: in_paths  ---> list of input folders, where every folder corresponds to a different label
                labels    ---> list of label names; labels[i] names the sub-folder created for in_paths[i]
                out_paths ---> training, validation and test folders, in this order
                ratios    ---> what fraction of each class goes into training, validation and testing
                seed      ---> seed used for randomization

        returns: None. Problems with the arguments are reported with print()
                 and nothing is written in that case.

    Notes:
        - Existing output folders are replaced: they are removed through
          utils.delete_folder and created again.
        - Training and validation sizes are floor(ratio * n_files); every
          remaining file goes to the test folder, also when the ratios sum
          to less than 1.
        - Files inside sub-folders of an input folder are ignored.
    """
    if len(ratios) != 3 or len(out_paths) != 3:
        print("Output configuration is wrong")
        return

    if any(ratio < 0 for ratio in ratios):
        print("Ratios must not be negative")
        return

    # A small tolerance keeps ratios that add up to 1 on paper from being
    # rejected because of floating-point round-off.
    if math.fsum(ratios) > 1.0 + 1e-9:
        print("Sum of ratios must not exceed 1")
        return

    # Checked before anything is deleted: a missing label would otherwise
    # surface as an IndexError halfway through the copy.
    if len(labels) < len(in_paths):
        print("Every input path needs a label")
        return

    for out_path in out_paths:
        if not os.path.exists(out_path):
            os.makedirs(out_path)
            print(f"Created folder {out_path}")
        else:
            utils.delete_folder(out_path)
            os.makedirs(out_path)
            print(f"Substituted folder {out_path}")

    for in_path, label in zip(in_paths, labels):
        # Only the first os.walk() entry (the folder itself) is used. Walking
        # into sub-folders would wipe the label folders filled so far and look
        # for nested files directly under in_path, where they do not exist.
        top_level = next(os.walk(in_path, topdown=True), None)
        if top_level is None:
            print(f"Input folder {in_path} cannot be read, skipping label {label}")
            continue

        # Directory listings come back in arbitrary order; sorting first makes
        # the split depend on the seed only.
        shuffled = sorted(top_level[2])
        random.Random(seed).shuffle(shuffled)

        # number of elements in each split (the test split takes the rest)
        n_train = math.floor(ratios[0] * len(shuffled))
        n_valid = math.floor(ratios[1] * len(shuffled))
        splits = [
            shuffled[:n_train],
            shuffled[n_train:n_train + n_valid],
            shuffled[n_train + n_valid:],
        ]

        for out_path, split_files in zip(out_paths, splits):
            # The trailing "" keeps the closing separator the folder helpers
            # have always been called with.
            label_dir = os.path.join(out_path, label, "")

            # The output folders were emptied above, so the label folder can
            # only exist already if an earlier input wrote to the same place
            # (e.g. a repeated label); the later input replaces it.
            if os.path.exists(label_dir):
                utils.delete_folder(label_dir)
            os.makedirs(label_dir)

            for filename in split_files:
                shutil.copyfile(os.path.join(in_path, filename),
                                os.path.join(label_dir, filename))

- Create binary dataset

In [3]:
# Two-class dataset: one source folder per class, listed in the same order as
# `labels` so that in_paths[i] is copied under the sub-folder labels[i].
labels = ["c", "w"]
right_segments = "/home/lorenzo/Documents/programming/mpai/manipulated-images/dataset-multiple/c/"
wrong_segments = "/home/lorenzo/Documents/programming/mpai/manipulated-images/dataset-multiple/w/"
in_paths = [right_segments, wrong_segments]

# Destination folders; create_dataset replaces them if they already exist.
train_path, validation_path, test_path = (
    "../training-set/",
    "../validation-set/",
    "../test-set/",
)

# 70% training, 20% validation, remaining files to test, with a fixed seed.
create_dataset(
    in_paths=in_paths,
    labels=labels,
    out_paths=[train_path, validation_path, test_path],
    ratios=[0.7, 0.2, 0.1],
    seed=2023,
)

Created folder ../training-set/
Created folder ../validation-set/
Created folder ../test-set/


- Create dataset with four classes

In [27]:
# Build a four-class dataset: sort the files found under `wrong_segments` into
# one folder per class according to a marker in the file name, then split the
# result into training, validation and test sets.
# NOTE(review): the source folder is "dataset-nooverlap" while the target is
# named "...-multiple" - confirm this is the intended source.
wrong_segments = "/home/lorenzo/Documents/programming/mpai/manipulated-images/dataset-nooverlap/w/"
new_dataset_path = "../dataset-four-classes-multiple/"

labels = ['double', 'half', 'quadruple', 'quarter']

utils.create_folder(new_dataset_path, overwrite=True)

in_paths = []

# one (initially empty) folder per class; these folders are the inputs of create_dataset below
for label in labels:
    utils.create_folder(new_dataset_path + label + "/", overwrite=True)
    in_paths.append(new_dataset_path + label + "/")

# File-name marker -> class folder. Markers are tested in this order and the
# first one contained in the file name wins.
speed_change_labels = [
    ("3,75ips_to15ips", 'quadruple'),
    ("15ips_to3,75ips", 'quarter'),
    ("3,75ips_to7,5ips", 'double'),
    ("7,5ips_to15ips", 'double'),
    ("15ips_to7,5ips", 'half'),
    ("7,5ips_to3,75ips", 'half'),
]

# add spectrograms to subdirectories depending on the name of the file
for (root, dirs, files) in os.walk(wrong_segments, topdown=True):
    for filename in files:
        for marker, target_label in speed_change_labels:
            if marker in filename:
                # Join with `root`, not `wrong_segments`: os.walk also visits
                # sub-folders, and their files do not live directly under
                # `wrong_segments`. Files are flattened into the class folder,
                # so equal names from different sub-folders overwrite each other.
                shutil.copyfile(os.path.join(root, filename),
                                new_dataset_path + target_label + "/" + filename)
                break
        # files whose name contains none of the markers are left out


train_path = "../training-set/"
validation_path = "../validation-set/"
test_path = "../test-set/"

create_dataset(in_paths=in_paths, labels=labels, out_paths=[train_path, validation_path, test_path], ratios=[0.7, 0.2, 0.1], seed=2023)

Created folder ../dataset-four-classes-multiple/
Created folder ../dataset-four-classes-multiple/double/
Created folder ../dataset-four-classes-multiple/half/
Created folder ../dataset-four-classes-multiple/quadruple/
Created folder ../dataset-four-classes-multiple/quarter/
Created folder ../training-set/
Created folder ../validation-set/
Created folder ../test-set/
