## Full dataset

Splited by classes

In [16]:
import os
import numpy as np

In [32]:
BASE_DIR = os.path.join('.', 'data')
classes = os.listdir(BASE_DIR)
n_classes = len(classes)

In [33]:
print("Base: ", BASE_DIR)
print("Classes are: ", classes)
print("Number of classes: ", n_classes)

Base:  ./data
Classes are:  ['clock', 'telephone', 'chair']
Number of classes:  3


In [34]:
def get_files_number(path):
    counter = 0
    for file in os.listdir(path):
        if os.path.isfile(os.path.join(path, file)):
            counter += 1
    
    return counter

In [35]:
all_path = []

def provide_path(dir_list):
    for i, class_ in enumerate(dir_list):
        path = os.path.join(BASE_DIR, class_)
        all_path.append(path)    

In [36]:
provide_path(classes)

In [37]:
total_dir_images = []

def print_files_data(dir_list):
    for i in range(len(dir_list)):
        n_files = get_files_number(all_path[i])
        total_dir_images.append(n_files)
        print("Total images for", dir_list[i] ,":", n_files)        

In [38]:
print_files_data(classes)

Total images for clock : 1608
Total images for telephone : 1778
Total images for chair : 1818


## Remove redundant images

Keep 1600 images for each classe

In [39]:
from random import sample

TOTAL_IMAGE_NEEDED = 1600
for i, path in enumerate(all_path):
    for file in sample(os.listdir(path), total_dir_images[i] - TOTAL_IMAGE_NEEDED):
        os.remove(os.path.join(path,file))

In [40]:
print_files_data(classes)

Total images for clock : 1600
Total images for telephone : 1600
Total images for chair : 1600


## Train, Test, Validation

In [41]:
def move_files(files, path, dist):
    for file in files:
        full_path = os.path.join(path, file)
        shutil.move(full_path, dist)

In [42]:
def make_class_dirs(path):
    for class_ in classes:
        os.makedirs(os.path.join(path, class_), exist_ok=True)

In [43]:
import shutil
def train_test_val_split(path, test_ratio, train_dist, test_dist, val_dist):
    os.makedirs(train_dist, exist_ok=True)
    os.makedirs(test_dist, exist_ok=True)
    os.makedirs(val_dist, exist_ok=True)
    
    make_class_dirs(train_dist)
    make_class_dirs(test_dist)
    make_class_dirs(val_dist)
    
    class_ = path.split('/')[-1]
    
    all_files = os.listdir(path)
    np.random.shuffle(all_files)
    train_size = int(len(all_files)* (1 - 2*test_ratio))
    test_size = int(len(all_files)* (test_ratio))
    train_files, val_files, test_files,  = np.split(np.array(all_files),
                                       [train_size, test_size+train_size])
    
    move_files(train_files, path, os.path.join(train_dist, class_))
    move_files(test_files, path, os.path.join(test_dist, class_))
    move_files(val_files, path, os.path.join(val_dist, class_))

In [44]:
#path variable is a path to class folder
for path in all_path:
    train_test_val_split(path, 
                         .14,      # split 20% from each class for validation and test
                                   # in this case it has to be 14%
                         os.path.join(BASE_DIR, 'train'),
                         os.path.join(BASE_DIR, 'test'),
                         os.path.join(BASE_DIR, 'val'))

In [45]:
!rm -r data/{chair,telephone,clock}

In [46]:
sets = os.listdir(BASE_DIR)
for set_ in sets:
    counter = 0
    for class_ in classes:
        counter += get_files_number(os.path.join(BASE_DIR, set_, class_))
        
    print("Total images in",set_,":",counter)

Total images in val : 672
Total images in train : 3456
Total images in test : 672
