## Modifications by Stephen to handle all of the data and data splits

In [1]:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import torchvision
from torchvision import datasets, models, transforms, utils
import matplotlib.pyplot as plt
import time
import copy
import os

##added by Oskar
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

plt.ion()   # interactive mode

%matplotlib inline

In [2]:
## Without use of Data loader:

class Scale(object):
    """Rescale a PIL image to a target size.

    size: either
        * an int -- the shorter edge of the image is resized to ``size`` and
          the longer edge is scaled by the same factor (truncated to an int),
          so the aspect ratio is preserved. For example, if height > width
          the result has PIL size ``(size, int(size * height / width))``,
          i.e. (width, height) order.
        * a 2-element iterable ``(width, height)`` -- the image is resized to
          exactly that size.
    interpolation: resampling filter handed to ``img.resize``.
        Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        # Imported here because the file never imports ``collections``; the
        # original check therefore raised NameError for every non-int size.
        # ``Iterable`` must come from ``collections.abc`` (the bare
        # ``collections.Iterable`` alias no longer exists on Python >= 3.10).
        import collections.abc

        assert isinstance(size, int) or (
            isinstance(size, collections.abc.Iterable) and len(size) == 2
        ), "size must be an int or a 2-element (width, height) iterable"
        self.size = size
        self.interpolation = interpolation

    def __call__(self, img):
        """Return ``img`` rescaled according to ``self.size``.

        The input image is returned unchanged (same object) when ``size`` is
        an int and the shorter edge already equals it.
        """
        # Explicit (width, height): resize to exactly that size.
        if not isinstance(self.size, int):
            return img.resize(self.size, self.interpolation)

        w, h = img.size
        # Shorter edge already matches: nothing to do.
        if min(w, h) == self.size:
            return img

        if w < h:
            # Portrait: width is the shorter edge.
            ow = self.size
            oh = int(self.size * h / w)
        else:
            # Landscape or square: height is the shorter (or equal) edge.
            oh = self.size
            ow = int(self.size * w / h)
        return img.resize((ow, oh), self.interpolation)



In [3]:
# Split data, but only samples that are from the original dataset
# (files whose name contains 'additional' are not split; the code below
# copies all of them into the training set).

# Switch guarding the whole cell: the split is only (re)created on disk
# when this is set to True.
SPLIT_DATA = False
if SPLIT_DATA:

    import os
    from shutil import copyfile

    # Define parameters of the split
    # Fractions of each class's non-'additional' files assigned to the
    # train / val / test splits respectively.
    prop_train = 0.7
    prop_val = 0.2
    prop_test = 0.1
    
    def get_indices(num_samples, prop_train = 0.7, prop_dev = 0.2, prop_test = 0.1):
        """Randomly partition ``range(num_samples)`` into train/dev/test indices.

        num_samples: number of samples to split.
        prop_train, prop_dev, prop_test: fractions of the data for each
            split; they must sum to 1 (within 1e-5).

        Returns a tuple ``(train_indices, dev_indices, test_indices)`` of
        disjoint lists that together contain every index exactly once. The
        train and dev sizes are rounded down; the test split receives all
        remaining indices, so it absorbs any rounding leftovers.

        Raises AssertionError if the proportions do not sum to 1.

        Note: NumPy's global random generator is re-seeded with 123 on every
        call, so the same ``num_samples`` always gives the same split.
        """
        # abs() is required: without it any proportions summing to *less*
        # than 1 (e.g. 0.3 + 0.1 + 0.1) passed the check silently.
        assert abs(prop_train + prop_dev + prop_test - 1.0) < 1e-5, \
            "prop_train + prop_dev + prop_test must sum to 1"

        num_train_samples = int(np.floor(num_samples * prop_train))
        num_dev_samples = int(np.floor(num_samples * prop_dev))

        indices = list(range(num_samples))
        np.random.seed(123)
        np.random.shuffle(indices)

        dev_end = num_train_samples + num_dev_samples
        train_indices = indices[:num_train_samples]
        dev_indices = indices[num_train_samples:dev_end]
        # Everything left over goes to test.
        test_indices = indices[dev_end:]
        return train_indices, dev_indices, test_indices

    def maybe_makedir(dirname):
        """Create ``dirname`` (and any missing parents) unless the path already exists."""
        if os.path.exists(dirname):
            return
        os.makedirs(dirname)

    # Define directories
    root_dir = '../../data/all_data'
    # The per-class source folders are read directly from root_dir.
    train_path = root_dir
    split_path = os.path.join(root_dir, 'split_new')

    # Set the random seed
    # NOTE(review): get_indices() re-seeds NumPy's global generator on every
    # call, so this seed does not appear to influence the split -- confirm
    # before relying on it.
    np.random.seed(456)

    # Create split path if it does not exist
    maybe_makedir(split_path)

    class_names = ('Type_1', 'Type_2', 'Type_3')
    for class_name in class_names:

        # Target location of this class inside each split:
        # <split_path>/<train|val|test>/<class_name>
        split_dirs = {
            split_name: os.path.join(split_path, split_name, class_name)
            for split_name in ('train', 'val', 'test')
        }

        # Make directories if they dont already exist
        for split_dir in split_dirs.values():
            maybe_makedir(split_dir)

        # Directory where original training data are stored
        train_class_path = os.path.join(train_path, class_name)

        # Get list of files in train directory.
        # os.listdir() returns names in arbitrary order; sorting makes the
        # seeded shuffle select the same files on every machine and run.
        train_files = sorted(os.listdir(train_class_path))

        # Only the original samples are split; 'additional' samples are
        # kept apart and all end up in the training set.
        train_files_clean = [k for k in train_files if 'additional' not in k]
        train_files_additional = [k for k in train_files if 'additional' in k]

        train_indices, val_indices, test_indices = get_indices(len(train_files_clean), prop_train, prop_val, prop_test)
        print((len(train_indices), len(val_indices), len(test_indices), len(train_files)))

        # File names destined for each split, in copy order.
        copy_plan = (
            ('train', [train_files_clean[i] for i in train_indices] + train_files_additional),
            ('val', [train_files_clean[i] for i in val_indices]),
            ('test', [train_files_clean[i] for i in test_indices]),
        )

        # Copy the files to their split directory
        for split_name, file_names in copy_plan:
            for file_name in file_names:
                src_path = os.path.join(train_class_path, file_name)
                dest_path = os.path.join(split_dirs[split_name], file_name)
                copyfile(src_path, dest_path)

In [4]:
# test_folder = False
# if test_folder:
#     transform_scale = Scale(224)
# else:
#     transform_scale = Scale(256)

In [7]:
#For splits that have already been defined
# Rescales every .jpg found under load_path/<split>/<class>/ and writes the
# result, under the same file name, to save_path/<split>/<class>/.
home = os.path.expanduser('~')

load_path = home + '/data/all_data/split_new/'
save_path = home + '/data/all_data_scaled_alt/'

# Trailing slashes are intentional: the paths below are built by plain
# string concatenation.
classes = ['Type_1/', 'Type_2/', 'Type_3/']
splits = ['train/', 'val/', 'test/']

# Every split is rescaled the same way (shorter edge -> 299 px), so a single
# transform is shared by all of them.
transform_scale = Scale(299)

for split in splits:
    for c in classes:
        load_path_class = load_path + split + c
        save_path_class = save_path + split + c

        # Make directories (makedirs also creates the intermediate
        # save_path/<split> directory).
        os.makedirs(save_path_class, exist_ok=True)

        for filename in os.listdir(load_path_class):
            if not filename.endswith(".jpg"):
                continue
            # Context manager closes the source file once the image has been
            # processed. The save stays inside the block because the
            # transform may return the very same image object.
            with Image.open(load_path_class + filename) as im:
                img = transform_scale(im)
                img.save(save_path_class + filename)
