## Save Preprocessed Data


This notebook preprocesses and applies random crops to the data before saving each image. 

TODO: Add original image name to the .csv file

In [30]:
import numpy as np
import skimage.io
import torch
from torchvision.transforms import ToTensor, Lambda, RandomCrop
from PIL import Image
import pandas as pd
import torchvision.transforms as T
import matplotlib.pyplot as plt
from scipy import stats

from scipy.ndimage import gaussian_filter
from scipy.ndimage.measurements import center_of_mass

In [31]:
### FOR PREPROCESSING ####

In [32]:
def set_rand_seed(seed=10):
    '''
    Creates random seed for each library so that randomness is repeatable. Initialized first to set all randomness
    '''
    np.random.seed(seed)
    torch.manual_seed(seed)

def save(sample, mask, i, sample_name, sample_address, cropped):
    """
    Saves file in sim_data folder with iterating number to correspond to number of samples desired as pytorch tensors
    This file can deal with either cropped or original images, and it will sort for each one.
    In our case, cropped means that it had the n number of random crops, and they are all saved
    in a list together. 
    """

    if cropped:
        #All randomly cropped images will come in a list. This parses through the list and 
        #deals with each new image
        for j in range (len(sample)):
            
            #Converting to tensors
            input_tensor = torch.tensor(sample[j])
            mask_tensor = torch.tensor(mask[j])
            
            #Saving each image
            file_name = f"/nsls2/users/maire1/unet/data/preprocessed_data/img{i}.pt"
            torch.save({"input": input_tensor, "target": mask_tensor}, file_name)
            
            #Adding each name and address to be ultimately saved in .csv file
            sample_name.append(f"img{i}.pt")
            sample_address.append(file_name)

    else:
        #Convert to tensor
        input_tensor = torch.Tensor(sample)
        mask_tensor = torch.Tensor(mask)
        
        #Saving each image
        file_name = f"/nsls2/users/maire1/unet/data/preprocessed_data/img{i}.pt"
        torch.save({"input": input_tensor, "target": mask_tensor}, file_name)
        
        #Adding each name and address to be ultimately saved in .csv file
        sample_name.append(f"img{i}.pt")
        sample_address.append(file_name)

    return(sample_name, sample_address)

def save_total_data(name, address, cropped):
    """
    Saving titles and sample addresses into a separate csv file for use in the neural network.
    """

    #All randomly cropped images will come in a list. This parses through the list and 
    #deals with each new image
    d = {'sample': name, 'address': address}
    filename = '/nsls2/users/maire1/unet/data/preprocessed_data/img_address.csv'

    #Convert to pandas dataframe and save
    df = pd.DataFrame(data=d)
    df.to_csv(filename, index = False)
    print("All samples completed. Data saved.")

In [33]:
def crop(orig_img, target, sample_num):
    '''
    Applies random crop to the original image according to a set size
    Outputs: crops and mask crops
    '''
    set_rand_seed()
    
    #convert to tensors
    pt_img = torch.tensor(orig_img)
    pt_target = torch.tensor(target)
    cropper = T.RandomCrop(size=(256, 256))
    
    #ERROR HERE - TODO
    crops = [cropper(pt_img) for j in range(sample_num)]
    
    set_rand_seed() #Need to set random seed twice so that the mask and input have the same crop
    mask_crops = [cropper(pt_target) for j in range(sample_num)]
    #plot(pt_img, crops)
    #plot(pt_target, mask_crops)
    return crops, mask_crops

def plot(orig_img, imgs, with_orig=True, row_title=None, **imshow_kwargs):
    '''This function allows for us to be able to view each image'''
    if not isinstance(imgs[0], list):
        # Make a 2d grid even if there's just 1 row
        imgs = [imgs]

    num_rows = len(imgs)
    num_cols = len(imgs[0]) + with_orig
    fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False)
    for row_idx, row in enumerate(imgs):
        row = [orig_img] + row if with_orig else row
        for col_idx, img in enumerate(row):
            ax = axs[row_idx, col_idx]
            ax.imshow(np.asarray(img), **imshow_kwargs)
            ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

    if with_orig:
        axs[0, 0].set(title='Original image')
        axs[0, 0].title.set_size(8)
    if row_title is not None:
        for row_idx in range(num_rows):
            axs[row_idx, 0].set(ylabel=row_title[row_idx])

    plt.tight_layout()
    


In [45]:
def resize(x, n):
    xr = np.zeros((x.shape[0]//n,x.shape[1]//n ))
    for j in range(0, x.shape[0]-n, n):
        for i in range(0, x.shape[1]-n, n):
            xr[j//n, i//n] = np.nanmean(x[j:j+n, i:i+n])
    return xr

# I haven't checked it yet. But the idea should be to selec the nearest label
# that is not a background
def resize_target(x, n): 
    xr = np.zeros((x.shape[0]//n,x.shape[1]//n ))
    for j in range(0, x.shape[0]-n, n):
        for i in range(0, x.shape[1]-n, n):
            xr[j//n, i//n] = np.nanmax(x[j:j+n, i:i+n])
    return xr

def transform(img, target):    
    #Most of our tansforms should only be applied to the input image, not the target. 
    #print(img.shape)
    if target:
        y = resize(img, 2)
    else:
        y = resize(img, 2)
        y_smooth = gaussian_filter(y, sigma=2)
        
        #ADDED from another place. May need to change order of these
        y_log = np.log(y_smooth + 0.5)
        y_log = np.nan_to_num(y_log)
        y_log = np.where(np.abs(y_log) > 1000, 0, y_log)
        min_val = np.min(y_log.flatten())
        box_cox, l = stats.boxcox(y_log.flatten()+np.abs(min_val)+1)
        
        #This may also need to be changed to account for additional preprocessing methods
        y = (y - y_smooth.min())/(y_smooth.max() - y_smooth.min())
    return y

In [46]:
#Opening file with list of addresses for data
datafile = '/nsls2/users/maire1/unet/data/cropped_data/data_address.csv'
file = pd.read_csv(datafile)
partition = file['address']# IDs
labels = file['sample']# Labels
#name = file['original file name']
sample_num = 10

#For creating csv file
#sample_name = []
cropped_sample_name = []
#sample_address = []
cropped_sample_address = []
#orig_name = []

size = (256,256)
j = 0
for i in labels:
    
    data = torch.load(f'/nsls2/users/maire1/unet/data/cropped_data/{i}')
    #data = torch.load(i)
    img = data['input'].numpy()
    img_mask = data['target'].numpy()
    img = transform(img, target=False)
    img_mask = transform(img_mask, target=True)
#     plt.imshow(img)
#     plt.show()

    #orig_img = img['input']
    #target = img['target']
    
    #sample_name, sample_address = save(img, img_mask, j, sample_name, sample_address, False)
    crops, mask_crops = crop(img, img_mask, sample_num)
    cropped_sample_name, cropped_sample_address = save(crops, mask_crops, j, cropped_sample_name, cropped_sample_address, True)
    j += 1
    
#save_total_data(sample_name, sample_address, orig_name, False)
save_total_data(cropped_sample_name, cropped_sample_address, orig_name, True)

  input_tensor = torch.tensor(sample[j])
  mask_tensor = torch.tensor(mask[j])


RuntimeError: random_ expects 'from' to be less than 'to', but got from=0 >= to=0