# Data processing

In [105]:
#|code-fold: true
#|output: false
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import numpy as np
from pathlib import Path
import torch
import torch.optim as optim
import torch.nn as nn
import platform
from PIL import Image
import datetime
import matplotlib.pyplot as plt
from matplotlib import cm
from torch.utils.data import DataLoader, Dataset, random_split, WeightedRandomSampler, SubsetRandomSampler
from torchvision.transforms import Compose, ToTensor, Normalize, ToPILImage, RandomHorizontalFlip, Resize

from step_by_step import StepByStep
plt.style.use('fivethirtyeight')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


We have accumulated labels in the `data/all_labels` folder. We need to:
- load all of them into NCHW tensors,
- split them into train and valid (test labels will be separate),
- create temporary Datasets and a normalizer,
- create the real Datasets and DataLoaders.

## Generate tensors

Let's first create `x_tensor` and `y_tensor` from the files in `all_labels`:

In [62]:
# Everything is located relative to the parent of the current working directory.
proj_dir = Path('.').resolve().parent
data_dir = proj_dir / 'data'

# Images and masks are read from two sibling folders.
train_imgs_dir = data_dir / 'all_labels/type_1/imgs'
train_masks_dir = data_dir / 'all_labels/type_1/masks'

# sorted() consumes the glob generator directly and returns a list ordered by path.
image_paths = sorted(train_imgs_dir.glob('*.png'))
mask_paths = sorted(train_masks_dir.glob('*.png'))

In [63]:
# Sanity check: how many image files the glob found.
len(image_paths)

162

To go from a path to a tensor, use `torchvision.transforms.ToTensor()`:

In [91]:
# Open the first image and convert it, then inspect the resulting tensor shape.
first_image = Image.open(image_paths[0])
image_tensor = ToTensor()(first_image)
image_tensor.shape

torch.Size([3, 256, 256])

To stack many images, use `torch.stack` (not the most memory-efficient approach, but it's OK here):

In [88]:
# Load every image/mask pair into tensors, then stack each list into one batch tensor.
tensorizer = ToTensor()
image_tensors = []
mask_tensors = []

# NOTE(review): zip() pairs files purely by sorted position and stops at the shorter
# list, so a missing or extra file is dropped silently and could shift the pairing.
# Confirm that both folders contain the same number of files with matching names.
for image_path, mask_path in zip(image_paths, mask_paths):
    # The context manager closes each file once it has been converted to a tensor.
    with Image.open(image_path) as image_file:
        image_tensor = tensorizer(image_file)
    # The mask must be read from mask_path, not from image_path.
    with Image.open(mask_path) as mask_file:
        mask_tensor = tensorizer(mask_file)

    image_tensors.append(image_tensor)
    mask_tensors.append(mask_tensor)

# torch.stack adds a new leading dimension with one entry per loaded file.
x_tensor = torch.stack(image_tensors)
y_tensor = torch.stack(mask_tensors)

In [90]:
# Inspect the shape of the stacked image tensor.
x_tensor.shape

torch.Size([161, 3, 256, 256])

## Split into train and valid

We'll use `torch.utils.data.random_split`:

In [94]:
# Seed the global RNG first so the random split is the same on every run.
torch.manual_seed(13)

# 80% of the samples go to training; the remainder is kept for validation.
N = x_tensor.shape[0]
n_train = int(0.8 * N)
n_val = N - n_train

# Only the index lists of the two subsets are needed; they are reused to slice
# the image and mask tensors consistently.
train_subset, val_subset = random_split(x_tensor, [n_train, n_val])
train_idx, val_idx = train_subset.indices, val_subset.indices

print(train_idx, val_idx, sep='\n')

[115, 113, 147, 107, 116, 139, 17, 108, 128, 3, 148, 137, 143, 135, 62, 2, 33, 118, 42, 111, 34, 144, 1, 41, 5, 28, 106, 72, 69, 51, 124, 50, 103, 29, 157, 79, 67, 45, 80, 15, 37, 43, 76, 100, 0, 8, 89, 81, 95, 75, 159, 150, 20, 145, 25, 102, 54, 88, 141, 60, 120, 23, 112, 52, 85, 10, 53, 7, 155, 49, 73, 83, 68, 39, 4, 27, 47, 9, 142, 12, 55, 19, 133, 160, 110, 154, 87, 152, 97, 138, 117, 74, 71, 130, 101, 30, 22, 134, 91, 64, 156, 61, 70, 122, 35, 11, 26, 94, 149, 66, 136, 93, 59, 92, 18, 132, 123, 46, 48, 21, 36, 114, 158, 57, 38, 105, 127, 125]
[119, 31, 78, 90, 65, 99, 82, 126, 40, 77, 32, 16, 44, 109, 104, 146, 140, 6, 98, 14, 129, 151, 58, 63, 131, 96, 24, 56, 86, 84, 121, 13, 153]


In [114]:
# Slice images and masks with the same index lists so each image stays paired with its mask.
x_train_tensor, y_train_tensor = x_tensor[train_idx], y_tensor[train_idx]
x_val_tensor, y_val_tensor = x_tensor[val_idx], y_tensor[val_idx]

In [115]:
# Show the sizes of the training and validation splits, one per line.
print(x_train_tensor.shape, x_val_tensor.shape, sep='\n')

torch.Size([128, 3, 256, 256])
torch.Size([33, 3, 256, 256])


## Temporary Datasets

A very simple `Dataset` that applies an optional transform to the inputs:

In [116]:
class TransformedTensorDataset(Dataset):
    """Dataset over paired ``x``/``y`` containers with an optional transform on ``x``.

    Args:
        x: Indexable collection of inputs; its length defines the dataset size.
        y: Indexable collection of targets, looked up with the same index as ``x``.
        transform: Optional callable applied to each input item only; targets
            are returned unchanged.
    """

    def __init__(self, x, y, transform=None):
        self.x, self.y = x, y
        self.transform = transform

    def __len__(self):
        """Return the number of items in ``x``."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        sample = self.x[index]

        # Without a transform the raw input is returned as-is.
        if not self.transform:
            return sample, self.y[index]

        return self.transform(sample), self.y[index]

Let's first create a **temporary** `Dataset` to extract the normalization parameters:

In [117]:
# Un-normalized view of the training split, used only to derive the normalizer.
temp_dataset = TransformedTensorDataset(x=x_train_tensor, y=y_train_tensor)
temp_loader = DataLoader(dataset=temp_dataset, batch_size=32)

# Only training data is passed in, so validation samples do not influence the result.
# make_normalizer is defined in step_by_step; see the displayed value for what it returns.
normalizer = StepByStep.make_normalizer(temp_loader)
normalizer

Normalize(mean=tensor([0.1960, 0.2124, 0.1632]), std=tensor([0.1087, 0.1089, 0.1102]))

## Real Datasets and Loaders

Let's now create **real** `Datasets` and `DataLoaders`:

In [118]:
# Training pipeline: only normalization for now; augmentations will be added to this composer later.
train_composer = Compose([normalizer])
train_dataset = TransformedTensorDataset(x_train_tensor, y_train_tensor, transform=train_composer)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Validation pipeline: same normalization, and the loader is not shuffled.
val_composer = Compose([normalizer])
val_dataset = TransformedTensorDataset(x_val_tensor, y_val_tensor, transform=val_composer)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)