In [9]:
%load_ext autoreload
%autoreload 2
import sys
# remove last element from path
import os
from os.path import abspath, join, exists
sys.path.pop()  # preexisting imports path messing up imports
sys.path.append(abspath(join('..')))  # ,'src'
print("\n".join(sys.path))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/petrmiculek/Code/kaggle/sentinel_clouds/notebooks
/usr/lib/python310.zip
/usr/lib/python3.10
/usr/lib/python3.10/lib-dynload

/home/petrmiculek/.local/share/virtualenvs/kaggle-DjzpaDb6/lib/python3.10/site-packages
/home/petrmiculek/Code/kaggle/sentinel_clouds


In [10]:
# standard library
from copy import deepcopy
from types import SimpleNamespace
# external
import numpy as np
import torch
from torch.nn import MSELoss, BCELoss, BCEWithLogitsLoss
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.functional import sigmoid
from torch.cuda.amp import GradScaler
from torch import autocast
from tqdm.notebook import tqdm
from torchinfo import summary
# local
from src.data.dataset import CloudDataset, get_loaders
from src.models.unet import UNet
from src.models.util import EarlyStopping

In [11]:
# Hyperparameters, grouped by pipeline stage.
HP = SimpleNamespace(
    # --- Preprocessing: nothing tunable yet ---
    # --- Data ---
    workers=0,      # forwarded as `num_workers` to the data loaders
    batch_size=1,
    # --- Model: nothing tunable yet ---
    # --- Training ---
    epochs=5,
    # warmup_prop=0.1,
    lr=1e-3,
)

In [12]:
# Location of the sample data on the local machine.
path_data = '/mnt/sdb1/code/sentinel2/sample'
# Keyword arguments forwarded to get_loaders: dataset options first,
# then loader options driven by the hyperparameters.
dataset_kwargs = dict(tile_size=224, crop_pad_mask='crop')
loader_kwargs = dict(
    batch_size=HP.batch_size,
    num_workers=HP.workers,
    pin_memory=True,
)
# `loader` is indexed by split name ('train' / 'val') later in the notebook.
loader = get_loaders(path_data, **dataset_kwargs, **loader_kwargs)

In [13]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# 4-channel input UNet, moved to the selected device.
model = UNet(in_channels=4)
model = model.to(device)
summary(model, input_size=(1, 4, 224, 224))

# Loss operates on raw logits (sigmoid is applied inside the loss).
criterion = BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=HP.lr, weight_decay=1e-2)
# Mixed precision training (16-bit): scales the loss before backward.
scaler = GradScaler()
# Both helpers below are stepped with the validation loss once per epoch.
early_stopping = EarlyStopping(patience=5)
scheduler = ReduceLROnPlateau(optimizer, patience=3)
device

device(type='cuda')

In [14]:
def evaluate(model, loader):
    """Run `model` over `loader` without gradients; collect loss, labels and predictions.

    Relies on the notebook-level `device` and `criterion`. The loss is computed
    on raw logits from `model(x)` and the returned predictions are
    `sigmoid(logits)`, mirroring what the training loop does, so training and
    validation numbers are comparable.

    Side effect: the model is switched to eval mode and left in that mode.

    Args:
        model: network whose forward pass returns logits.
        loader: iterable of dict batches with 'image' and 'label' tensors.

    Returns:
        dict with
            'loss':   mean of the per-batch loss values,
            'labels': all labels concatenated along axis 0 (numpy),
            'preds':  all sigmoid probabilities concatenated along axis 0 (numpy).

    Raises:
        ValueError: if `loader` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    n_batches = 0
    labels, preds = [], []
    with torch.no_grad():
        for s in tqdm(loader):
            x, y = s['image'].to(device), s['label'].to(device)
            # NOTE(review): this previously used model.predict(x) for both the
            # loss and the stored predictions. The criterion expects logits and
            # the metrics threshold probabilities at 0.5, so one tensor cannot
            # serve both purposes. Confirm that predict() adds nothing beyond
            # a sigmoid.
            logits = model(x)
            total_loss += criterion(logits, y).item()
            n_batches += 1
            labels.append(y.cpu().numpy())
            preds.append(torch.sigmoid(logits).cpu().numpy())
    if n_batches == 0:
        raise ValueError('evaluate() received an empty loader')
    # Average over the batches actually seen rather than len(loader).
    return {"loss": total_loss / n_batches,
            "labels": np.concatenate(labels),
            "preds": np.concatenate(preds)}

In [15]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, jaccard_score
def compute_metrics(labels, preds, threshold=0.5):
    """Compute binary classification metrics over all elements of the inputs.

    Args:
        labels: array of ground-truth values; flattened before scoring.
        preds: array of prediction scores, same number of elements as `labels`.
        threshold: scores strictly greater than this count as positive
            (default 0.5, the previously hard-coded value).

    Returns:
        dict with keys 'Accuracy', 'Precision', 'Recall', 'F1', 'IoU'.
        Metrics that are undefined for the given data (e.g. no positive
        predictions) are reported as 0 without emitting a warning.
    """
    labels = labels.flatten()
    preds = preds.flatten() > threshold
    scores = {'Accuracy': accuracy_score(labels, preds)}
    # These four share the zero_division option; 0 matches the value the
    # default 'warn' setting returns, minus the per-epoch warning spam.
    ratio_metrics = (('Precision', precision_score),
                     ('Recall', recall_score),
                     ('F1', f1_score),
                     ('IoU', jaccard_score))
    for name, metric_fn in ratio_metrics:
        scores[name] = metric_fn(labels, preds, zero_division=0)
    return scores

In [20]:
# Training
#
# One epoch = a mixed-precision pass over loader['train'], followed by a full
# validation pass, LR scheduling and early-stopping bookkeeping.
# Ctrl+C finishes the current epoch's bookkeeping (using only the batches that
# completed) and then leaves the loop.
best_accu_val = 0
best_res = None
epochs_trained = 0
stop_training = False
grad_acc_steps = 1
for epoch in range(epochs_trained, epochs_trained + HP.epochs):
    model.train()
    batch_losses = []    # unscaled loss of every completed batch
    batch_outputs = []   # (labels, probabilities) per completed batch, appended atomically
    n_batches = len(loader['train'])
    # Drop any gradients left over from a previously interrupted run.
    optimizer.zero_grad(set_to_none=True)
    try:
        progress_bar = tqdm(loader['train'], mininterval=1., desc=f'ep{epoch} train')
        for i, sample in enumerate(progress_bar, start=1):
            img = sample['image'].to(device, non_blocking=True)
            label = sample['label'].to(device, non_blocking=True)
            # forward pass
            with autocast(device_type='cuda', dtype=torch.float16):
                logits = model(img)  # prediction
                loss = criterion(logits, label)
            # backward pass; dividing keeps the accumulated gradient equal to
            # the mean over the accumulation group (no-op for grad_acc_steps == 1)
            scaler.scale(loss / grad_acc_steps).backward()
            # Step on every full group, and on the last batch so a trailing
            # partial group is not carried into the next epoch.
            if i % grad_acc_steps == 0 or i == n_batches:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
            # save loss and predictions for the epoch summary
            loss_value = loss.item()
            batch_losses.append(loss_value)
            with torch.no_grad():
                pred = torch.sigmoid(logits)
                batch_outputs.append((label.cpu().numpy(), pred.cpu().numpy()))

            progress_bar.set_postfix(loss=f'{loss_value:.4f}', refresh=False)
        # end of training epoch loop
    except KeyboardInterrupt:
        print('Ctrl+C stopped training')
        stop_training = True

    if not batch_outputs:
        # Nothing completed (empty loader or immediate interrupt): there is
        # nothing to average or score, so stop instead of crashing.
        print('No training batches were processed; stopping')
        break

    # Average over the batches that actually ran, not the loader length,
    # so an interrupted epoch does not report a deflated loss.
    ep_train_loss = sum(batch_losses) / len(batch_losses)
    # compute training metrics
    labels_train = np.concatenate([lbl for lbl, _ in batch_outputs])
    preds_train = np.concatenate([prd for _, prd in batch_outputs])
    metrics_train = compute_metrics(labels_train, preds_train)

    # Validation loop
    model.eval()
    results_val = evaluate(model, loader['val'])
    metrics_val = compute_metrics(results_val['labels'], results_val['preds'])
    # log results
    res_epoch = {'Loss Training': ep_train_loss, 'Loss Validation': results_val['loss'],
                 'Accuracy Training': metrics_train['Accuracy'], 'Accuracy Validation': metrics_val['Accuracy']}
    # print_dict(res_epoch)
    if metrics_val['Accuracy'] >= best_accu_val:  # save best results
        best_accu_val = metrics_val['Accuracy']
        # keep an independent copy so later epochs cannot alter it
        best_res = deepcopy(res_epoch)
    # Experiment logging is disabled; `wb` is not defined in this notebook.
    # wb.log(res_epoch, step=epoch)
    epochs_trained += 1
    scheduler.step(results_val['loss'])  # LR scheduler
    early_stopping(results_val['loss'], model)  # model checkpointing
    if early_stopping.early_stop or stop_training:
        print('Early stopping')
        break

ep0 train:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

ep1 train:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 5


ep2 train:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

EarlyStopping counter: 2 out of 5


ep3 train:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

EarlyStopping counter: 3 out of 5


ep4 train:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

EarlyStopping counter: 4 out of 5


In [None]:
# Flip to True to write the trained weights to disk (model.pt and model.onnx
# in the current working directory).
EXPORT_MODEL = False
if EXPORT_MODEL:
    # save model as .pt
    torch.save(model.state_dict(), 'model.pt')
    # save model as onnx; the dummy input is created on the model's own device
    # so the export also works when training fell back to the CPU
    export_device = next(model.parameters()).device
    dummy_input = torch.randn(1, 4, 224, 224, device=export_device)
    torch.onnx.export(model, dummy_input, "model.onnx", verbose=True, opset_version=11,
                      input_names=['input'], output_names=['output'])