In [None]:
# === colab configuration ===
# p.s. when training on colab, weights are saved on Drive (directory DSCAS/weights).
# p.p.s. skip this cell if running demo file locally!

! pip install torch-summary
! pip install ray
! pip install tensorboardX==2.5.1
! pip install tensorboard==2.11.2
from google.colab import drive, files
import sys

# setting paths
repository_path = '/content/deep-seasonal-color-analysis-system/'
dataset_path = repository_path + 'headsegmentation_dataset_ccncsa/'
dataset_path_drive = '/content/drive/MyDrive/DSCAS/headsegmentation_dataset_ccncsa/'
weights_path = repository_path + 'models/weights/'
weights_path_drive = '/content/drive/MyDrive/DSCAS/weights/'
checkpoint_path = repository_path + 'models/preprocessing/'
checkpoint_archive = '/content/' + checkpoint_path.split('/')[-2] + '.zip'

sys.path.insert(0, repository_path)

# cloning project repository and downloading dataset
drive.mount('/content/drive')
! test ! -d $repository_path && git clone https://github.com/mrcmich/deep-seasonal-color-analysis-system.git
! test ! -d $dataset_path && cp -R $dataset_path_drive $dataset_path
%cd $repository_path

# setting branch and pulling updates
branch = 'main'
! git checkout $branch
! git pull origin $branch

executing_on_colab = True

In [2]:
import os
import torch
from torch import nn, optim
import torchvision.transforms as T
from sklearn.model_selection import train_test_split
from models import dataset, training_and_testing
from models.cloud.UNet import unet
from metrics_and_losses import metrics
from utils import segmentation_labels, custom_transforms, model_names
from models import config
from slurm_scripts import slurm_config
from ray import tune
from ray.tune import CLIReporter
from functools import partial

# Local-run fallback: `executing_on_colab` is set to True by the colab configuration cell,
# so when the name is missing that cell was skipped and paths come from the project config.
if 'executing_on_colab' not in globals():
  executing_on_colab = False
  weights_path = config.WEIGHTS_PATH
  dataset_path = config.DATASET_PATH
  checkpoint_path = config.PREPROCESSING_PATH

In [3]:
# === defining transforms ===

# parameters of the bilateral filter applied to every input image
diameter = 7
sigma_color = 50
sigma_space = 100
# augmentation transforms: the same instance is placed in both the image and the target
# pipeline of the augmentation cells - presumably so that image and mask receive the same
# random decision (TODO confirm against custom_transforms)
horizontal_flip = custom_transforms.PartiallyDeterministicHorizontalFlip(p=0.5)
center_crop = custom_transforms.PartiallyDeterministicCenterCrop(p=0.5)

# baseline image pipeline (no augmentation): resize -> bilateral filter -> normalize
image_transform = T.Compose([
    T.Resize(slurm_config.GLOBAL_INPUT_SIZE_TRAINING_DEMO), 
    custom_transforms.BilateralFilter(sigma_color, sigma_space, diameter), 
    T.Normalize(config.NORMALIZE_MEAN, config.NORMALIZE_STD)])

# targets are only resized to the same size as the images
target_transform = T.Compose([T.Resize(slurm_config.GLOBAL_INPUT_SIZE_TRAINING_DEMO)])

# fetching dataset
n_classes = len(segmentation_labels.labels)
img_paths, label_paths = dataset.get_paths(dataset_path, file_name=config.DATASET_INDEX_NAME)
# only the training part (80%) of the fixed-seed split is used in this notebook
X_train, _, Y_train, _ = train_test_split(img_paths, label_paths, test_size=0.20, random_state=99, shuffle=True)
train_dataset = dataset.CcncsaDataset(X_train, Y_train, image_transform, target_transform)

# model parameters
model_name = "unet"
model = unet.UNet(out_channels=n_classes)
optimizer = optim.Adam  # the optimizer class itself is passed to train_model, not an instance

# setting up hyperparameters for training
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_epochs = 10
batch_size = 32
loss_fn = nn.CrossEntropyLoss()
score_fn = metrics.batch_mIoU
learning_rate = 0.01
class_weights = torch.tensor(config.CLASS_WEIGHTS, device=device)

# Ray Tune parameters
gpus_per_trial = torch.cuda.device_count()
# A trial has to request at least one resource: the GPU-only request is kept when GPUs are
# available, while on CPU-only machines (device == 'cpu') a single CPU is requested instead
# of an all-zero request.
cpus_per_trial = 0 if gpus_per_trial > 0 else 1
num_samples = 1  # Number of times each combination is sampled (n_epochs are done per sample)
metric = "val_loss"
metrics_columns = ["train_loss", "train_score", "val_loss", "val_score", "training_iteration"]
# directory in which Ray Tune stores the results of this model
local_dir = checkpoint_path + model_names.MODEL_NAMES[model_name]
max_report_frequency = 600  # passed to CLIReporter to limit how often the status table is printed
reporter = CLIReporter(
    metric_columns=metrics_columns, max_report_frequency=max_report_frequency)

In [None]:
# === training without augmentation ===

# Checkpoint directory handed to train_model through the trial config:
# local_dir as-is on colab, its absolute form (with trailing slash) otherwise.
if executing_on_colab:
    checkpoint_dir = local_dir
else:
    checkpoint_dir = os.path.abspath('./' + local_dir) + '/'

# trial configuration; later cells only change the "transform" entry
cfg = {
    "lr": learning_rate,
    "lr_scheduler": "none",
    "batch_size": batch_size,
    "transform": "none",
    "from_checkpoint": False,
    "checkpoint_dir": checkpoint_dir,
}

# train_model with every argument bound except the ones Ray Tune supplies
trainable = partial(
    training_and_testing.train_model,
    device=device,
    model=model,
    dataset=train_dataset,
    n_epochs=n_epochs,
    score_fn=score_fn,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_workers=(0, 0),
    evaluate=True,
    class_weights=class_weights,
)
trial_resources = {"cpu": cpus_per_trial, "gpu": gpus_per_trial}

tune.run(
    trainable,
    config=cfg,
    num_samples=num_samples,
    resources_per_trial=trial_resources,
    progress_reporter=reporter,
    checkpoint_at_end=True,
    checkpoint_freq=1,
    local_dir=local_dir,
)

2023-02-03 07:40:29,781	INFO worker.py:1538 -- Started a local Ray instance.


== Status ==
Current time: 2023-02-03 07:40:34 (running for 00:00:01.29)
Memory usage on this node: 4.0/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 1.0/1 GPUs, 0.0/7.39 GiB heap, 0.0/3.69 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_07-40-32
Number of trials: 1/1 (1 RUNNING)
+-------------------------+----------+-------------------+
| Trial name              | status   | loc               |
|-------------------------+----------+-------------------|
| train_model_0a031_00000 | RUNNING  | 172.28.0.12:68484 |
+-------------------------+----------+-------------------+




Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train_loss,train_score,training_iteration,trial_id,val_loss,val_score,warmup_time
train_model_0a031_00000,2023-02-03_08-37-07,True,,85bf0c216daa4f248d6bda6d30aaeb2d,0,2a2887f72dc2,10,172.28.0.12,68484,3389.39,336.489,3389.39,1675413427,0,,0.150644,0.57162,10,0a031_00000,0.160575,0.59436,0.00316453


== Status ==
Current time: 2023-02-03 07:50:36 (running for 00:10:03.23)
Memory usage on this node: 6.8/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 1.0/1 GPUs, 0.0/7.39 GiB heap, 0.0/3.69 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_07-40-32
Number of trials: 1/1 (1 RUNNING)
+-------------------------+----------+-------------------+--------------+---------------+------------+-------------+----------------------+
| Trial name              | status   | loc               |   train_loss |   train_score |   val_loss |   val_score |   training_iteration |
|-------------------------+----------+-------------------+--------------+---------------+------------+-------------+----------------------|
| train_model_0a031_00000 | RUNNING  | 172.28.0.12:68484 |      0.71635 |      0.334862 |   0.694573 |    0.321099 |                    1 |
+-------------------------+----------+-------------------+

2023-02-03 08:37:07,548	INFO tune.py:762 -- Total run time: 3396.06 seconds (3394.34 seconds for the tuning loop).


== Status ==
Current time: 2023-02-03 08:37:07 (running for 00:56:34.34)
Memory usage on this node: 6.8/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.39 GiB heap, 0.0/3.69 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_07-40-32
Number of trials: 1/1 (1 TERMINATED)
+-------------------------+------------+-------------------+--------------+---------------+------------+-------------+----------------------+
| Trial name              | status     | loc               |   train_loss |   train_score |   val_loss |   val_score |   training_iteration |
|-------------------------+------------+-------------------+--------------+---------------+------------+-------------+----------------------|
| train_model_0a031_00000 | TERMINATED | 172.28.0.12:68484 |     0.150644 |       0.57162 |   0.160575 |     0.59436 |                   10 |
+-------------------------+------------+---------

<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f6d5a7f3c40>

In [None]:
# downloading updated training results if running on colab
! if test $executing_on_colab = 'True' ; then zip -r $checkpoint_archive $checkpoint_path ; fi
if executing_on_colab:
    files.download(checkpoint_archive)

In [None]:
# === training with augmentation: ColorJitter ===

# baseline image pipeline preceded by a random brightness/contrast jitter
color_jitter_steps = [
    T.ColorJitter(brightness=0.25, contrast=0.25),
    T.Resize(slurm_config.GLOBAL_INPUT_SIZE_TRAINING_DEMO),
    custom_transforms.BilateralFilter(sigma_color, sigma_space, diameter),
    T.Normalize(config.NORMALIZE_MEAN, config.NORMALIZE_STD),
]
image_transform_with_augmentation = T.Compose(color_jitter_steps)

# the jitter only alters the image, so targets keep the plain target_transform
train_dataset_with_augmentation = dataset.CcncsaDataset(
    X_train,
    Y_train,
    image_transform_with_augmentation,
    target_transform,
)

# new model instance for this run
model = unet.UNet(out_channels=n_classes)

cfg.update({"transform": "color_jitter"})

# train_model with every argument bound except the ones Ray Tune supplies
trainable = partial(
    training_and_testing.train_model,
    device=device,
    model=model,
    dataset=train_dataset_with_augmentation,
    n_epochs=n_epochs,
    score_fn=score_fn,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_workers=(0, 0),
    evaluate=True,
    class_weights=class_weights,
)
trial_resources = {"cpu": cpus_per_trial, "gpu": gpus_per_trial}

tune.run(
    trainable,
    config=cfg,
    num_samples=num_samples,
    resources_per_trial=trial_resources,
    progress_reporter=reporter,
    checkpoint_at_end=True,
    checkpoint_freq=1,
    local_dir=local_dir,
)



== Status ==
Current time: 2023-02-03 08:47:44 (running for 00:00:01.18)
Memory usage on this node: 4.0/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 1.0/1 GPUs, 0.0/7.39 GiB heap, 0.0/3.69 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_08-47-42
Number of trials: 1/1 (1 RUNNING)
+-------------------------+----------+-------------------+
| Trial name              | status   | loc               |
|-------------------------+----------+-------------------|
| train_model_6c3ff_00000 | RUNNING  | 172.28.0.12:86027 |
+-------------------------+----------+-------------------+




Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train_loss,train_score,training_iteration,trial_id,val_loss,val_score,warmup_time
train_model_6c3ff_00000,2023-02-03_09-47-21,True,,b3027006f877478f8e27ad70a1b63d52,0,2a2887f72dc2,10,172.28.0.12,86027,3572.55,355.919,3572.55,1675417641,0,,0.143176,0.584405,10,6c3ff_00000,0.162529,0.588042,0.00328732


== Status ==
Current time: 2023-02-03 08:57:47 (running for 00:10:04.64)
Memory usage on this node: 7.0/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 1.0/1 GPUs, 0.0/7.39 GiB heap, 0.0/3.69 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_08-47-42
Number of trials: 1/1 (1 RUNNING)
+-------------------------+----------+-------------------+--------------+---------------+------------+-------------+----------------------+
| Trial name              | status   | loc               |   train_loss |   train_score |   val_loss |   val_score |   training_iteration |
|-------------------------+----------+-------------------+--------------+---------------+------------+-------------+----------------------|
| train_model_6c3ff_00000 | RUNNING  | 172.28.0.12:86027 |     0.741406 |      0.326032 |   0.760862 |    0.317335 |                    1 |
+-------------------------+----------+-------------------+

2023-02-03 09:47:21,599	INFO tune.py:762 -- Total run time: 3579.24 seconds (3578.46 seconds for the tuning loop).


== Status ==
Current time: 2023-02-03 09:47:21 (running for 00:59:38.47)
Memory usage on this node: 7.0/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.39 GiB heap, 0.0/3.69 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_08-47-42
Number of trials: 1/1 (1 TERMINATED)
+-------------------------+------------+-------------------+--------------+---------------+------------+-------------+----------------------+
| Trial name              | status     | loc               |   train_loss |   train_score |   val_loss |   val_score |   training_iteration |
|-------------------------+------------+-------------------+--------------+---------------+------------+-------------+----------------------|
| train_model_6c3ff_00000 | TERMINATED | 172.28.0.12:86027 |     0.143176 |      0.584405 |   0.162529 |    0.588042 |                   10 |
+-------------------------+------------+---------

<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f6d5769c7c0>

In [None]:
# downloading updated training results if running on colab
! if test $executing_on_colab = 'True' ; then zip -r $checkpoint_archive $checkpoint_path ; fi
if executing_on_colab:
    files.download(checkpoint_archive)

In [5]:
# === training with augmentation: horizontal_flip ===

# the shared horizontal_flip instance is the first step of both pipelines
flip_image_steps = [
    horizontal_flip,
    T.Resize(slurm_config.GLOBAL_INPUT_SIZE_TRAINING_DEMO),
    custom_transforms.BilateralFilter(sigma_color, sigma_space, diameter),
    T.Normalize(config.NORMALIZE_MEAN, config.NORMALIZE_STD),
]
flip_target_steps = [
    horizontal_flip,
    T.Resize(slurm_config.GLOBAL_INPUT_SIZE_TRAINING_DEMO),
]
image_transform_with_augmentation = T.Compose(flip_image_steps)
target_transform_with_augmentation = T.Compose(flip_target_steps)

train_dataset_with_augmentation = dataset.CcncsaDataset(
    X_train,
    Y_train,
    image_transform_with_augmentation,
    target_transform_with_augmentation,
)

# new model instance for this run
model = unet.UNet(out_channels=n_classes)

cfg.update({"transform": "horizontal_flip"})

# train_model with every argument bound except the ones Ray Tune supplies
trainable = partial(
    training_and_testing.train_model,
    device=device,
    model=model,
    dataset=train_dataset_with_augmentation,
    n_epochs=n_epochs,
    score_fn=score_fn,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_workers=(0, 0),
    evaluate=True,
    class_weights=class_weights,
)
trial_resources = {"cpu": cpus_per_trial, "gpu": gpus_per_trial}

tune.run(
    trainable,
    config=cfg,
    num_samples=num_samples,
    resources_per_trial=trial_resources,
    progress_reporter=reporter,
    checkpoint_at_end=True,
    checkpoint_freq=1,
    local_dir=local_dir,
)

2023-02-03 12:41:59,815	INFO worker.py:1538 -- Started a local Ray instance.


== Status ==
Current time: 2023-02-03 12:42:03 (running for 00:00:01.33)
Memory usage on this node: 3.9/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 1.0/1 GPUs, 0.0/7.41 GiB heap, 0.0/3.71 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_12-42-01
Number of trials: 1/1 (1 RUNNING)
+-------------------------+----------+-------------------+
| Trial name              | status   | loc               |
|-------------------------+----------+-------------------|
| train_model_27e40_00000 | RUNNING  | 172.28.0.12:61278 |
+-------------------------+----------+-------------------+




Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train_loss,train_score,training_iteration,trial_id,val_loss,val_score,warmup_time
train_model_27e40_00000,2023-02-03_13-41-29,True,,4826087e234f4fc3827640cb7f61fed0,0,5fc79c42c9a4,10,172.28.0.12,61278,3562.98,351.353,3562.98,1675431689,0,,0.188617,0.535956,10,27e40_00000,0.201667,0.510318,0.0030849


== Status ==
Current time: 2023-02-03 12:52:04 (running for 00:10:02.15)
Memory usage on this node: 6.9/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 1.0/1 GPUs, 0.0/7.41 GiB heap, 0.0/3.71 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_12-42-01
Number of trials: 1/1 (1 RUNNING)
+-------------------------+----------+-------------------+--------------+---------------+------------+-------------+----------------------+
| Trial name              | status   | loc               |   train_loss |   train_score |   val_loss |   val_score |   training_iteration |
|-------------------------+----------+-------------------+--------------+---------------+------------+-------------+----------------------|
| train_model_27e40_00000 | RUNNING  | 172.28.0.12:61278 |     0.711323 |      0.338428 |   0.629251 |    0.350198 |                    1 |
+-------------------------+----------+-------------------+

2023-02-03 13:41:29,905	INFO tune.py:762 -- Total run time: 3569.03 seconds (3567.92 seconds for the tuning loop).


== Status ==
Current time: 2023-02-03 13:41:29 (running for 00:59:27.94)
Memory usage on this node: 6.7/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.41 GiB heap, 0.0/3.71 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_12-42-01
Number of trials: 1/1 (1 TERMINATED)
+-------------------------+------------+-------------------+--------------+---------------+------------+-------------+----------------------+
| Trial name              | status     | loc               |   train_loss |   train_score |   val_loss |   val_score |   training_iteration |
|-------------------------+------------+-------------------+--------------+---------------+------------+-------------+----------------------|
| train_model_27e40_00000 | TERMINATED | 172.28.0.12:61278 |     0.188617 |      0.535956 |   0.201667 |    0.510318 |                   10 |
+-------------------------+------------+---------

<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f047170a3a0>

In [None]:
# downloading updated training results if running on colab
! if test $executing_on_colab = 'True' ; then zip -r $checkpoint_archive $checkpoint_path ; fi
if executing_on_colab:
    files.download(checkpoint_archive)

In [7]:
# === training with augmentation: center_crop ===

# the shared center_crop instance is the first step of both pipelines
crop_image_steps = [
    center_crop,
    T.Resize(slurm_config.GLOBAL_INPUT_SIZE_TRAINING_DEMO),
    custom_transforms.BilateralFilter(sigma_color, sigma_space, diameter),
    T.Normalize(config.NORMALIZE_MEAN, config.NORMALIZE_STD),
]
crop_target_steps = [
    center_crop,
    T.Resize(slurm_config.GLOBAL_INPUT_SIZE_TRAINING_DEMO),
]
image_transform_with_augmentation = T.Compose(crop_image_steps)
target_transform_with_augmentation = T.Compose(crop_target_steps)

train_dataset_with_augmentation = dataset.CcncsaDataset(
    X_train,
    Y_train,
    image_transform_with_augmentation,
    target_transform_with_augmentation,
)

# new model instance for this run
model = unet.UNet(out_channels=n_classes)

cfg.update({"transform": "center_crop"})

# train_model with every argument bound except the ones Ray Tune supplies
trainable = partial(
    training_and_testing.train_model,
    device=device,
    model=model,
    dataset=train_dataset_with_augmentation,
    n_epochs=n_epochs,
    score_fn=score_fn,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_workers=(0, 0),
    evaluate=True,
    class_weights=class_weights,
)
trial_resources = {"cpu": cpus_per_trial, "gpu": gpus_per_trial}

tune.run(
    trainable,
    config=cfg,
    num_samples=num_samples,
    resources_per_trial=trial_resources,
    progress_reporter=reporter,
    checkpoint_at_end=True,
    checkpoint_freq=1,
    local_dir=local_dir,
)



== Status ==
Current time: 2023-02-03 13:49:21 (running for 00:00:01.16)
Memory usage on this node: 4.0/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 1.0/1 GPUs, 0.0/7.41 GiB heap, 0.0/3.71 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_13-49-19
Number of trials: 1/1 (1 RUNNING)
+-------------------------+----------+-------------------+
| Trial name              | status   | loc               |
|-------------------------+----------+-------------------|
| train_model_8ed32_00000 | RUNNING  | 172.28.0.12:79096 |
+-------------------------+----------+-------------------+




Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train_loss,train_score,training_iteration,trial_id,val_loss,val_score,warmup_time
train_model_8ed32_00000,2023-02-03_14-46-01,True,,c8d43b0a961b45f5bd18919d5b4cd70d,0,5fc79c42c9a4,10,172.28.0.12,79096,3395.82,339.044,3395.82,1675435561,0,,0.160402,0.580858,10,8ed32_00000,0.168652,0.594522,0.00411224


== Status ==
Current time: 2023-02-03 13:59:22 (running for 00:10:03.13)
Memory usage on this node: 7.0/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 1.0/1 GPUs, 0.0/7.41 GiB heap, 0.0/3.71 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_13-49-19
Number of trials: 1/1 (1 RUNNING)
+-------------------------+----------+-------------------+--------------+---------------+------------+-------------+----------------------+
| Trial name              | status   | loc               |   train_loss |   train_score |   val_loss |   val_score |   training_iteration |
|-------------------------+----------+-------------------+--------------+---------------+------------+-------------+----------------------|
| train_model_8ed32_00000 | RUNNING  | 172.28.0.12:79096 |     0.742509 |      0.332602 |   0.639785 |    0.342014 |                    1 |
+-------------------------+----------+-------------------+

2023-02-03 14:46:01,992	INFO tune.py:762 -- Total run time: 3402.78 seconds (3402.04 seconds for the tuning loop).


== Status ==
Current time: 2023-02-03 14:46:01 (running for 00:56:42.04)
Memory usage on this node: 7.0/12.7 GiB 
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.41 GiB heap, 0.0/3.71 GiB objects
Result logdir: /content/deep-seasonal-color-analysis-system/models/preprocessing/UNet/train_model_2023-02-03_13-49-19
Number of trials: 1/1 (1 TERMINATED)
+-------------------------+------------+-------------------+--------------+---------------+------------+-------------+----------------------+
| Trial name              | status     | loc               |   train_loss |   train_score |   val_loss |   val_score |   training_iteration |
|-------------------------+------------+-------------------+--------------+---------------+------------+-------------+----------------------|
| train_model_8ed32_00000 | TERMINATED | 172.28.0.12:79096 |     0.160402 |      0.580858 |   0.168652 |    0.594522 |                   10 |
+-------------------------+------------+---------

<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f0470c5e730>

In [None]:
# downloading updated training results if running on colab
! if test $executing_on_colab = 'True' ; then zip -r $checkpoint_archive $checkpoint_path ; fi
if executing_on_colab:
    files.download(checkpoint_archive)