In [1]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from matplotlib import pyplot as plt
from transformers import AdamW
import torch
from torch import nn
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
import torchvision
import os
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor ,SegformerFeatureExtractor,SegformerConfig
import pandas as pd
import cv2
import albumentations as aug
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchinfo import summary
import numpy as np
from datasets import load_metric
import evaluate
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import BasePredictionWriter

In [2]:
#os.environ['CUDA_LAUNCH_BLOCKING'] = "0"
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.cuda.empty_cache()
torch.set_float32_matmul_precision("high")
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
#torch.cuda.memory_summary(device=device, abbreviated=False)

In [4]:
class Model(torch.nn.Module):
    def __init__(self,id2label,model_name,label2id,num_classes):
        super(Model,self).__init__()
        self.id2label=id2label
        self.model_name=model_name
        self.label2id=label2id
        self.num_classes=num_classes
        self.model = SegformerForSemanticSegmentation.from_pretrained(
            self.model_name,
            ignore_mismatched_sizes=True,
            num_labels=self.num_classes,
            id2label=self.id2label,
            label2id=self.label2id,
            reshape_last_stage=True)
        self.model.config.num_labels=self.num_classes
        for para in self.model.parameters():
            para.requires_grad=True
    def forward(self,idx,mask):
        output=self.model(idx,mask)
        return output

In [5]:
print(torch.cuda.device_count())

1


In [6]:
class ImageSegmentationDataset(Dataset):
    """Image segmentation dataset."""
    def __init__(self, root_dir, feature_extractor,id2label, transforms=None, train=True):
        super(ImageSegmentationDataset,self).__init__()
        self.root_dir = root_dir
        self.id2label =  id2label
        self.feature_extractor = feature_extractor
        self.train = train
        self.transforms = transforms
        self.img_dir = os.path.join(self.root_dir, "images")
        self.ann_dir = os.path.join(self.root_dir, "pngmasks")
        image_file_names = []
        for root, dirs, files in os.walk(self.img_dir):
            image_file_names.extend(files)
        self.images = sorted(image_file_names)
        # read annotations
        annotation_file_names = []
        for root, dirs, files in os.walk(self.ann_dir):
            annotation_file_names.extend(files)
        self.annotations = sorted(annotation_file_names)
        assert len(self.images) == len(self.annotations) or len(self.images)==0 ,  "There must be as many images as there are segmentation maps"
    def __len__(self):
        return len(self.images)
    def __getitem__(self, idx):
        image = cv2.imread(os.path.join(self.img_dir, self.images[idx]))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        segmentation_map = cv2.imread(os.path.join(self.ann_dir, self.annotations[idx]))
        segmentation_map = cv2.cvtColor(segmentation_map, cv2.COLOR_BGR2GRAY)
        # randomly crop + pad both image and segmentation map to same size
        encoded_inputs = self.feature_extractor(image, segmentation_map, return_tensors="pt")
        for k,v in encoded_inputs.items():
          encoded_inputs[k].squeeze_() # remove batch dimension
        return encoded_inputs

In [7]:
class ImageSegmentationDatasetInfernce(Dataset):
    """Image segmentation dataset."""
    def __init__(self, image_dir,feature_extractor):
        super(ImageSegmentationDatasetInfernce,self).__init__()
        self.img_dir = image_dir
        self.feature_extractor=feature_extractor
        image_file_names = []
        for root, dirs, files in os.walk(self.img_dir):
            image_file_names.extend(files)
        self.images = sorted(image_file_names)
    def __len__(self):
        return len(self.images)
    def __getitem__(self, idx):
        image = cv2.imread(os.path.join(self.img_dir, self.images[idx]))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        encoded_inputs = self.feature_extractor(image, return_tensors="pt")
        for k,v in encoded_inputs.items():
          encoded_inputs[k].squeeze_() # remove batch dimension
        return encoded_inputs

In [8]:
class SegformerFinetuner(pl.LightningModule):
    def __init__(self, id2label,model_name, train_dataloader=None, val_dataloader=None, test_dataloader=None, metrics_interval=100):
        super(SegformerFinetuner, self).__init__()
        self.id2label = id2label
        self.metrics_interval = metrics_interval
        self.train_dl = train_dataloader
        self.val_dl = val_dataloader
        self.test_dl = test_dataloader
        self.num_classes = len(id2label.keys())
        self.label2id = {v: k for k, v in self.id2label.items()}
        self.model_name=model_name
        self.model =SegformerForSemanticSegmentation.from_pretrained(self.model_name,
                                                                    ignore_mismatched_sizes=True,
                                                   reshape_last_stage=True)
        
        self.model_torch_class=Model(id2label=self.id2label,
                         model_name=self.model_name,
                         label2id=self.label2id,
                         num_classes=self.num_classes)
        self.train_mean_iou = evaluate.load("mean_iou")
        self.val_mean_iou = evaluate.load("mean_iou")
        self.test_mean_iou = evaluate.load("mean_iou")
    def forward(self, images, masks=None):
        outputs = self.model(images,masks)
        return outputs
    def training_step(self, batch, batch_nb):
        images, masks = batch['pixel_values'], batch['labels']
        outputs = self(images, masks)
        loss, logits = outputs[0], outputs[1]
        upsampled_logits = nn.functional.interpolate(logits, size=masks.shape[-2:], mode="nearest-exact")
        predicted = upsampled_logits.argmax(dim=1)
        self.train_mean_iou.add_batch(predictions=predicted.detach().cpu().numpy(),
                                      references=masks.detach().cpu().numpy())
        if batch_nb % self.metrics_interval == 0:
            metrics = self.train_mean_iou.compute(num_labels=self.num_classes, ignore_index=255, reduce_labels=False, )
            metrics = {'loss': loss, "mean_iou": metrics["mean_iou"], "mean_accuracy": metrics["mean_accuracy"]}
            for k, v in metrics.items():
                self.log(k, v,enable_graph=True,prog_bar=True)
            self.log_predictions_to_tensorboard(images, masks, predicted, 'train')
            return metrics
        else:
            return {'loss': loss}

    def validation_step(self, batch, batch_nb):
        images, masks = batch['pixel_values'], batch['labels']
        outputs = self(images, masks)
        loss, logits = outputs[0], outputs[1]
        upsampled_logits = nn.functional.interpolate(logits, size=masks.shape[-2:], mode="nearest-exact")
        predicted = upsampled_logits.argmax(dim=1)
        self.val_mean_iou.add_batch(predictions=predicted.detach().cpu().numpy(),
                                    references=masks.detach().cpu().numpy())
        val_metrics = self.val_mean_iou.compute(num_labels=self.num_classes, ignore_index=255, reduce_labels=False)
        val_metrics = {'val_loss': loss, "val_mean_iou": val_metrics["mean_iou"],
                       "val_mean_accuracy": val_metrics["mean_accuracy"]}

        for k, v in val_metrics.items():
            self.log(k, v,enable_graph=True,prog_bar=True)
        self.log_predictions_to_tensorboard(images, masks, predicted, 'val')
        return val_metrics

    def test_step(self, batch, batch_nb):
        images, masks = batch['pixel_values'], batch['labels']
        outputs = self(images, masks)
        loss, logits = outputs[0], outputs[1]
        upsampled_logits = nn.functional.interpolate(logits, size=masks.shape[-2:], mode="nearest-exact")
        predicted = upsampled_logits.argmax(dim=1)
        self.test_mean_iou.add_batch(predictions=predicted.detach().cpu().numpy(),
                                                    references=masks.detach().cpu().numpy())
        test_metircs=self.test_mean_iou.compute(num_labels=self.num_classes,ignore_index=255,reduce_labels=False)
        test_metircs = {'test_loss': loss, "test_mean_iou": test_metircs["mean_iou"],
                        "test_mean_accuracy": test_metircs["mean_accuracy"]}
        for k, v in test_metircs.items():
            self.log(k, v,enable_graph=True,prog_bar=True)
        self.log_predictions_to_tensorboard(images, masks, predicted, 'test')
        return test_metircs     
    def configure_optimizers(self):
        return AdamW([p for p in self.parameters() if p.requires_grad], lr=2e-06, eps=1e-08)
    def train_dataloader(self):
        return self.train_dl
    def val_dataloader(self):
        return self.val_dl
    def test_dataloader(self):
        return self.test_dl
    def log_predictions_to_tensorboard(self, images, masks, predictions, mode='train'):
        img_grid = torchvision.utils.make_grid(images)
        mask_grid = torchvision.utils.make_grid(masks.unsqueeze(1))  # Assuming masks are single-channel
        pred_grid = torchvision.utils.make_grid(predictions.unsqueeze(1))  # Assuming predictions are single-channel
        self.logger.experiment.add_image(f'{mode}_images', img_grid, self.current_epoch)
        self.logger.experiment.add_image(f'{mode}_masks', mask_grid, self.current_epoch)
        self.logger.experiment.add_image(f'{mode}_predictions', pred_grid, self.current_epoch)

In [9]:
classes = ["background","stone"]
print(classes)
id2label = {0: classes[0],1:classes[1]}
print(id2label)
label2id = {v: k for k, v in id2label.items()}
print(label2id)

['background', 'stone']
{0: 'background', 1: 'stone'}
{'background': 0, 'stone': 1}


In [10]:
train_dir = "/home/cplus/projects/m.tarek_master/gravel_2D/graval_detection_project/datasets/under_water_masks_dataset/train"
valid_dir = "/home/cplus/projects/m.tarek_master/gravel_2D/graval_detection_project/datasets/under_water_masks_dataset/val"
test_dir = "/home/cplus/projects/m.tarek_master/gravel_2D/graval_detection_project/datasets/under_water_masks_dataset/test"
inference_dir="/home/cplus/projects/m.tarek_master/graval_detection_project/115351AA.mp4_"
model_name="nvidia/mit-b1"
feature_extractor = SegformerImageProcessor.from_pretrained(model_name)
feature_extractor.do_reduce_labels = False
feature_extractor.size = 1080

train_dataset = ImageSegmentationDataset(root_dir=train_dir, feature_extractor=feature_extractor,id2label=id2label, transforms=None, train=True)
valid_dataset = ImageSegmentationDataset(root_dir=valid_dir,feature_extractor=feature_extractor,id2label=id2label)
test_dataset = ImageSegmentationDataset(root_dir=test_dir,feature_extractor=feature_extractor,id2label=id2label)
inference_dataset=ImageSegmentationDatasetInfernce(image_dir=inference_dir,feature_extractor=feature_extractor)



In [11]:
print(feature_extractor)

SegformerImageProcessor {
  "_valid_processor_keys": [
    "images",
    "segmentation_maps",
    "do_resize",
    "size",
    "resample",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_reduce_labels",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "do_normalize": true,
  "do_reduce_labels": false,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "SegformerImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": 1080
}



In [12]:
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)
valid_dataloader = DataLoader(valid_dataset, batch_size=1,shuffle=False,num_workers=0)
test_dataloader  = DataLoader(test_dataset,batch_size=1,shuffle=False,num_workers=0)
#inference_dataloader=DataLoader(inference_dataset,batch_size=2,shuffle=False,num_workers=0)

In [13]:
logger = TensorBoardLogger(save_dir="logs", name="segformer_logs_b1")

In [15]:
SegformerFineTuner=SegformerFinetuner(id2label=id2label,train_dataloader=train_dataloader,val_dataloader=valid_dataloader,
                                    test_dataloader=test_dataloader,metrics_interval=10,model_name=model_name)
SegformerFineTuner.to(device=device)

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b1 and are newly initialized: ['decode_head.batch_norm.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_mean', 'decode_head.batch_norm.running_var', 'decode_head.batch_norm.weight', 'decode_head.classifier.bias', 'decode_head.classifier.weight', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b1 and are newly initialized: ['decode_head.batch_norm.bi

SegformerFinetuner(
  (model): SegformerForSemanticSegmentation(
    (segformer): SegformerModel(
      (encoder): SegformerEncoder(
        (patch_embeddings): ModuleList(
          (0): SegformerOverlapPatchEmbeddings(
            (proj): Conv2d(3, 64, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
            (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          )
          (1): SegformerOverlapPatchEmbeddings(
            (proj): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          )
          (2): SegformerOverlapPatchEmbeddings(
            (proj): Conv2d(128, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            (layer_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          )
          (3): SegformerOverlapPatchEmbeddings(
            (proj): Conv2d(320, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            

In [16]:
summary(model=SegformerFineTuner,device=SegformerFineTuner.device)

Layer (type:depth-idx)                                                                Param #
SegformerFinetuner                                                                    --
├─SegformerForSemanticSegmentation: 1-1                                               --
│    └─SegformerModel: 2-1                                                            --
│    │    └─SegformerEncoder: 3-1                                                     13,151,424
│    └─SegformerDecodeHead: 2-2                                                       --
│    │    └─ModuleList: 3-2                                                           263,168
│    │    └─Conv2d: 3-3                                                               262,144
│    │    └─BatchNorm2d: 3-4                                                          512
│    │    └─ReLU: 3-5                                                                 --
│    │    └─Dropout: 3-6                                                              

In [17]:
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=100, verbose=False, mode="min")
checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_loss")

In [18]:
trainer = pl.Trainer(max_epochs=1, val_check_interval=len(valid_dataloader), accelerator="gpu", devices=1,
                     callbacks=[early_stop_callback, checkpoint_callback], logger=logger,enable_progress_bar=True,fast_dev_run=False)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model=SegformerFineTuner,val_dataloaders=valid_dataloader,train_dataloaders=train_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type                             | Params
-----------------------------------------------------------------------
0 | model             | SegformerForSemanticSegmentation | 13.9 M
1 | model_torch_class | Model                            | 13.7 M
-----------------------------------------------------------------------
27.6 M    Trainable params
0         Non-trainable params
27.6 M    Total params
110.448   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

/home/cplus/projects/m.tarek_master/gravel_2D/graval_detection_project/gravel_env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
  iou = total_area_intersect / total_area_union
  acc = total_area_intersect / total_area_label
/home/cplus/projects/m.tarek_master/gravel_2D/graval_detection_project/gravel_env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

In [None]:
%load_ext tensorboard 
%tensorboard --logdir=lightning_logs/

In [None]:
trainer.validate(model=SegformerFineTuner,
                 dataloaders=valid_dataloader,
                 ckpt_path="/home/cplus/projects/m.tarek_master/graval_detection_project/transformers_for_CV/logs/segformer_logs_b1/version_0/checkpoints/epoch=98-step=56455.ckpt")

In [None]:
trainer.test(model=SegformerFineTuner,
             ckpt_path="/home/cplus/projects/m.tarek_master/graval_detection_project/transformers_for_CV/logs/segformer_logs_b1/version_0/checkpoints/epoch=98-step=56455.ckpt",
             dataloaders=test_dataloader)

In [None]:
trainer.predict(ckpt_path="/home/cplus/projects/m.tarek_master/graval_detection_project/transformers_for_CV/logs/segformer_logs_b1/version_0/checkpoints/epoch=98-step=56455.ckpt",
               model=SegformerFineTuner,
               dataloaders=inference_dataloader)

In [None]:
upsampled_logits = nn.functional.interpolate(logits, size=(1080,1080), mode="nearest-exact")
predicted = upsampled_logits.argmax(dim=0)
predicted=predicted.detach().cpu()

In [None]:
print(predicted.shape)

In [None]:
print(predicted.unique())

In [None]:
torch.save(obj=SegformerFineTuner.model,f="test2922024_2.pt")

In [None]:
image = Image.open('/home/cplus/projects/m.tarek_master/graval_detection_project/115351AA.mp4_/11_left.jpg')
image

In [None]:
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values.to("cpu")
print(pixel_values.shape)
print(pixel_values.device)

In [None]:
def ade_palette():
    """ADE20K palette that maps each class to RGB values."""
    return [[0,0,0],[255, 255, 255]]

In [None]:
with torch.no_grad():
    outputs = SegformerFineTuner.model(pixel_values=pixel_values)

logits = outputs.logits.cpu()
    

In [None]:
print(outputs)

In [None]:
print(logits)

In [None]:
predicted_segmentation_map = feature_extractor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

color_seg = np.zeros((predicted_segmentation_map.shape[0],
                      predicted_segmentation_map.shape[1], 3), dtype=np.uint8) # height, width, 3

palette = np.array(ade_palette())
for label, color in enumerate(palette):
    color_seg[predicted_segmentation_map == label, :] = color
 #Convert to BGR
color_seg = color_seg[..., ::-1]

# Show image + mask
img = np.array(image) * 0.5 + color_seg * 0.5
img = img.astype(np.uint8)

#plt.figure(figsize=(15, 10))
plt.imshow(predicted_segmentation_map)
plt.show()

In [None]:
"""with torch.no_grad():
    for image in inference_dataloader:
        outputs = SegformerFineTuner.model(pixel_values=image)
        
        print(predicted_segmentation_map)"""
        

In [None]:
"""with torch.no_grad():
    for idx,image in enumerate(inference_dataloader):
      outputs = SegformerFineTuner.model(image)"""

In [None]:
color_map = {
    0:(0,0,0),
    1:(0,255,0),
}

def prediction_to_vis(prediction):
    vis_shape = prediction.shape + (3,)
    vis = np.zeros(vis_shape)
    for i,c in color_map.items():
        vis[prediction == i] = color_map[i]
    return Image.fromarray(vis.astype(np.uint8))

for batch in inference_dataloader:
    images, masks = batch['pixel_values']
    outputs = SegformerFineTuner.model(images)
    loss, logits = outputs[0], outputs[1]
    upsampled_logits = nn.functional.interpolate(logits,size=masks.shape[-2:],mode="bilinear",align_corners=False)
    predicted_mask = upsampled_logits.argmax(dim=1).cpu().numpy()
    masks = masks.cpu().numpy()
n_plots = 4

f, axarr = plt.subplots(n_plots,2)
f.set_figheight(15)
f.set_figwidth(15)
for i in range(n_plots):
    axarr[i,0].imshow(prediction_to_vis(predicted_mask[i,:,:]))
    axarr[i,1].imshow(prediction_to_vis(masks[i,:,:]))
    break


In [None]:
color_map = {
    0: (0, 0, 0),
    1: (255, 255, 255),
}

def prediction_to_vis(prediction):
    vis_shape = prediction.shape + (3,)
    vis = np.zeros(vis_shape, dtype=np.uint8)
    for i, c in color_map.items():
        vis[prediction == i] = color_map[i]
    return Image.fromarray(vis)

save_dir = "predicted_masks"
os.makedirs(save_dir, exist_ok=True)

for batch_idx, batch in enumerate(test_dataloader):
    images, masks = batch['pixel_values'], batch['labels']
    outputs = SegformerFineTuner.model(images, masks)
        
    loss, logits = outputs[0], outputs[1]

    upsampled_logits = nn.functional.interpolate(logits, size=masks.shape[-2:], mode="bilinear", align_corners=False)

    predicted = upsampled_logits.argmax(dim=1).cpu().numpy()
    masks = masks.cpu().numpy()
    print(masks.shape)

    """for i in range(predicted.shape[0]):
        predicted_mask = prediction_to_vis(predicted[i, :, :])
        predicted_mask_path = os.path.join(save_dir, f"batch_{batch_idx}_image_{i}_predicted_mask.png")
        predicted_mask.save(predicted_mask_path)
        ground_truth_mask = prediction_to_vis(masks[i, :, :])
        ground_truth_mask_path = os.path.join(save_dir, f"batch_{batch_idx}_image_{i}_ground_truth_mask.png")
        ground_truth_mask.save(ground_truth_mask_path)
        plt.figure(figsize=(8, 4))
        plt.subplot(1, 2, 1)
        plt.imshow(predicted_mask)
        plt.title('Predicted')
        plt.subplot(1, 2, 2)
        plt.imshow(ground_truth_mask)
        plt.title('Ground Truth')
        plt.show()"""