Segmentation Task에서 가장 기초적이고 대표적인 'Unet' 구조를 활용하여 구현

RLE 인코딩과 RLE 디코딩에 대한 코드 포함

모델 구조를 U-Net에서 DeepLab v3+로 변경

In [None]:
import os

from io import BytesIO
from matplotlib import gridspec
from matplotlib import pyplot as plt
from PIL import Image
import torch.nn.functional as F
import tensorflow as tf

import torchvision.models as models

import cv2
import pandas as pd
import numpy as np
from typing import List, Union
from joblib import Parallel, delayed

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


# **Utils**

In [None]:
# RLE decoding function
def rle_decode(mask_rle: Union[str, int], shape=(224, 224)) -> np.array:
    '''
    Decode a run-length-encoded mask into a binary 2-D array.

    mask_rle: whitespace-separated "start length start length ..." pairs,
              where starts are 1-based offsets into the row-major flattened
              image; or the sentinel -1 (as an int or as the string '-1')
              meaning "no mask"
    shape: (height, width) of array to return
    Returns a uint8 numpy array of `shape`, 1 - mask, 0 - background
    Raises ValueError when the string does not contain complete
    (start, length) pairs.
    '''
    # Integer sentinel: an empty mask. Always uint8 so that both return
    # paths produce the same dtype.
    if not isinstance(mask_rle, str) and mask_rle == -1:
        return np.zeros(shape, dtype=np.uint8)

    tokens = mask_rle.split()
    # The same sentinel can arrive as text once it has been written to and
    # read back from a CSV column that also contains RLE strings.
    if tokens == ['-1']:
        return np.zeros(shape, dtype=np.uint8)
    if len(tokens) % 2:
        raise ValueError(
            f'RLE string must hold (start, length) pairs, got {len(tokens)} values'
        )

    starts = np.asarray(tokens[0::2], dtype=int) - 1  # 1-based -> 0-based
    lengths = np.asarray(tokens[1::2], dtype=int)
    ends = starts + lengths

    img = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo:hi] = 1
    return img.reshape(shape)

def dice_score(prediction: np.array, ground_truth: np.array, smooth=1e-7) -> float:
    '''
    Return the Dice coefficient of two binary masks.

    prediction, ground_truth: arrays of the same shape holding 0/1 values
    smooth: small constant added to numerator and denominator so that two
            empty masks score 1.0 instead of dividing by zero
    '''
    overlap = np.sum(prediction * ground_truth)
    numerator = 2.0 * overlap + smooth
    denominator = np.sum(prediction) + np.sum(ground_truth) + smooth
    return numerator / denominator


def calculate_dice_scores(ground_truth_df, prediction_df, img_shape=(224, 224)) -> float:
    '''
    Calculate the mean Dice score for a dataset.

    ground_truth_df: DataFrame whose first column is the image id and whose
                     second column is the ground-truth RLE mask
    prediction_df: DataFrame with the same two-column layout holding the
                   predicted RLE masks
    img_shape: (height, width) used to decode every mask
    Returns the mean Dice score over all image pairs in which at least one
    of the two masks is non-empty, or NaN when no such pair exists.

    NOTE: after filtering, rows are paired by position, not by image id, so
    both frames are expected to list the images in the same order.
    '''
    # Keep only the rows in the prediction dataframe that have matching
    # img_ids in the ground truth dataframe. reset_index returns a fresh
    # frame instead of assigning an index onto a filtered slice.
    keep = prediction_df.iloc[:, 0].isin(ground_truth_df.iloc[:, 0])
    prediction_df = prediction_df[keep].reset_index(drop=True)

    # Extract the mask_rle columns
    pred_mask_rle = prediction_df.iloc[:, 1]
    gt_mask_rle = ground_truth_df.iloc[:, 1]

    def calculate_dice(pred_rle, gt_rle):
        pred_mask = rle_decode(pred_rle, img_shape)
        gt_mask = rle_decode(gt_rle, img_shape)

        if np.sum(gt_mask) > 0 or np.sum(pred_mask) > 0:
            return dice_score(pred_mask, gt_mask)
        return None  # Both masks are empty: the pair is excluded from the mean

    dice_scores = Parallel(n_jobs=-1)(
        delayed(calculate_dice)(pred_rle, gt_rle)
        for pred_rle, gt_rle in zip(pred_mask_rle, gt_mask_rle)
    )

    dice_scores = [score for score in dice_scores if score is not None]

    # np.mean([]) would also give NaN, but with a RuntimeWarning.
    if not dice_scores:
        return np.nan
    return np.mean(dice_scores)

# RLE encoding function
def rle_encode(mask):
    '''
    Encode a binary mask as a run-length string.

    mask: array of 0/1 values; it is flattened before encoding
    Returns "start length start length ..." with 1-based starts, or an
    empty string when the mask contains no foreground pixel.
    '''
    flat = mask.flatten()
    # A zero on each side makes every run begin and end with a transition.
    padded = np.concatenate([[0], flat, [0]])
    runs = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Odd positions hold run ends; turn them into run lengths.
    runs[1::2] -= runs[0::2]
    return ' '.join(map(str, runs))

# **Custom Dataset**

In [None]:
class SatelliteDataset(Dataset):
    '''
    Dataset backed by a CSV whose second column holds an image path and,
    for training data, whose third column holds the RLE-encoded mask.

    csv_file: path of the CSV to load (passed to pandas.read_csv)
    transform: optional callable invoked as transform(image=..., mask=...)
               (transform(image=...) when infer=True) that returns a
               mapping with 'image' / 'mask' entries
    infer: when True, __getitem__ returns only the image
    '''
    def __init__(self, csv_file, transform=None, infer=False):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.infer = infer
        # Remember the CSV's folder so that relative image paths which do
        # not exist under the working directory can be retried from there.
        if isinstance(csv_file, (str, os.PathLike)):
            self.root_dir = os.path.dirname(os.path.abspath(csv_file))
        else:
            self.root_dir = None

    def __len__(self):
        return len(self.data)

    def _load_image(self, img_path):
        # Read an image as RGB, failing loudly when it cannot be read.
        root_dir = getattr(self, 'root_dir', None)
        if (
            root_dir is not None
            and not os.path.isabs(img_path)
            and not os.path.exists(img_path)
        ):
            candidate = os.path.join(root_dir, img_path)
            if os.path.exists(candidate):
                img_path = candidate

        image = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def __getitem__(self, idx):
        '''
        Return the transformed image when infer=True, otherwise the
        (image, mask) pair for row `idx`.
        Raises FileNotFoundError when the image cannot be read.
        '''
        image = self._load_image(self.data.iloc[idx, 1])

        if self.infer:
            if self.transform is not None:
                image = self.transform(image=image)['image']
            return image

        # Decode the mask at the image's own resolution, before any resize.
        mask_rle = self.data.iloc[idx, 2]
        mask = rle_decode(mask_rle, (image.shape[0], image.shape[1]))

        if self.transform is not None:
            augmented = self.transform(image=image, mask=mask)
            image = augmented['image']
            mask = augmented['mask']

        return image, mask

# **Data Loader**

# Dataset Info.

### train_img [폴더]
TRAIN_0000.png ~ TRAIN_7139.png


### test_img [폴더]
TEST_00000.png ~ TEST_60639.png


### train.csv [파일]
img_id : 학습 위성 이미지 샘플 ID
img_path : 학습 위성 이미지 경로 (상대 경로)
mask_rle : RLE 인코딩된 이진마스크(0 : 배경, 1 : 건물) 정보
학습 위성 이미지에는 반드시 건물이 포함되어 있습니다.
그러나 추론 위성 이미지에는 건물이 포함되어 있지 않을 수 있습니다.
학습 위성 이미지의 해상도는 0.5m/픽셀이며, 추론 위성 이미지의 해상도는 공개하지 않습니다.


### test.csv [파일]
img_id : 추론 위성 이미지 샘플 ID
img_path : 추론 위성 이미지 경로 (상대 경로)


### sample_submission.csv [파일] - 제출 양식
img_id : 추론 위성 이미지 샘플 ID
mask_rle : RLE 인코딩된 예측 이진마스크(0: 배경, 1 : 건물) 정보
단, 예측 결과에 건물이 없는 경우 반드시 -1 처리

In [None]:
# Mount Google Drive so the dataset folder is reachable from the Colab runtime.
from google.colab import drive

drive.mount("/content/gdrive")
path = "/content/gdrive/My Drive/ai_dataset"

# List the entries of the dataset folder and keep a separate copy of the list.
file_list = os.listdir(path)
file_list_py = list(file_list)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Training-time augmentation pipeline.
# The imgaug-backed (IAA*) transforms and the separate RandomBrightness /
# RandomContrast / Flip transforms are deprecated or removed in current
# albumentations releases, so their maintained replacements are used:
#   OneOf([IAAAdditiveGaussianNoise, GaussNoise], p=0.5) -> GaussNoise(p=0.5)
#   RandomContrast + RandomBrightness                    -> RandomBrightnessContrast
#   IAAEmboss                                            -> Emboss
#   Flip                                                 -> HorizontalFlip + VerticalFlip
transform = A.Compose(
    [
        A.Resize(224, 224),
        A.RandomRotate90(),
        A.HorizontalFlip(),
        A.VerticalFlip(),
        A.GaussNoise(p=0.5),
        A.HueSaturationValue(),
        A.CLAHE(),
        A.OpticalDistortion(),
        A.RandomBrightnessContrast(),
        A.Emboss(),
        A.MotionBlur(),
        A.Normalize(),
        ToTensorV2(),
    ]
)

dataset = SatelliteDataset(csv_file='/content/gdrive/MyDrive/ai_dataset/train.csv', transform=transform)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)



In [None]:
from sklearn.model_selection import train_test_split

# Sequential extraction


# **Define Model**

In [None]:
# DeepLabV3+ architecture definition
class DeepLabV3Plus(nn.Module):
    '''
    DeepLabV3+-style segmentation network on a ResNet-101 backbone.

    num_classes: number of output channels of the final 1x1 logit layer
    forward(x) returns raw logits of shape (batch, num_classes, H, W),
    where H and W are the spatial size of the input batch.
    '''
    def __init__(self, num_classes):
        super().__init__()
        # TODO: switch the backbone to Xception if feasible.
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=True` - confirm against the installed version.
        self.backbone = models.resnet101(pretrained=True)
        self.asspp = ASSPP()  # Atrous spatial pyramid with separable convs
        self.decoder = Decoder()  # Fuses ASPP output with low-level features
        # Final layer: maps the 256-channel decoder output to one logit
        # channel per class.
        self.logits = nn.Conv2d(256, num_classes, kernel_size=1)

    def forward(self, x):
        input_size = x.shape[-2:]

        # Backbone stem
        x = self.backbone.conv1(x)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)

        # Residual stages. layer1's output is kept as the low-level feature
        # map the decoder fuses with the ASPP output (256 channels for
        # ResNet-101, matching Decoder's default low_level_channels).
        low_level = self.backbone.layer1(x)
        x = self.backbone.layer2(low_level)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)

        # Segmentation head. Only the fused ASPP output (second element) is
        # the encoder feature; the first element is a single ASPP branch.
        _, x = self.asspp(x)
        x = self.decoder(x, low_level)
        x = self.logits(x)

        # Bring the logits back to the input resolution so they line up
        # pixel-for-pixel with the target masks.
        return F.interpolate(x, size=input_size, mode='bilinear', align_corners=False)

class ASSPP(nn.Module):  # Atrous Separable Spatial Pyramid Pooling
    '''
    Runs one 1x1 convolution and four 3x3 atrous convolutions (rates 1, 6,
    12, 18) in parallel over the same input, concatenates the results along
    the channel axis and fuses them with a 1x1 -> depthwise 3x3 -> 1x1
    convolution stack followed by ReLU.

    in_channels: channels of the incoming feature map
    out_channels: channels produced by every branch and by the fused output
    forward(x) returns (feat1, out): the 1x1 branch output and the fused
    feature map. Both keep the spatial size of x.
    '''
    def __init__(self, in_channels=2048, out_channels=256):
        super().__init__()
        dilations = [1, 6, 12, 18]  # atrous rates of the 3x3 branches

        # For a 3x3 kernel with stride 1, padding == dilation keeps the
        # spatial size unchanged, so all branches can be concatenated
        # directly without any resizing.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1)  # 1x1 conv
        self.conv2 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=dilations[0], dilation=dilations[0])  # 3x3 conv, rate 1
        self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=dilations[1], dilation=dilations[1])  # 3x3 conv, rate 6
        self.conv4 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=dilations[2], dilation=dilations[2])  # 3x3 conv, rate 12
        self.conv5 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=dilations[3], dilation=dilations[3])  # 3x3 conv, rate 18

        # Depthwise-separable fusion of the concatenated branches
        self.sep_conv1 = nn.Conv2d(out_channels * 5, out_channels, kernel_size=1)  # reduce the concatenated channels
        self.sep_conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, groups=out_channels)  # one 3x3 filter per channel
        self.sep_conv3 = nn.Conv2d(out_channels, out_channels, kernel_size=1)  # pointwise mixing across channels

        self.relu = nn.ReLU()

    def forward(self, x):
        feat1 = self.conv1(x)
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)

        # All branches share the input's spatial size; join them channel-wise.
        out = torch.cat([feat1, feat2, feat3, feat4, feat5], dim=1)

        out = self.sep_conv1(out)
        out = self.sep_conv2(out)
        out = self.sep_conv3(out)
        out = self.relu(out)
        return feat1, out

class Decoder(nn.Module):
    '''
    Fuses an encoder feature map with a second ("low-level") feature map.

    in_channels: channels of the encoder feature map `x`
    out_channels: channels of the decoder output
    low_level_channels: channels of `low_level_features`
    forward(x, low_level_features) returns a feature map with out_channels
    channels and 4x the spatial size of x.
    '''
    def __init__(self, in_channels=256, out_channels=256, low_level_channels=256):
        super().__init__()
        fused_channels = out_channels + low_level_channels
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.conv2 = nn.Conv2d(fused_channels, out_channels, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()

    def forward(self, x, low_level_features):
        # Project the encoder features, then upsample them by a factor of 4.
        upsampled = F.interpolate(
            self.relu(self.conv1(x)),
            scale_factor=4,
            mode='bilinear',
            align_corners=False,
        )

        # Resize the low-level features to the upsampled map's spatial size
        # so the two can be concatenated along the channel axis.
        target_hw = (upsampled.size(2), upsampled.size(3))
        resized_low = F.adaptive_avg_pool2d(low_level_features, target_hw)

        fused = torch.cat([upsampled, resized_low], dim=1)
        fused = self.relu(self.conv2(fused))
        return self.relu(self.conv3(fused))

# **Model Train**

In [None]:
# Initialise the model. Building segmentation is a binary task, so the
# network emits a single logit channel: BCEWithLogitsLoss needs the logits
# and the (batch, 1, H, W) target to have the same shape, and the inference
# code squeezes exactly one channel.
model = DeepLabV3Plus(1).to(device)

# Define the loss function and the optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):  # train for 10 epochs
    model.train()
    epoch_loss = 0
    for images, masks in tqdm(dataloader):
        images = images.float().to(device)
        # Add the channel axis so the target is (batch, 1, H, W).
        masks = masks.float().to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(images)

        # Guard against a model that predicts at a different resolution
        # than the mask: the loss requires matching spatial sizes.
        if outputs.shape[-2:] != masks.shape[-2:]:
            outputs = F.interpolate(
                outputs, size=masks.shape[-2:], mode='bilinear', align_corners=False
            )

        loss = criterion(outputs, masks)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {epoch_loss/len(dataloader)}')



OutOfMemoryError: ignored

# **Inference**

In [None]:
# Inference must be deterministic and must keep the image geometry: random
# rotations / flips would produce masks that no longer line up with the
# submitted image, so only the resize, normalisation and tensor conversion
# of the training pipeline are applied here.
test_transform = A.Compose(
    [
        A.Resize(224, 224),
        A.Normalize(),
        ToTensorV2(),
    ]
)

test_dataset = SatelliteDataset(csv_file='/content/gdrive/MyDrive/ai_dataset/test.csv', transform=test_transform, infer=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4)

In [None]:
# Run the model over the test set and collect one RLE string (or -1) per image.
model.eval()
result = []
with torch.no_grad():
    for images in tqdm(test_dataloader):
        images = images.float().to(device)

        outputs = model(images)
        # Encode masks on the same pixel grid as the network input, even
        # when the model predicts at a different resolution.
        if outputs.shape[-2:] != images.shape[-2:]:
            outputs = F.interpolate(
                outputs, size=images.shape[-2:], mode='bilinear', align_corners=False
            )

        masks = torch.sigmoid(outputs).cpu().numpy()
        masks = np.squeeze(masks, axis=1)  # drop the single class channel
        masks = (masks > 0.35).astype(np.uint8)  # Threshold = 0.35

        for mask in masks:
            mask_rle = rle_encode(mask)
            # An empty RLE string means no building pixel was predicted;
            # the submission format requires -1 in that case.
            result.append(mask_rle if mask_rle else -1)

# **Submission**

In [None]:
# Load the sample submission file and overwrite its 'mask_rle' column with
# the values collected in `result`.
submit = pd.read_csv('/content/gdrive/MyDrive/ai_dataset/sample_submission.csv')
submit['mask_rle'] = result

In [None]:
# Write the submission as CSV, leaving out the DataFrame index column.
submit.to_csv('/content/gdrive/MyDrive/ai_dataset/submit.csv', index=False)