In [1]:
import os
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torchvision
from tqdm import tqdm

In [2]:
# Dataset for image + mask pairs
class MaskedSegDataset(Dataset):
    def __init__(self, root, img_folder="images", mask_folder="binary_mask",
                 transform=None, mask_transform=None):

        self.img_dir = os.path.join(root, img_folder)
        self.mask_dir = os.path.join(root, mask_folder)
        self.transform = transform
        self.mask_transform = mask_transform

        # list image files
        self.images = [
            f for f in os.listdir(self.img_dir)
            if f.lower().endswith((".jpg", ".jpeg", ".png"))
        ]

        print(f"Found {len(self.images)} images.")

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_name = self.images[idx]
        stem = os.path.splitext(img_name)[0]

        img_path = os.path.join(self.img_dir, img_name)
        mask_path = os.path.join(self.mask_dir, stem + ".png")

        image = Image.open(img_path).convert("RGB")
        mask = Image.open(mask_path)
        mask = np.array(mask, dtype=np.int64)

        if self.transform:
            image = self.transform(image)

        if self.mask_transform:
            mask_pil = Image.fromarray(mask.astype(np.uint8), mode="L")
            mask_pil = self.mask_transform(mask_pil)
            mask = np.array(mask_pil, dtype=np.int64)

        mask = torch.tensor(mask, dtype=torch.long)

        return image, mask



In [7]:
# Training Script
def main():

    #path
    dataset_path = r"C:\Users\dht233\OneDrive - University of Texas at San Antonio\NSF\Housing condition\image segmatation\dataset\translated_data"

    #  HYPERPARAMS 
    IMG_SIZE = 512
    BATCH_SIZE = 2
    NUM_EPOCHS = 10
    LR = 1e-4
    NUM_CLASSES = 10  # 0–9

    # DEVICE 
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    # ---------- TRANSFORMS ----------
    img_transform = T.Compose([
        T.Resize((IMG_SIZE, IMG_SIZE)),
        T.RandomHorizontalFlip(),
        T.RandomRotation(10),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225]),
    ])

    mask_transform = T.Compose([
        T.Resize((IMG_SIZE, IMG_SIZE), interpolation=Image.NEAREST)
    ])

    #  DATASET 
    train_dataset = MaskedSegDataset(
        dataset_path,
        img_folder="images",
        mask_folder="binary_mask",
        transform=img_transform,
        mask_transform=mask_transform
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0
    )

    # MODEL
    model = torchvision.models.segmentation.deeplabv3_resnet50(
        weights=None, num_classes=NUM_CLASSES
    ).to(device)

    #LOSS + OPT 
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    # TRAIN 
    for epoch in range(NUM_EPOCHS):
        model.train()
        running_loss = 0.0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")

        for images, masks in pbar:
            images = images.to(device)
            masks = masks.to(device)

            optimizer.zero_grad()
            outputs = model(images)["out"]

            loss = criterion(outputs, masks)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)

            pbar.set_postfix({"loss": loss.item()})

        epoch_loss = running_loss / len(train_dataset)
        print(f"Epoch {epoch+1} Loss: {epoch_loss:.4f}")

    torch.save(model.state_dict(), "seg_model.pth")
    print("Model saved as seg_model.pth")

In [8]:
if __name__ == "__main__":
    main()

Using device: cuda
Found 642 images.


Epoch 1/10: 100%|██████████| 321/321 [03:25<00:00,  1.56it/s, loss=0.941]


Epoch 1 Loss: 1.1871


Epoch 2/10: 100%|██████████| 321/321 [03:32<00:00,  1.51it/s, loss=0.699]


Epoch 2 Loss: 0.9048


Epoch 3/10: 100%|██████████| 321/321 [03:32<00:00,  1.51it/s, loss=0.798]


Epoch 3 Loss: 0.8627


Epoch 4/10: 100%|██████████| 321/321 [03:24<00:00,  1.57it/s, loss=1.04] 


Epoch 4 Loss: 0.8345


Epoch 5/10: 100%|██████████| 321/321 [03:32<00:00,  1.51it/s, loss=1.11] 


Epoch 5 Loss: 0.8025


Epoch 6/10: 100%|██████████| 321/321 [03:29<00:00,  1.53it/s, loss=0.714]


Epoch 6 Loss: 0.7839


Epoch 7/10: 100%|██████████| 321/321 [03:28<00:00,  1.54it/s, loss=0.662]


Epoch 7 Loss: 0.7823


Epoch 8/10: 100%|██████████| 321/321 [03:32<00:00,  1.51it/s, loss=1.18] 


Epoch 8 Loss: 0.7543


Epoch 9/10: 100%|██████████| 321/321 [03:32<00:00,  1.51it/s, loss=0.602]


Epoch 9 Loss: 0.7324


Epoch 10/10: 100%|██████████| 321/321 [03:29<00:00,  1.53it/s, loss=0.414]


Epoch 10 Loss: 0.7092
Model saved as seg_model.pth


###Inference

In [1]:
#pth file should be in the same folder as inference.py
import os
import torch
import torchvision.transforms as T
import torchvision
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# PARAMETERS 
device = "cpu"
MODEL_PATH = "seg_model.pth"
IMG_SIZE = 512
NUM_CLASSES = 10  # segmentation classes (0-9)

# YOUR FOLDER WITH IMAGES
TEST_FOLDER = r"C:\Users\dht233\OneDrive - University of Texas at San Antonio\NSF\Housing condition\image segmatation\myima"

#  LOAD MODEL 
model = torchvision.models.segmentation.deeplabv3_resnet50(
    weights=None, num_classes=NUM_CLASSES
)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.to(device)
model.eval()

# TRANSFORMS 
transform = T.Compose([
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]),
])

#COLOR MAP (your colors)
colors = {
    0: [0, 0, 0],
    1: [70, 70, 70],
    2: [250, 170, 30],
    3: [70, 130, 180],
    4: [0, 60, 100],
    5: [153, 153, 153],
    6: [107, 142, 35],
    7: [255, 0, 0],
    8: [0, 0, 142],
    9: [220, 220, 0],
}


def colorize_mask(mask):
    h, w = mask.shape
    color_mask = np.zeros((h, w, 3), dtype=np.uint8)
    for cls, color in colors.items():
        color_mask[mask == cls] = color
    return color_mask

# PREDICT FUNCTION 
def predict(img_path):
    img = Image.open(img_path).convert("RGB")
    x = transform(img).unsqueeze(0).to(device)

    with torch.no_grad():
        out = model(x)["out"]
        pred = torch.argmax(out.squeeze(), dim=0).cpu().numpy()

    return img, pred
# RUN INFERENCE ON ENTIRE FOLDER 
def run_inference():
    # get all images in folder
    imgs = [os.path.join(TEST_FOLDER, f)
            for f in os.listdir(TEST_FOLDER)
            if f.lower().endswith((".jpg", ".png", ".jpeg"))]

    print("Found", len(imgs), "images in folder.")

    if len(imgs) == 0:
        print("No images found. Check folder path.")
        return

    # only infer one image, prevent kernel crash
    img_path = imgs[0]
    print("Predicting:", img_path)

    img, pred = predict(img_path)
    c_pred = colorize_mask(pred)

    plt.figure(figsize=(12, 4))

    plt.subplot(1, 3, 1)
    plt.title("Image")
    plt.imshow(img)
    plt.axis("off")

    plt.subplot(1, 3, 2)
    plt.title("Mask (0-9)")
    plt.imshow(pred, cmap="gray")
    plt.axis("off")

    plt.subplot(1, 3, 3)
    plt.title("Color Mask")
    plt.imshow(c_pred)
    plt.axis("off")

    plt.show()


In [None]:
if __name__ == "__main__":
    run_inference()

Found 3 images in folder.
Predicting: C:\Users\dht233\OneDrive - University of Texas at San Antonio\NSF\Housing condition\image segmatation\myima\frontview_pic_20250627093308 (1).jpg
