In [1]:
import os, glob
from PIL import Image
import numpy as np
import torch
from torchvision import transforms


root = r"D:/datasets/PennFudanPed"  
img_dir = os.path.join(root, "PNGImages")
mask_dir = os.path.join(root, "PedMasks")

# === Hàm xử lý mặt nạ để lấy hộp bao và mask ===
def load_target(mask_p):
    mask = np.array(Image.open(mask_p))
    obj_ids = np.unique(mask)[1:]  # loại bỏ background = 0
    masks = (mask[..., None] == obj_ids).astype(np.uint8).transpose(2,0,1)
    boxes = []
    for m in masks:
        pos = np.argwhere(m)
        y1, x1 = pos.min(0)
        y2, x2 = pos.max(0)
        boxes.append([x1, y1, x2, y2])
    boxes = torch.as_tensor(boxes, dtype=torch.float32)
    labels = torch.ones((len(boxes),), dtype=torch.int64)  # class=1 (person)
    masks = torch.as_tensor(masks, dtype=torch.uint8)
    return boxes, labels, masks

# === Tạo thư mục crops để lưu ảnh cắt 64x64 dùng cho CNN/AE/GAN ===
crop_dir = os.path.join(root, "crops64")
os.makedirs(crop_dir, exist_ok=True)

to_tensor = transforms.ToTensor()
resize64 = transforms.Resize((64,64), interpolation=transforms.InterpolationMode.BILINEAR)

# === Duyệt qua từng ảnh và cắt người theo mask ===
for img_p in glob.glob(os.path.join(img_dir, "*.png")):
    base = os.path.basename(img_p).replace(".png", "")
    mask_p = os.path.join(mask_dir, base + "_mask.png")
    if not os.path.exists(mask_p):
        continue
    img = Image.open(img_p).convert("RGB")
    boxes, _, _ = load_target(mask_p)
    for i, b in enumerate(boxes):
        x1, y1, x2, y2 = map(int, b.tolist())
        crop = img.crop((x1, y1, x2, y2))
        crop = resize64(crop)
        crop.save(os.path.join(crop_dir, f"{base}_{i}.png"))

print(f"✅ Đã tạo ảnh cắt trong thư mục: {crop_dir}")


✅ Đã tạo ảnh cắt trong thư mục: D:/datasets/PennFudanPed\crops64


In [2]:
from torch.utils.data import Dataset, DataLoader
from torchvision import models

class PedCropDataset(Dataset):
    def __init__(self, folder):
        self.paths = sorted(glob.glob(os.path.join(folder, "*.png")))
        self.tf = transforms.Compose([transforms.ToTensor()])
    def __len__(self): return len(self.paths)
    def __getitem__(self, i):
        x = self.tf(Image.open(self.paths[i]).convert("RGB"))
        y = 1  # person
        return x, y

ds = PedCropDataset(crop_dir)
n = len(ds); n_train = int(0.8*n)
train_ds, val_ds = torch.utils.data.random_split(ds, [n_train, n-n_train])
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl   = DataLoader(val_ds, batch_size=32)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = models.resnet18(weights=None, num_classes=2).to(device)

import torch.nn.functional as F
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(3):
    model.train()
    for xb,yb in train_dl:
        xb,yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = F.cross_entropy(logits, yb)
        opt.zero_grad(); loss.backward(); opt.step()
    model.eval()
    with torch.no_grad():
        tot,correct = 0,0
        for xb,yb in val_dl:
            xb,yb = xb.to(device), yb.to(device)
            pred = model(xb).argmax(1)
            tot += yb.numel(); correct += (pred==yb).sum().item()
    print(f"Epoch {epoch+1}: val acc={correct/tot:.3f}")


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [None]:
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection import fasterrcnn_resnet50_fpn

class PennFudanDet(Dataset):
    def __init__(self, img_dir, mask_dir, train=True):
        self.imgs = sorted(glob.glob(os.path.join(img_dir,"*.png")))
        self.mask_dir = mask_dir
        self.train = train
        self.tf = transforms.ToTensor()
    def __len__(self): return len(self.imgs)
    def __getitem__(self, i):
        img_p = self.imgs[i]
        base = os.path.basename(img_p).replace(".png","")
        mask_p = os.path.join(self.mask_dir, base+"_mask.png")
        img = Image.open(img_p).convert("RGB")
        boxes, labels, masks = load_target(mask_p)
        return self.tf(img), {"boxes": boxes, "labels": labels}

full = PennFudanDet(img_dir, mask_dir)
n = len(full); n_train = int(0.8*n)
train_ds, val_ds = torch.utils.data.random_split(full, [n_train, n-n_train])

def collate(batch): 
    imgs, targets = zip(*batch)
    return list(imgs), list(targets)

train_dl = DataLoader(train_ds, batch_size=2, shuffle=True, collate_fn=collate)
val_dl   = DataLoader(val_ds, batch_size=2, collate_fn=collate)

det_model = fasterrcnn_resnet50_fpn(weights="DEFAULT")
in_features = det_model.roi_heads.box_predictor.cls_score.in_features
det_model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, 2)
det_model = det_model.to(device)
opt = torch.optim.SGD([p for p in det_model.parameters() if p.requires_grad], lr=0.005, momentum=0.9, weight_decay=1e-4)

for epoch in range(2):
    det_model.train()
    for imgs, targets in train_dl:
        imgs = [im.to(device) for im in imgs]
        targets = [{k:v.to(device) for k,v in t.items()} for t in targets]
        loss_dict = det_model(imgs, targets)
        loss = sum(loss_dict.values())
        opt.zero_grad(); loss.backward(); opt.step()
    print(f"Epoch {epoch+1}: train loss={loss.item():.3f}")


In [None]:
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

class PennFudanSeg(PennFudanDet):
    def __getitem__(self, i):
        img_p = self.imgs[i]
        base = os.path.basename(img_p).replace(".png","")
        mask_p = os.path.join(self.mask_dir, base+"_mask.png")
        img = Image.open(img_p).convert("RGB")
        boxes, labels, masks = load_target(mask_p)
        return transforms.ToTensor()(img), {"boxes": boxes, "labels": labels, "masks": masks}

train_ds_seg, val_ds_seg = torch.utils.data.random_split(PennFudanSeg(img_dir, mask_dir), [n_train, n-n_train])
train_dl_seg = DataLoader(train_ds_seg, batch_size=2, shuffle=True, collate_fn=collate)

seg_model = maskrcnn_resnet50_fpn(weights="DEFAULT")
# replace the mask head for 2 classes (background + person)
in_features_mask = seg_model.roi_heads.mask_predictor.conv5_mask.in_channels
hidden = 256
seg_model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden, 2)
# replace box predictor too:
in_features = seg_model.roi_heads.box_predictor.cls_score.in_features
seg_model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, 2)
seg_model = seg_model.to(device)

opt = torch.optim.SGD([p for p in seg_model.parameters() if p.requires_grad], lr=0.005, momentum=0.9, weight_decay=1e-4)

for epoch in range(2):
    seg_model.train()
    for imgs, targets in train_dl_seg:
        imgs = [im.to(device) for im in imgs]
        targets = [{k:v.to(device) for k,v in t.items()} for t in targets]
        loss_dict = seg_model(imgs, targets)
        loss = sum(loss_dict.values())
        opt.zero_grad(); loss.backward(); opt.step()
    print(f"[Mask R-CNN] Epoch {epoch+1}: train loss={loss.item():.3f}")


In [None]:
import torch.nn as nn
from torch.utils.data import DataLoader

class CropOnly(Dataset):
    def __init__(self, folder):
        self.paths = sorted(glob.glob(os.path.join(folder, "*.png")))
        self.tf = transforms.Compose([transforms.ToTensor()])
    def __len__(self): return len(self.paths)
    def __getitem__(self, i):
        return self.tf(Image.open(self.paths[i]).convert("RGB"))

ae_ds = CropOnly(crop_dir)
ae_dl = DataLoader(ae_ds, batch_size=64, shuffle=True)

class SmallAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.enc = nn.Sequential(
            nn.Conv2d(3,32,4,2,1), nn.ReLU(),
            nn.Conv2d(32,64,4,2,1), nn.ReLU(),
            nn.Conv2d(64,128,4,2,1), nn.ReLU(),
        )
        self.dec = nn.Sequential(
            nn.ConvTranspose2d(128,64,4,2,1), nn.ReLU(),
            nn.ConvTranspose2d(64,32,4,2,1), nn.ReLU(),
            nn.ConvTranspose2d(32,3,4,2,1), nn.Sigmoid(),
        )
    def forward(self,x): return self.dec(self.enc(x))

ae = SmallAE().to(device)
opt = torch.optim.Adam(ae.parameters(), lr=1e-3)
for epoch in range(3):
    ae.train()
    tot=0
    for xb in ae_dl:
        xb = xb.to(device)
        recon = ae(xb)
        loss = ((recon - xb)**2).mean()
        opt.zero_grad(); loss.backward(); opt.step()
        tot += loss.item()*xb.size(0)
    print(f"AE epoch {epoch+1}: MSE={tot/len(ae_ds):.4f}")
