In [11]:
import os
import xml.etree.ElementTree as ET
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torch.optim as optim
import pytesseract
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

In [12]:
class ContainerDataset(Dataset):
    """Detection dataset pairing images with XML bounding-box annotations.

    Each image in ``img_dir`` is paired with the annotation file in
    ``annotations_dir`` that has the same file stem (``foo.jpg`` with
    ``foo.xml``), so the pairing does not depend on both directories listing
    in the same order or holding exactly the same files.

    If the stems do not all match but both directories hold the same number
    of files, the dataset falls back to pairing the two sorted listings by
    position (the historical behaviour of this class).

    Args:
        img_dir: Directory containing ``.jpg`` / ``.jpeg`` / ``.png`` images.
        annotations_dir: Directory containing the ``.xml`` annotation files.
        transform: Optional callable applied to the PIL image only. The boxes
            are not transformed, so it must not change the image geometry.

    Raises:
        ValueError: If images cannot be paired with annotations either by
            stem or by position.
    """

    def __init__(self, img_dir, annotations_dir, transform=None):
        self.img_dir = img_dir
        self.annotations_dir = annotations_dir
        self.transform = transform
        self.imgs = [f for f in sorted(os.listdir(img_dir)) if f.endswith(('.jpg', '.jpeg', '.png'))]

        xml_files = [f for f in sorted(os.listdir(annotations_dir)) if f.endswith('.xml')]
        xml_by_stem = {os.path.splitext(f)[0]: f for f in xml_files}
        img_stems = [os.path.splitext(f)[0] for f in self.imgs]
        unmatched = [f for f, stem in zip(self.imgs, img_stems) if stem not in xml_by_stem]

        if not unmatched:
            # Preferred: explicit pairing by file stem; extra XML files are ignored.
            self.annotations = [xml_by_stem[stem] for stem in img_stems]
        elif len(xml_files) == len(self.imgs):
            # Legacy fallback: names differ but counts agree, pair by sorted position.
            self.annotations = xml_files
        else:
            raise ValueError(
                f"Cannot pair images with annotations: {len(self.imgs)} images vs "
                f"{len(xml_files)} XML files, and no same-named XML for e.g. {unmatched[:5]}"
            )

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.imgs)

    @staticmethod
    def _read_boxes(annotation_path):
        """Parse every ``object/bndbox`` of an annotation file into an ``(N, 4)`` float32 tensor.

        Columns are ``xmin, ymin, xmax, ymax``. A file without any ``object``
        element yields an empty tensor of shape ``(0, 4)`` so that column
        indexing still works downstream.
        """
        root = ET.parse(annotation_path).getroot()
        boxes = []
        for member in root.findall('object'):
            bndbox = member.find('bndbox')
            # float() accepts both "12" and "12.0"; the tensor is float32 anyway.
            boxes.append([float(bndbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
        # reshape keeps the second dimension at 4 even when there are no boxes.
        return torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        ``target`` is a dict with keys ``boxes`` (N, 4), ``labels`` (all ones,
        the single "container" class), ``image_id``, ``area`` and ``iscrowd``
        (all zeros).
        """
        img_path = os.path.join(self.img_dir, self.imgs[idx])
        annotation_path = os.path.join(self.annotations_dir, self.annotations[idx])

        img = Image.open(img_path).convert("RGB")
        boxes = self._read_boxes(annotation_path)
        num_objs = boxes.shape[0]

        target = {}
        target["boxes"] = boxes
        target["labels"] = torch.ones((num_objs,), dtype=torch.int64)  # only one class (container)
        target["image_id"] = torch.tensor([idx])
        target["area"] = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        target["iscrowd"] = torch.zeros((num_objs,), dtype=torch.int64)

        if self.transform:
            img = self.transform(img)

        return img, target

In [13]:
# Only PIL -> tensor conversion is applied here; the model printout further down
# shows the detector carrying its own Normalize/Resize transform.
transform = transforms.Compose([transforms.ToTensor()])

train_dataset = ContainerDataset(
    img_dir="D:/test3/container_dataset/train/images",
    annotations_dir="D:/test3/container_dataset/train_xml",
    transform=transform,
)
val_dataset = ContainerDataset(
    img_dir="D:/test3/container_dataset/val/images",
    annotations_dir="D:/test3/container_dataset/val_xml",
    transform=transform,
)

# Each sample is an (image, target) pair; the collate function regroups a batch
# into (images, targets) tuples instead of stacking them into one tensor.
train_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=lambda batch: tuple(zip(*batch)),
)
val_loader = DataLoader(
    val_dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=lambda batch: tuple(zip(*batch)),
)

In [14]:
def get_model_instance_segmentation(num_classes):
    """Build a pretrained Faster R-CNN (ResNet-50 FPN) with a fresh box predictor.

    Despite the name, this returns a box detector, not an instance-segmentation
    model: only the box-classification/regression head is replaced.

    Args:
        num_classes: Number of outputs of the new box predictor. The notebook
            passes 2 for its single "container" class labelled 1 -- presumably
            index 0 is the background class; confirm against torchvision docs.

    Returns:
        The torchvision detection model with its ``roi_heads.box_predictor``
        swapped for a new ``FastRCNNPredictor``.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    heads = detector.roi_heads
    # Reuse the input width of the existing classifier so the new head fits the backbone features.
    heads.box_predictor = FastRCNNPredictor(heads.box_predictor.cls_score.in_features, num_classes)
    return detector

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = get_model_instance_segmentation(num_classes=2)
# Kept as the last expression so the notebook still displays the module summary.
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [15]:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    """Run one optimisation pass over ``data_loader`` and print the loss.

    Args:
        model: Module that, in train mode, returns a dict of loss tensors when
            called as ``model(images, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        data_loader: Iterable yielding ``(images, targets)`` batches, where
            ``targets`` is a sequence of dicts of tensors.
        device: Device the images and target tensors are moved to.
        epoch: Epoch number, used only in the log messages.
        print_freq: Log the batch loss every ``print_freq`` iterations;
            ``0`` disables the per-iteration log.

    Returns:
        None. Progress is reported via ``print``.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0
    for i, (images, targets) in enumerate(data_loader):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        # The individual loss terms are summed into one scalar to backpropagate.
        losses = sum(loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_loss = losses.item()
        running_loss += batch_loss
        num_batches += 1
        # Guard against print_freq == 0, which would otherwise raise on the modulo.
        if print_freq and i % print_freq == 0:
            print(f"Epoch [{epoch}], Iteration [{i}], Loss: {batch_loss:.4f}")

    if num_batches == 0:
        # Nothing was iterated: report it instead of dividing by zero.
        print(f"Epoch [{epoch}] completed. No batches were processed.")
        return

    # Average over the batches actually seen, so loaders without __len__ work too.
    print(f"Epoch [{epoch}] completed. Average Loss: {running_loss / num_batches:.4f}")

def compute_iou(box, boxes):
    """Intersection-over-union of one box against a set of boxes.

    Args:
        box: Sequence of four numbers ``[x1, y1, x2, y2]``.
        boxes: Array-like of shape ``(N, 4)`` in the same corner format. A
            single flat box or an empty input is also accepted.

    Returns:
        1-D NumPy array of length ``N`` holding the IoU with each box. Pairs
        whose union area is not positive (degenerate boxes) get an IoU of 0
        instead of NaN.
    """
    box = np.asarray(box)
    boxes = np.asarray(boxes).reshape(-1, 4)

    # Corners of the overlap rectangle between `box` and every row of `boxes`.
    x1 = np.maximum(box[0], boxes[:, 0])
    y1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[2], boxes[:, 2])
    y2 = np.minimum(box[3], boxes[:, 3])

    # Negative extents mean "no overlap" and are clamped to zero.
    inter_area = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
    box_area = (box[2] - box[0]) * (box[3] - box[1])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union_area = box_area + boxes_area - inter_area

    # Divide only where the union is positive; everything else stays at 0.
    iou = np.zeros(union_area.shape, dtype=np.result_type(union_area.dtype, np.float32))
    np.divide(inter_area, union_area, out=iou, where=union_area > 0)
    return iou

def evaluate(model, data_loader, device, score_threshold=0.5, iou_threshold=0.5):
    """Compute detection precision, recall, F1 and a per-image mean AP.

    Predictions scoring above ``score_threshold`` are greedily matched, in
    descending score order, to the not-yet-matched ground-truth box with the
    highest IoU. A match needs IoU above ``iou_threshold``; each ground-truth
    box can be matched once. Unmatched predictions are false positives and
    unmatched ground-truth boxes are false negatives.

    Args:
        model: Detection model returning, in eval mode, one dict per image
            with ``'boxes'`` and ``'scores'`` tensors.
        data_loader: Iterable yielding ``(images, targets)`` batches; each
            target is a dict with a ``'boxes'`` tensor.
        device: Device the images are moved to before inference.
        score_threshold: Minimum (exclusive) score for a prediction to count
            in precision/recall/F1. Defaults to the previous hard-coded 0.5.
        iou_threshold: Minimum (exclusive) IoU for a match, also forwarded to
            ``calculate_ap``. Defaults to the previous hard-coded 0.5.

    Returns:
        Tuple ``(precision, recall, f1_score, mAP)``. ``mAP`` is the mean of
        the APs computed separately for every image (not a dataset-level AP);
        it is 0.0 when no image was evaluated, and a non-finite per-image AP
        is counted as 0.0 so a single image cannot turn the mean into NaN.
    """
    model.eval()
    true_positive = 0
    false_positive = 0
    false_negative = 0

    # (ground-truth boxes, predicted boxes, scores) of every evaluated image.
    per_image = []

    with torch.no_grad():
        for images, targets in data_loader:
            images = [img.to(device) for img in images]
            outputs = model(images)

            for target, output in zip(targets, outputs):
                true_boxes = target['boxes'].cpu().numpy().reshape(-1, 4)
                pred_boxes = output['boxes'].cpu().numpy().reshape(-1, 4)
                pred_scores = output['scores'].cpu().numpy()
                per_image.append((true_boxes, pred_boxes, pred_scores))

                # Match the most confident predictions first, regardless of output order.
                remaining = true_boxes
                for j in np.argsort(-pred_scores, kind='stable'):
                    if pred_scores[j] <= score_threshold:  # confidence threshold
                        continue
                    if len(remaining) == 0:
                        false_positive += 1
                        continue
                    ious = compute_iou(pred_boxes[j], remaining)
                    best = int(np.argmax(ious))
                    if ious[best] > iou_threshold:  # IoU threshold
                        true_positive += 1
                        # Drop the matched box so it cannot be claimed twice.
                        remaining = np.delete(remaining, best, axis=0)
                    else:
                        false_positive += 1

                false_negative += len(remaining)

    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Mean of the per-image average precisions.
    average_precisions = []
    for true_boxes, pred_boxes, scores in per_image:
        ap = calculate_ap(true_boxes, pred_boxes, scores, iou_threshold=iou_threshold)
        average_precisions.append(ap if np.isfinite(ap) else 0.0)
    mAP = float(np.mean(average_precisions)) if average_precisions else 0.0

    return precision, recall, f1_score, mAP

def calculate_ap(true_boxes, pred_boxes, scores, iou_threshold=0.5):
    """Average precision of one image's predictions against its ground truth.

    Predictions are visited in descending score order. Each is assigned to the
    ground-truth box with the highest IoU and counts as a true positive only
    if that IoU exceeds ``iou_threshold`` and the box has not been matched
    before; otherwise it is a false positive. AP is the area under the
    precision envelope over recall.

    Args:
        true_boxes: Array-like ``(G, 4)`` of ground-truth boxes.
        pred_boxes: Array-like ``(P, 4)`` of predicted boxes.
        scores: Array-like ``(P,)`` of prediction scores.
        iou_threshold: Minimum (exclusive) IoU for a match.

    Returns:
        The AP as a float. AP is undefined without ground truth, so 0.0 is
        returned when ``true_boxes`` is empty (previously this divided by zero
        and produced NaN). It is also 0.0 when there are no predictions.
    """
    true_boxes = np.asarray(true_boxes).reshape(-1, 4)
    pred_boxes = np.asarray(pred_boxes).reshape(-1, 4)
    scores = np.asarray(scores).reshape(-1)

    num_gt = len(true_boxes)
    if num_gt == 0:
        # Recall has no denominator; return a finite value instead of NaN.
        return 0.0

    pred_boxes = pred_boxes[np.argsort(-scores)]

    matched = np.zeros(num_gt, dtype=bool)
    tp = np.zeros(len(pred_boxes), dtype=bool)
    for i, pred_box in enumerate(pred_boxes):
        ious = compute_iou(pred_box, true_boxes)
        max_iou_idx = np.argmax(ious)
        if ious[max_iou_idx] > iou_threshold and not matched[max_iou_idx]:
            tp[i] = True
            matched[max_iou_idx] = True
    # Every prediction is exactly one of TP / FP.
    fp = ~tp

    tp_cum = np.cumsum(tp)
    fp_cum = np.cumsum(fp)
    recalls = tp_cum / num_gt
    precisions = tp_cum / (tp_cum + fp_cum)

    # Sentinels so the curve spans recall 0..1, then make precision non-increasing.
    precisions = np.concatenate(([1.0], precisions, [0.0]))
    recalls = np.concatenate(([0.0], recalls, [1.0]))
    for i in range(len(precisions) - 1, 0, -1):
        precisions[i - 1] = np.maximum(precisions[i - 1], precisions[i])

    # Sum rectangle areas at the points where recall changes.
    indices = np.where(recalls[1:] != recalls[:-1])[0]
    ap = np.sum((recalls[indices + 1] - recalls[indices]) * precisions[indices + 1])
    return ap

In [16]:
# Train the model: SGD over the parameters that still require gradients,
# with the learning rate multiplied by 0.1 every 3 scheduler steps.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.SGD(
    trainable_params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

In [17]:
num_epochs = 1
for epoch in range(num_epochs):
    train_one_epoch(
        model=model,
        optimizer=optimizer,
        data_loader=train_loader,
        device=device,
        epoch=epoch,
        print_freq=50,
    )
    # The schedule is stepped once per epoch, after that epoch's optimizer updates.
    lr_scheduler.step()
    # Report validation metrics at the end of every epoch.
    precision, recall, f1_score, mAP = evaluate(model=model, data_loader=val_loader, device=device)
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}, mAP: {mAP:.4f}')

Epoch [0], Iteration [0], Loss: 1.5064
Epoch [0], Iteration [50], Loss: 0.0945
Epoch [0], Iteration [100], Loss: 0.0720
Epoch [0], Iteration [150], Loss: 0.0509
Epoch [0], Iteration [200], Loss: 0.0461
Epoch [0], Iteration [250], Loss: 0.0664
Epoch [0], Iteration [300], Loss: 0.0461
Epoch [0], Iteration [350], Loss: 0.0401
Epoch [0], Iteration [400], Loss: 0.0472
Epoch [0], Iteration [450], Loss: 0.0265
Epoch [0], Iteration [500], Loss: 0.0297
Epoch [0], Iteration [550], Loss: 0.0219
Epoch [0], Iteration [600], Loss: 0.0371
Epoch [0], Iteration [650], Loss: 0.0231
Epoch [0], Iteration [700], Loss: 0.0257
Epoch [0], Iteration [750], Loss: 0.0291
Epoch [0], Iteration [800], Loss: 0.0137
Epoch [0], Iteration [850], Loss: 0.0292
Epoch [0], Iteration [900], Loss: 0.0172
Epoch [0], Iteration [950], Loss: 0.0135
Epoch [0], Iteration [1000], Loss: 0.0590
Epoch [0], Iteration [1050], Loss: 0.0217
Epoch [0] completed. Average Loss: 0.0467
Precision: 0.9453, Recall: 1.0000, F1-Score: 0.9719, mAP:

In [18]:
# Persist only the model's state_dict (its weights), not the module object itself.
torch.save(obj=model.state_dict(), f="best_model.pth")

In [19]:
# Load the model weights. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load("best_model.pth", map_location=device))
model.eval()

# Run inference on a single validation image
img, _ = val_dataset[0]
with torch.no_grad():
    prediction = model([img.to(device)])

print(prediction)

[{'boxes': tensor([[ 831.7161,   23.5173, 1034.7919,  103.0599]], device='cuda:0'), 'labels': tensor([1], device='cuda:0'), 'scores': tensor([0.9994], device='cuda:0')}]


In [20]:
# Compute performance metrics
def calculate_metrics(model, data_loader, device, iou_threshold=0.5, score_threshold=0.5):
    """Compute binary precision, recall and F1 of the detector via scikit-learn.

    Every confident prediction and every missed ground-truth box becomes one
    (true label, predicted label) pair:

    * a prediction matched to a ground-truth box -> (box label, predicted label)
    * a prediction without a match               -> (0, predicted label)
    * a ground-truth box nobody matched          -> (box label, 0)

    Predictions are visited in descending score order and each ground-truth
    box can be matched at most once, so duplicate detections of one object
    count as false positives (the same policy ``evaluate`` uses).

    Args:
        model: Detection model returning, in eval mode, one dict per image
            with ``'boxes'``, ``'scores'`` and ``'labels'`` tensors.
        data_loader: Iterable yielding ``(images, targets)`` batches; each
            target is a dict with ``'boxes'`` and ``'labels'`` tensors.
        device: Device the images are moved to before inference.
        iou_threshold: Minimum (exclusive) IoU for a prediction to match a
            ground-truth box.
        score_threshold: Minimum (exclusive) confidence for a prediction to be
            considered. Previously the IoU threshold was mistakenly reused for
            this check; the default keeps the old effective value of 0.5.

    Returns:
        Tuple ``(precision, recall, f1_score)``; all 0.0 when there was
        nothing to score.
    """
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for images, targets in data_loader:
            images = [img.to(device) for img in images]
            outputs = model(images)

            for target, output in zip(targets, outputs):
                true_boxes = target['boxes'].cpu().numpy().reshape(-1, 4)
                true_labels = target['labels'].cpu().numpy()

                pred_boxes = output['boxes'].cpu().numpy().reshape(-1, 4)
                pred_scores = output['scores'].cpu().numpy()
                pred_labels = output['labels'].cpu().numpy()

                matched_gt = np.zeros(len(true_boxes), dtype=bool)
                for j in np.argsort(-pred_scores, kind='stable'):
                    if pred_scores[j] <= score_threshold:
                        continue

                    match_idx = -1
                    if len(true_boxes) > 0:
                        # Already-claimed boxes are masked out so they cannot match again.
                        ious = np.where(matched_gt, -1.0, compute_iou(pred_boxes[j], true_boxes))
                        candidate = int(np.argmax(ious))
                        if ious[candidate] > iou_threshold:
                            match_idx = candidate

                    if match_idx >= 0:
                        all_labels.append(true_labels[match_idx])
                        matched_gt[match_idx] = True
                    else:
                        # No usable ground truth for this prediction: false positive.
                        all_labels.append(0)
                    all_preds.append(pred_labels[j])

                # Ground-truth boxes nobody matched are false negatives.
                for i in np.flatnonzero(~matched_gt):
                    all_labels.append(true_labels[i])
                    all_preds.append(0)

    if not all_labels:
        # Nothing to score (empty loader, or no boxes and no confident predictions).
        return 0.0, 0.0, 0.0

    precision, recall, f1_score, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

    return precision, recall, f1_score

# Evaluate on the training set
precision, recall, f1_score = calculate_metrics(
    model=model,
    data_loader=train_loader,
    device=device,
)
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}')

Precision: 0.9856, Recall: 1.0000, F1-Score: 0.9928


In [22]:
def evaluate_model_performance(model, data_loader, device):
    """Run ``evaluate`` on ``data_loader`` and print precision, recall, F1 and mAP.

    Args:
        model: Detection model to evaluate.
        data_loader: Loader yielding ``(images, targets)`` batches.
        device: Device used for inference.

    Returns:
        None. The four metrics are printed on one line.
    """
    # evaluate() returns the metrics in the order (precision, recall, F1, mAP).
    metrics = evaluate(model, data_loader, device)
    precision, recall, f1_score, mAP = metrics
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}, mAP: {mAP:.4f}')

# Overall performance evaluation. This call runs on the TRAINING loader.
# NOTE(review): the original comment said "validation set", but train_loader is
# what is passed here; pass val_loader instead to get held-out metrics.
evaluate_model_performance(model=model, data_loader=train_loader, device=device)

Precision: 0.9856, Recall: 1.0000, F1-Score: 0.9928, mAP: 1.0000
