# ***Load Pre-train Model***

In [6]:
from torchvision.models.detection import fasterrcnn_resnet50_fpn
model = fasterrcnn_resnet50_fpn(pretrain=True)

In [7]:
model

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
          (relu

# ***Read Dataset***

In [8]:
train_img_path = "資料集/貨櫃資料集_縮放/訓練集_image"
val_img_path   = "資料集/貨櫃資料集_縮放/驗證集_image"

train_label_path = "資料集/貨櫃資料集_縮放/訓練集_xml"
val_label_path   = "資料集/貨櫃資料集_縮放/驗證集_xml"

In [9]:
# ContainerDataset
from package.function import ContainerDataset,Transform
from torch.utils.data import DataLoader
import torch

In [10]:
train_dataset = ContainerDataset(train_img_path, train_label_path, transform=Transform())
val_dataset   = ContainerDataset(val_img_path, val_label_path, transform=Transform())

In [11]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=False)
val_loader   = DataLoader(val_dataset, batch_size=4, shuffle=False)

# ***Train Model***

In [14]:
import torch.nn as nn
import torch.optim as optim

In [15]:
epoch_size = 200
CUDA = True
patience = 20
save_path = 'best_model/Rcnn.pth'
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [16]:
print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x0000026572222710>


In [17]:
import sys
import time

model.train()
# 移动模型到 GPU
if CUDA:
    model = model.cuda()
loss,patime = 100,0

for epoch in range(epoch_size):
    train_loss = 0.0
    for batch_idx, (images, targets) in enumerate(train_loader):
        sys.stdout.write(f'\rEpoch [{epoch+1}/{epoch_size}],目前循環次數: [{batch_idx+1}/{len(train_loader)}]')
        # 调整 targets 的格式
        targets = [{'boxes': t[0].unsqueeze(0), 'labels': torch.ones((1,), dtype=torch.int64)} for t in targets]

        if CUDA:
            images = images.cuda()
            targets = [{k: v.cuda() for k, v in t.items()} for t in targets]



        # clear gradient
        optimizer.zero_grad()
        # Forward propagation
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        # Calculate gradients
        losses.backward()
        # Update parameters
        optimizer.step()
        train_loss += losses.item()
    # 驗證
    val_loss,len_val = 0.0,len(val_loader)
    for valbatch_idx, (imgs, tgs) in enumerate(val_loader):
        tgs = [{'boxes': t[0].unsqueeze(0), 'labels': torch.ones((1,), dtype=torch.int64)} for t in tgs]
        if CUDA:
            imgs = imgs.cuda()
            tgs = [{k: v.cuda() for k, v in t.items()} for t in tgs]
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            val_loss += losses.item()
    val_loss = val_loss / len_val
    # 
    print(f'\nEpoch [{epoch+1}/{epoch_size}], train_Loss: {train_loss/len(train_loader):.4f}, val_Loss: {val_loss:.4f}')
    # EarlyStopping
    if loss > val_loss:
        patime = 0
        loss = val_loss
        torch.save(model, save_path)
    else:
        patime += 1 
    if patime == patience:
        break

Epoch [1/200],目前循環次數: [1/532]

  return F.conv2d(input, weight, bias, self.stride,


Epoch [1/200],目前循環次數: [532/532]
Epoch [1/200], train_Loss: 0.3027, val_Loss: 0.0929
Epoch [2/200],目前循環次數: [532/532]
Epoch [2/200], train_Loss: 0.0824, val_Loss: 0.0198
Epoch [3/200],目前循環次數: [532/532]
Epoch [3/200], train_Loss: 0.0584, val_Loss: 0.0347
Epoch [4/200],目前循環次數: [532/532]
Epoch [4/200], train_Loss: 0.0569, val_Loss: 0.0308
Epoch [5/200],目前循環次數: [532/532]
Epoch [5/200], train_Loss: 0.0489, val_Loss: 0.0309
Epoch [6/200],目前循環次數: [532/532]
Epoch [6/200], train_Loss: 0.0456, val_Loss: 0.0273
Epoch [7/200],目前循環次數: [532/532]
Epoch [7/200], train_Loss: 0.0401, val_Loss: 0.0205
Epoch [8/200],目前循環次數: [532/532]
Epoch [8/200], train_Loss: 57007.6350, val_Loss: 0.3557
Epoch [9/200],目前循環次數: [532/532]
Epoch [9/200], train_Loss: 87122.1412, val_Loss: 222.1700
Epoch [10/200],目前循環次數: [532/532]
Epoch [10/200], train_Loss: 430.5008, val_Loss: 9.4225
Epoch [11/200],目前循環次數: [532/532]
Epoch [11/200], train_Loss: 233.8740, val_Loss: 5.6312
Epoch [12/200],目前循環次數: [532/532]
Epoch [12/200], train_Los

# ***評估 測試集性能***

In [18]:
# model性能評估
import torch
from package.function import *
# 用最佳model進行物件偵測
model = torch.load('best_model/Rcnn.pth')

folderpath = '資料集/貨櫃資料集/測試集/'
filenames = find_filename(folder_path=folderpath,File_extension='jpg')[0]
file = [folderpath+filename for filename in filenames]
images = load_images(file)
results = predict_images(model, images,'cuda')

In [19]:
for i, result in enumerate(results):
    print(f"Image {i+1} Predictions:")
    print(result)

Image 1 Predictions:
[{'boxes': tensor([], device='cuda:0', size=(0, 4)), 'labels': tensor([], device='cuda:0', dtype=torch.int64), 'scores': tensor([], device='cuda:0')}]
Image 2 Predictions:
[{'boxes': tensor([], device='cuda:0', size=(0, 4)), 'labels': tensor([], device='cuda:0', dtype=torch.int64), 'scores': tensor([], device='cuda:0')}]
Image 3 Predictions:
[{'boxes': tensor([], device='cuda:0', size=(0, 4)), 'labels': tensor([], device='cuda:0', dtype=torch.int64), 'scores': tensor([], device='cuda:0')}]
Image 4 Predictions:
[{'boxes': tensor([], device='cuda:0', size=(0, 4)), 'labels': tensor([], device='cuda:0', dtype=torch.int64), 'scores': tensor([], device='cuda:0')}]
Image 5 Predictions:
[{'boxes': tensor([], device='cuda:0', size=(0, 4)), 'labels': tensor([], device='cuda:0', dtype=torch.int64), 'scores': tensor([], device='cuda:0')}]
Image 6 Predictions:
[{'boxes': tensor([], device='cuda:0', size=(0, 4)), 'labels': tensor([], device='cuda:0', dtype=torch.int64), 'scores'

In [20]:
from package.function import *
predictions = predictions_to_coco_json_ssd(results,file)
xml_path = '資料集/貨櫃資料集/測試集_xml/'
ground_truths = xmls_to_coco_json(xml_path)

In [21]:
metric = compute_metrics(predictions, ground_truths)

creating index...
index created!
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.24s).
Accumulating evaluation results...
DONE (t=0.04s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.288
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.723
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.112
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.366
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.251
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.342
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.366
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.366
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 

In [22]:
metric

{'mAP50': 0.7231657894128081,
 'mAP50-95': 0.08143047949930274,
 'Precision': 0.2883232414592709,
 'Recall': 0.36596026490066225,
 'F1-Score': 0.3225355638520613}

# ***圖片準確率測試集 貨櫃編號偵測***

In [6]:
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from package.function import *
import easyocr,cv2,re

# 初始化 easyOCR
reader = easyocr.Reader(['en'], gpu=False)

def remove_non_alphanumeric_and_uppercase(s):
    # 使用正則表達式過濾掉非英文和數字的字符
    cleaned_str = re.sub(r'[^a-zA-Z0-9]', '', s)
    # 將剩餘的字串轉換為大寫
    return cleaned_str.upper()

def visualize_detections(results, filepaths, original_images):
    for i, (result, img_tensor) in enumerate(zip(results, original_images)):
        img = (img_tensor.permute(1, 2, 0).cpu().numpy()*255).astype('uint8')
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.figure(figsize=(12, 8))
        plt.imshow(img_rgb)
        ax = plt.gca()

        for box, score in zip(result['boxes'].cpu(), result['scores'].cpu()):
            if score >0.5:
                x1, y1, x2, y2 = box.tolist()
                width, height = x2 - x1, y2 - y1

                rect = Rectangle((x1, y1), width, height, linewidth=2, edgecolor='red', facecolor='none')
                ax.add_patch(rect)

                crop_img = img_rgb[int(y1):int(y2), int(x1):int(x2)]

                try:
                    result_ocr = reader.readtext(crop_img)
                    result_ocr = [remove_non_alphanumeric_and_uppercase(res[1]) for res in result_ocr]
                    if len(result_ocr) > 0:
                        recognized_text = result_ocr[0]
                        if len(recognized_text) > 11:
                            recognized_text = recognized_text[:11]
                    else:
                        recognized_text = 'No Text'
                except Exception as e:
                    print("OCR failed:", e)
                    recognized_text = 'OCR Error'

                ax.text(x1, y2, f'{recognized_text}', color='yellow', fontsize=12, bbox=dict(facecolor='blue', alpha=0.5))

        plt.axis('off')
        plt.savefig(filepaths[i])
        plt.close()


Using CPU. Note: This module is much faster with a GPU.


In [7]:
# 圖片預測
import torch
from package.function import *
# 用最佳model進行物件偵測
model = torch.load("best_model/Rcnn.pth")

folderpath = '資料集/圖片準確率測試集/'
filenames = find_filename(folder_path=folderpath,File_extension='jpg')[0]
file = [folderpath+filename for filename in filenames]


# torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
model = model.to(device)
images = load_images(file)
images = [image.to(device) for image in images]  # 將圖像移動到相同的設備

model.eval()
results = model(images)

# images = load_images(file)
visualize_detections(results,[folderpath+'object_rcnn/'+filename for filename in filenames],images)

# ***影片 貨櫃編號偵測***

In [11]:
import cv2,re,easyocr
from package.function import *
from torchvision.transforms import functional as F

reader = easyocr.Reader(['en'], gpu=False)

def remove_non_alphanumeric_and_uppercase(s):
    # 使用正則表達式過濾掉非英文和數字的字符
    cleaned_str = re.sub(r'[^a-zA-Z0-9]', '', s)
    # 將剩餘的字串轉換為大寫
    return cleaned_str.upper()

# 處理影片
def process_video(video_path, save_path, model, device):
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(save_path, fourcc, fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img_tensor = F.to_tensor(img_rgb).unsqueeze(0).to(device)

        with torch.no_grad():
            results = model(img_tensor)

        output_frame = draw_detections(frame, results, device)
        out.write(output_frame)

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 在影像中繪製偵測結果
def draw_detections(frame, results, device):
    if results:
        result = results[0]
        boxes = result['boxes'].cpu().numpy()
        labels = result['labels'].cpu().numpy()
        scores = result['scores'].cpu().numpy()

        for box, score, label in zip(boxes, scores, labels):
            if score < 0.5:  # 設定一個分數閾值
                continue

            x1, y1, x2, y2 = map(int, box)

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)

            crop_img = frame[y1:y2, x1:x2]

            try:
                result_ocr = reader.readtext(crop_img)
                if result_ocr:
                    recognized_text = ' '.join([remove_non_alphanumeric_and_uppercase(res[1]) for res in result_ocr])

                    if len(recognized_text) > 11:
                        recognized_text = recognized_text[:11]
                else:
                    recognized_text = 'No Text'
            except Exception as e:
                print("OCR failed:", e)
                recognized_text = 'OCR Error'

            cv2.putText(frame, recognized_text, (x1, y2 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 0), 2)

    return frame

Using CPU. Note: This module is much faster with a GPU.


In [12]:
import torch
from package.function import *
model = torch.load("best_model/Rcnn.pth")
filenames = find_filename(folder_path='資料集/影片資料集/',File_extension='avi')[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()
for filename in filenames:
    # 影片路徑
    video_path = f'資料集/影片資料集/{filename}'

    save_path = f'資料集/影片資料集/output_rcnn/{filename}'
    # 處理影片
    process_video(video_path,save_path, model,device)