### 使用データセットアドレス：
https://www.kaggle.com/datasets/ultralytics/coco128

In [ ]:
import torch
from torch.utils.data import Dataset, random_split, DataLoader
import os
from PIL import Image
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import torch
import torchvision
import torch.nn as nn
from matplotlib.font_manager import FontProperties

# YOLO-format object detection dataset
class YOLODataset(Dataset):
    """Image dataset whose annotations are YOLO-format text files.

    Every file in ``image_dir`` is paired with the label file in ``label_dir``
    that has the same file stem and a ``.txt`` extension, so an image without
    a label file cannot shift the pairing of the other images. Each label line
    has the form ``class_id x_center y_center width height``.

    Args:
        image_dir: Directory containing the image files.
        label_dir: Directory containing the YOLO ``.txt`` label files.
        img_size: Target size handed to ``transforms.Resize``.
        classes: Stored as ``self.classes``; not used by the class itself.
    """

    def __init__(self, image_dir, label_dir, img_size=(640, 640), classes=80):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.img_size = img_size
        self.classes = classes

        # Sorted image paths; label paths are derived from them (same length,
        # same order) instead of being listed and sorted independently.
        self.image_files = sorted(
            os.path.join(image_dir, file) for file in os.listdir(image_dir)
        )
        self.label_files = [
            self._label_path_for(path, label_dir) for path in self.image_files
        ]

        # Resize, convert to a tensor and normalise with the listed mean/std
        self.transform = transforms.Compose([
            transforms.Resize(img_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    @staticmethod
    def _label_path_for(image_path, label_dir):
        """Return the label path in ``label_dir`` matching ``image_path``'s stem."""
        stem = os.path.splitext(os.path.basename(image_path))[0]
        return os.path.join(label_dir, stem + '.txt')

    @staticmethod
    def _read_labels(label_path):
        """Parse one YOLO label file into a ``(num_objects, 5)`` tensor.

        A missing or empty file yields a ``(0, 5)`` tensor; blank lines are
        skipped. A line that does not hold exactly five values raises
        ``ValueError``.
        """
        rows = []
        if os.path.isfile(label_path):
            with open(label_path, 'r') as f:
                for line in f:
                    parts = line.split()
                    if not parts:
                        continue
                    class_id = int(parts[0])
                    x_center, y_center, width, height = map(float, parts[1:])
                    rows.append([class_id, x_center, y_center, width, height])
        if not rows:
            # Keep the column dimension so callers can always index [:, k]
            return torch.zeros((0, 5))
        return torch.tensor(rows)

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image_tensor, labels)`` for the sample at ``idx``."""
        # The context manager closes the file handle once pixels are converted
        with Image.open(self.image_files[idx]) as raw:
            image = raw.convert('RGB')
        image = self.transform(image)

        labels = self._read_labels(self.label_files[idx])
        return image, labels


# Custom collate function for samples whose label tensors differ in length
def custom_collate_fn(batch):
    """Collate ``(image, labels)`` samples into one batch.

    Args:
        batch: Iterable of ``(image, labels)`` pairs.

    Returns:
        ``(images, labels)`` where ``images`` is the image tensors stacked
        along a new leading dimension and ``labels`` is a plain list holding
        each sample's label object, in batch order.
    """
    # Images share one shape, so they can be stacked into a single tensor
    image_batch = torch.stack([sample_image for sample_image, _ in batch], dim=0)
    # Label tensors may differ in length, so they stay in a list
    label_list = [sample_labels for _, sample_labels in batch]
    return image_batch, label_list


# Dataset paths and class list
# NOTE(review): the notebook header links the coco128 dataset while these
# paths point at a coco8 directory - confirm which dataset is intended.
train_image_dir = '../datasets/coco8/images/train'  # adjust the paths to your local setup
train_label_dir = '../datasets/coco8/labels/train'
val_image_dir = '../datasets/coco8/images/val'
val_label_dir = '../datasets/coco8/labels/val'

# Class id -> display name ("English Japanese") for the 80 classes
classes = {
    0: 'person 人', 1: 'bicycle 自転車', 2: 'car 車', 3: 'motorcycle オートバイ', 
    4: 'airplane 飛行機', 5: 'bus バス', 6: 'train 電車', 7: 'truck トラック', 
    8: 'boat ボート', 9: 'traffic light 信号機', 10: 'fire hydrant 消火栓', 
    11: 'stop sign 停止標識', 12: 'parking meter パーキングメーター', 13: 'bench ベンチ',
    14: 'bird 鳥', 15: 'cat 猫', 16: 'dog 犬', 17: 'horse 馬', 
    18: 'sheep 羊', 19: 'cow 牛', 20: 'elephant 象', 21: 'bear 熊', 
    22: 'zebra シマウマ', 23: 'giraffe キリン', 24: 'backpack バックパック', 
    25: 'umbrella 傘', 26: 'handbag ハンドバッグ', 27: 'tie ネクタイ', 
    28: 'suitcase スーツケース', 29: 'frisbee フリスビー', 30: 'skis スキー板', 
    31: 'snowboard スノーボード', 32: 'sports ball スポーツボール', 33: 'kite 凧', 
    34: 'baseball bat 野球バット', 35: 'baseball glove 野球グローブ', 
    36: 'skateboard スケートボード', 37: 'surfboard サーフボード', 
    38: 'tennis racket テニスラケット', 39: 'bottle ボトル', 
    40: 'wine glass ワイングラス', 41: 'cup カップ', 42: 'fork フォーク', 
    43: 'knife ナイフ', 44: 'spoon スプーン', 45: 'bowl ボウル', 46: 'banana バナナ', 
    47: 'apple りんご', 48: 'sandwich サンドイッチ', 49: 'orange オレンジ', 
    50: 'broccoli ブロッコリー', 51: 'carrot ニンジン', 52: 'hot dog ホットドッグ', 
    53: 'pizza ピザ', 54: 'donut ドーナツ', 55: 'cake ケーキ', 
    56: 'chair 椅子', 57: 'couch ソファ', 58: 'potted plant 鉢植え', 
    59: 'bed ベッド', 60: 'dining table ダイニングテーブル', 61: 'toilet トイレ', 
    62: 'tv テレビ', 63: 'laptop ノートパソコン', 64: 'mouse マウス', 
    65: 'remote リモコン', 66: 'keyboard キーボード', 67: 'cell phone 携帯電話', 
    68: 'microwave 電子レンジ', 69: 'oven オーブン', 70: 'toaster トースター', 
    71: 'sink シンク', 72: 'refrigerator 冷蔵庫', 73: 'book 本', 
    74: 'clock 時計', 75: 'vase 花瓶', 76: 'scissors はさみ', 
    77: 'teddy bear テディベア', 78: 'hair drier ヘアドライヤー', 
    79: 'toothbrush 歯ブラシ'
}

# Create the datasets (the class-name dict is passed as `classes`, whose default is the int 80)
train_dataset = YOLODataset(train_image_dir, train_label_dir, img_size=(640, 640), classes=classes)
test_dataset = YOLODataset(val_image_dir, val_label_dir, img_size=(640, 640), classes=classes)

# Create the data loaders; only the training loader shuffles
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=custom_collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=custom_collate_fn)

# Data-loading smoke test.
# train_dataloader shuffles, so the position of a target inside a batch says
# nothing about its index in train_dataset.image_files. A non-shuffled loader
# is used here so every target can be matched to the file it was read from.
inspect_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=custom_collate_fn)
for batch_i, (imgs, targets) in enumerate(inspect_dataloader):
    print(f"\nBatch {batch_i}:")
    print(f"Images shape: {imgs.shape}")
    print(f"Number of targets: {len(targets)}")
    for i, target in enumerate(targets):
        print(f"  Target {i} shape: {target.shape}")
    print("\nDetailed information:")
    # Without shuffling, batch k holds the dataset items starting at k * batch_size
    batch_start = batch_i * inspect_dataloader.batch_size
    batch_files = train_dataset.image_files[batch_start:batch_start + len(targets)]
    for i, (target, img_name) in enumerate(zip(targets, batch_files)):
        print(f"\nImage {i}:")
        print(f"  File name: {os.path.basename(img_name)}")
        print(f"  Target shape: {target.shape}")
        if len(target) > 0:
            print(f"  Number of objects: {target.shape[0]}")
            print(f"  First object (class, x, y, w, h): {target[0].tolist()}")

In [ ]:
from matplotlib.font_manager import FontProperties
# Font used for the text drawn on the plots (class names and titles).
# NOTE(review): the original comment calls this a Japanese font setting, but
# the file is SimHei.ttf - confirm it covers the glyphs used in the class names.
font_path = '../resource/SimHei.ttf'  # adjust to your system's font path; the original comment says the file ships with the repo, so it can point there
font_prop = FontProperties(fname=font_path)

def plot_image_with_boxes(image, labels, img_name=None, class_names=None, figsize=(10, 10)):
    """
    画像とバウンディングボックスを可視化

    Args:
        image: torch.Tensor (C, H, W) - 正規化後の画像
        labels: torch.Tensor (num_objects, 5) - [class, x_center, y_center, width, height]
        img_name: str - 画像ファイル名
        class_names: dict - クラスIDから名前へのマッピング
        figsize: tuple - 画像表示サイズ
    """
    # 画像フォーマットを変換
    img = image.permute(1, 2, 0).numpy()

    # 逆正規化
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img = std * img + mean
    img = np.clip(img, 0, 1)

    # 図形を作成
    fig, ax = plt.subplots(1, figsize=figsize)
    ax.imshow(img)

    # 画像サイズを取得
    height, width = img.shape[:2]

    # 異なるクラスの色を設定
    colors = plt.cm.hsv(np.linspace(0, 1, 81))[:, :3]

    # 各バウンディングボックスを描画
    for label in labels:
        class_id = int(label[0])
        x_center = float(label[1]) * width
        y_center = float(label[2]) * height
        w = float(label[3]) * width
        h = float(label[4]) * height

        # 左上角座標を計算
        x = x_center - w / 2
        y = y_center - h / 2

        # ラベル位置が画像範囲内にあるかチェック
        if x >= 0 and y >= 0 and x + w <= width and y + h <= height:
            # このクラスの色を取得
            color = colors[class_id % len(colors)]

            # 矩形を作成
            rect = patches.Rectangle(
                (x, y), w, h,
                linewidth=2,
                edgecolor=color,
                facecolor='none'
            )
            ax.add_patch(rect)

            class_name = class_names[class_id]

            # ラベル背景ボックスを描画
            plt.text(
                x, y - 10,  # ラベル位置を調整、物体を遮らないように
                class_name,
                bbox=dict(facecolor=color, alpha=0.8),
                color='white',
                fontsize=12,
                fontproperties=font_prop
            )

    # 画像タイトルを追加
    if img_name:
        plt.title(img_name, fontproperties=font_prop)

    plt.axis('off')
    plt.tight_layout()
    plt.show()

# Visualisation smoke test: plot every image of the first batch
for batch_i, (imgs, targets) in enumerate(train_dataloader):
    for i, (img, target) in enumerate(zip(imgs, targets)):
        # The title is a synthetic "Image_<batch>_<index>.jpg" tag, not the real file name
        plot_image_with_boxes(img, target, img_name=f"Image_{batch_i}_{i}.jpg", class_names=classes, figsize=(8, 8))
    break  # visualise only one batch

![](./image/YOLOv5.png)

<img src="./image/ConBNSiLU.png" alt="ConBNSiLU模块" width="300" />

In [ ]:
import torch
import torch.nn as nn
import torch.nn.functional as F 

class ConvBNSiLU(nn.Module):
    """Conv2d followed by BatchNorm2d and a SiLU activation.

    In the diagrams c=output_size, k=kernel_size, s=stride, p=padding; the
    code below uses the same convention.

    Args:
        input_size: Number of input channels.
        output_size: Number of output channels.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding; ``None`` selects ``kernel_size // 2``.
    """

    def __init__(self, input_size, output_size, kernel_size, stride, padding):
        super().__init__()
        if padding is None:
            padding = kernel_size // 2
        self.conv = nn.Conv2d(input_size, output_size, kernel_size, stride, padding=padding)
        self.bn = nn.BatchNorm2d(output_size)
        self.act = nn.SiLU()

    def forward(self, x):
        convolved = self.conv(x)
        normalised = self.bn(convolved)
        return self.act(normalised)
# Demo values for the P1 layer; these module-level names are reused by later demo cells
input_size = 3
output_size = 64
kernel_size = 6
stride = 2
padding = 2
# Input tensor laid out as [batch_size, channels, height, width]
input_tensor = torch.randn(1, input_size, 640, 640)

convbnsilu = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)
output = convbnsilu(input_tensor)
print(output.shape)    

# Expected: the output shape matches the P1 output in the diagram

<img src="./image/bottleneck1.png" alt="图片描述" width="300" />

In [ ]:
# BottleNeck1 module
class BottleNeck1(nn.Module):
    """1x1 then 3x3 ConvBNSiLU whose result is concatenated with the input.

    Both convolutions keep ``input_size`` channels, so the output has
    ``2 * input_size`` channels. ``output_size``, ``kernel_size``, ``stride``
    and ``padding`` are accepted but not used.
    """

    def __init__(self, input_size, output_size, kernel_size, stride, padding):
        super().__init__()
        self.conv1 = ConvBNSiLU(input_size, output_size=input_size, kernel_size=1, stride=1, padding=0)
        self.conv2 = ConvBNSiLU(input_size, output_size=input_size, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        branch = self.conv2(self.conv1(x))
        # Channel-wise concatenation of the untouched input and the conv branch
        return torch.cat((x, branch), dim=1)

input_size = 3

# Input tensor laid out as [batch_size, channels, height, width]
input_tensor = torch.randn(1, input_size, 640, 640)

# kernel_size, stride and padding are the module-level values left over from the
# ConvBNSiLU demo; BottleNeck1 does not use them
bottleneck1 = BottleNeck1(input_size, input_size, kernel_size, stride, padding)
output = bottleneck1(input_tensor)
print(output.shape)    

# Expected: the output shape matches the diagram

<img src="./image/bottleneck2.png" alt="图片描述" width="300" />

In [ ]:
# BottleNeck2 module
class BottleNeck2(nn.Module):
    """1x1 then 3x3 ConvBNSiLU without any shortcut.

    Both convolutions keep ``input_size`` channels, so the output has the same
    channel count as the input. ``output_size``, ``kernel_size``, ``stride``
    and ``padding`` are accepted but not used.
    """

    def __init__(self, input_size, output_size, kernel_size, stride, padding):
        super().__init__()
        self.conv1 = ConvBNSiLU(input_size, output_size=input_size, kernel_size=1, stride=1, padding=0)
        self.conv2 = ConvBNSiLU(input_size, output_size=input_size, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        squeezed = self.conv1(x)
        return self.conv2(squeezed)

input_size = 3

# Input tensor laid out as [batch_size, channels, height, width]
input_tensor = torch.randn(1, input_size, 640, 640)

# kernel_size, stride and padding are the module-level values left over from the
# ConvBNSiLU demo; BottleNeck2 does not use them
bottleneck2 = BottleNeck2(input_size, input_size, kernel_size, stride, padding)
output = bottleneck2(input_tensor)
print(output.shape)    

# Expected: the output shape matches the diagram

<img src="./image/SPPF.png" alt="图片描述" width="600" />

In [6]:
# SPPF module
class SPPF(nn.Module):
    """Spatial pyramid pooling block built from three chained max-pools.

    A 1x1 ConvBNSiLU halves the channel count, three max-pools are applied one
    after another, and the reduced input plus the three pooled maps are
    concatenated and projected to ``output_size`` channels by a second 1x1
    ConvBNSiLU.

    Args:
        input_size: Number of input channels.
        output_size: Number of output channels.
        kernel_size: Max-pool kernel size.
        stride: Max-pool stride.
        padding: Max-pool padding. The pooled maps are concatenated with the
            un-pooled one, so the pooling settings must preserve the spatial
            size (the defaults do).
    """

    def __init__(self, input_size, output_size, kernel_size=5, stride=1, padding=2):
        super().__init__()
        hidden_size = input_size // 2
        self.conv1 = ConvBNSiLU(input_size, output_size=hidden_size, kernel_size=1, stride=1, padding=0)
        # The pooling settings now honour the constructor arguments
        self.maxpool1 = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
        self.maxpool2 = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
        self.maxpool3 = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
        # Four maps of hidden_size channels are concatenated; using 4 * hidden_size
        # (rather than 2 * input_size) stays correct for an odd input_size
        self.conv2 = ConvBNSiLU(4 * hidden_size, output_size, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        x = self.conv1(x)
        pooled1 = self.maxpool1(x)
        pooled2 = self.maxpool2(pooled1)
        pooled3 = self.maxpool3(pooled2)
        # Concatenate along the channel dimension
        stacked = torch.cat([x, pooled1, pooled2, pooled3], dim=1)
        return self.conv2(stacked)

# Test code
# Example input: batch_size=1, 1024 channels, 20x20 feature map
input_data = torch.randn(1, 1024,20,20)
output_size=1024
# Create the SPPF module
sppf_module = SPPF(1024 ,output_size)
# Forward pass
output = sppf_module(input_data)
print(output.shape) 

# Expected: the output shape matches the diagram

torch.Size([1, 1024, 20, 20])


In [ ]:
# アップサンプリングモジュール(詳細はYOLOv5詳細説明を参照)
import torch
import torch.nn as nn
class Upsample(nn.Module):
    """1x1 convolution followed by nearest-neighbour upsampling.

    Args:
        input_size: Number of input channels.
        output_size: Number of output channels produced by the 1x1 convolution.
        scale_factor: Spatial scale factor handed to ``nn.Upsample``.
    """

    def __init__(self, input_size, output_size, scale_factor=2):
        super().__init__()
        # 1x1 convolution that adjusts the channel count
        self.conv = nn.Conv2d(input_size, output_size, kernel_size=1, stride=1, padding=0)
        # Nearest-neighbour upsampling layer
        self.upsample = nn.Upsample(scale_factor=scale_factor, mode='nearest')

    def forward(self, x):
        # Channels are adjusted first, then the spatial size is enlarged
        return self.upsample(self.conv(x))


# Test code

# Example input: batch_size=1, 256 input channels, 40x40 feature map
input_tensor = torch.randn(1, 256, 40, 40)
# Create the upsampling module: 256 input channels, 256 output channels, scale factor 2
upsample = Upsample(input_size=256, output_size=256, scale_factor=2)
# Forward pass
output_tensor = upsample(input_tensor)
print(output_tensor.shape)

# Expected: the upsampled output shape matches the diagram

# モジュール統合
各モジュールの設計が完了したので、ネットワークをより簡潔に記述するため、以下のコードでは図中の各C3モジュールをクラスとしてカプセル化します。カプセル化したモジュールの命名規則は次のとおりです。
## 命名説明
図に示されているように
![](./image/YOLOv5.png)
行列のような方法でC3モジュールを命名します

- 第1列には4つのC3モジュールがあり、上から下へそれぞれ：C11、C21、C31、C41と命名します
- 読む際の混乱を避けるため、「C3」の「3」は省略します（例：第1列・上から1番目のC3モジュール → C11）

In [ ]:
# C11 module: contains the P1, P2 and P3 stages
class C11(nn.Module):
    """Stem of the network: P1 and P2 convolutions, a C3-style block, then P3.

    All channel widths are derived from ``output_size`` (the P1 width ``c``):
    P2 produces ``2c``; the block splits into a ``c``-channel branch that runs
    through three BottleNeck1 modules (each doubles the channels, giving ``8c``)
    and a ``c``-channel shortcut; the ``9c`` concatenation is projected to
    ``2c`` and P3 outputs ``4c``. With the defaults this reproduces the
    original fixed widths (64/128/576/256).

    Args:
        input_size: Number of input image channels.
        output_size: Number of channels produced by P1.
        kernel_size: Kernel size of P1.
        stride: Stride of P1.
        padding: Padding of P1.
    """

    def __init__(self, input_size=3, output_size=64, kernel_size=6, stride=2, padding=2):
        super().__init__()
        c = output_size
        # P1 stage (previously hard-coded; now driven by the constructor arguments)
        self.P1 = ConvBNSiLU(input_size=input_size, output_size=c, kernel_size=kernel_size, stride=stride, padding=padding)
        # P2 stage
        self.P2 = ConvBNSiLU(input_size=c, output_size=2 * c, kernel_size=3, stride=2, padding=1)

        self.conv1 = ConvBNSiLU(input_size=2 * c, output_size=c, kernel_size=1, stride=1, padding=0)

        # Each BottleNeck1 concatenates its input with its branch: c -> 2c -> 4c -> 8c
        self.bottle1 = BottleNeck1(input_size=c, output_size=c, kernel_size=1, stride=1, padding=0)
        self.bottle2 = BottleNeck1(input_size=2 * c, output_size=2 * c, kernel_size=1, stride=1, padding=0)
        self.bottle3 = BottleNeck1(input_size=4 * c, output_size=4 * c, kernel_size=1, stride=1, padding=0)

        self.conv2 = ConvBNSiLU(input_size=2 * c, output_size=c, kernel_size=1, stride=1, padding=0)
        # 8c from the bottleneck branch + c from the shortcut
        self.conv3 = ConvBNSiLU(input_size=9 * c, output_size=2 * c, kernel_size=1, stride=1, padding=0)

        # P3 stage from the diagram
        self.P3 = ConvBNSiLU(input_size=2 * c, output_size=4 * c, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        x = self.P1(x)
        x = self.P2(x)
        branch = self.conv1(x)
        shortcut = self.conv2(x)
        branch = self.bottle1(branch)
        branch = self.bottle2(branch)
        branch = self.bottle3(branch)
        merged = torch.cat((branch, shortcut), dim=1)
        merged = self.conv3(merged)
        return self.P3(merged)

# Test code

input_tensor = torch.randn(1, 3, 640, 640)

c11=C11(3,64,6,2,2)
# Forward pass
output_tensor = c11(input_tensor)
print(output_tensor.shape)

# Expected: the output shape matches the diagram

In [9]:
# C21 module
class C21(nn.Module):
    """C3-style block with six BottleNeck1 modules, followed by the P4 convolution.

    The input is projected to ``output_size`` channels twice (branch and
    shortcut). The branch runs through six BottleNeck1 modules, each of which
    doubles the channel count (``output_size`` -> ``64 * output_size``). Branch
    and shortcut are concatenated (``65 * output_size`` channels) and projected
    back to ``input_size`` channels; P4 then downsamples to ``2 * input_size``.
    With the defaults this reproduces the original fixed widths (128/8320/256/512).

    Args:
        input_size: Number of input channels.
        output_size: Width of the branch and shortcut projections.
        kernel_size, stride, padding: Settings of the two input projections
            (also handed to BottleNeck1, which does not use them).

    Returns (forward):
        ``(block_output, p4_output)``.
    """

    def __init__(self, input_size=256, output_size=128, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.conv1 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)

        # bottle1 used a literal 128; it now follows output_size like its siblings
        self.bottle1 = BottleNeck1(output_size, output_size, kernel_size, stride, padding)
        self.bottle2 = BottleNeck1(2 * output_size, 2 * output_size, kernel_size, stride, padding)
        self.bottle3 = BottleNeck1(4 * output_size, 4 * output_size, kernel_size, stride, padding)
        self.bottle4 = BottleNeck1(8 * output_size, 8 * output_size, kernel_size, stride, padding)
        self.bottle5 = BottleNeck1(16 * output_size, 16 * output_size, kernel_size, stride, padding)
        self.bottle6 = BottleNeck1(32 * output_size, 32 * output_size, kernel_size, stride, padding)

        self.conv2 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)
        # 65 = 32 * 2 (bottleneck branch) + 1 (shortcut)
        self.conv3 = ConvBNSiLU(input_size=65 * output_size, output_size=input_size, kernel_size=1, stride=1, padding=0)

        # P4 stage: consumes conv3's output, hence input_size channels in
        self.P4 = ConvBNSiLU(input_size, output_size=2 * input_size, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        branch = self.conv1(x)
        shortcut = self.conv2(x)
        for bottle in (self.bottle1, self.bottle2, self.bottle3,
                       self.bottle4, self.bottle5, self.bottle6):
            branch = bottle(branch)
        merged = self.conv3(torch.cat((branch, shortcut), dim=1))
        downsampled = self.P4(merged)
        return merged, downsampled

# Test code

input_tensor = torch.randn(1, 256, 80, 80)

c21=C21(256,128,1,1,0)
# Forward pass
output1,output_tensor2 = c21(input_tensor)
print(f'final output: {output_tensor2.shape}')

# Expected: the output shape matches the diagram

final output: torch.Size([1, 512, 40, 40])


In [ ]:
# C31 module. The author limits it to n=6 bottlenecks because of compute
# constraints and suggests trying n=9 when more compute is available.
class C31(nn.Module):
    """C3-style block with six BottleNeck1 modules, followed by the P5 convolution.

    The input is projected to ``output_size`` channels twice (branch and
    shortcut). The branch runs through six BottleNeck1 modules, each of which
    doubles the channel count (``output_size`` -> ``64 * output_size``). Branch
    and shortcut are concatenated (``65 * output_size`` channels) and projected
    back to ``input_size`` channels; P5 then downsamples to ``2 * input_size``.
    With the defaults this reproduces the original fixed widths (256/16640/512/1024).

    Args:
        input_size: Number of input channels.
        output_size: Width of the branch and shortcut projections.
        kernel_size, stride, padding: Settings of the two input projections
            (also handed to BottleNeck1, which does not use them).

    Returns (forward):
        ``(block_output, p5_output)``.
    """

    def __init__(self, input_size=512, output_size=256, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.conv1 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)

        # bottle1 used a literal 256; it now follows output_size like its siblings
        self.bottle1 = BottleNeck1(output_size, output_size, kernel_size, stride, padding)
        self.bottle2 = BottleNeck1(2 * output_size, 2 * output_size, kernel_size, stride, padding)
        self.bottle3 = BottleNeck1(4 * output_size, 4 * output_size, kernel_size, stride, padding)
        self.bottle4 = BottleNeck1(8 * output_size, 8 * output_size, kernel_size, stride, padding)
        self.bottle5 = BottleNeck1(16 * output_size, 16 * output_size, kernel_size, stride, padding)
        self.bottle6 = BottleNeck1(32 * output_size, 32 * output_size, kernel_size, stride, padding)

        self.conv2 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)
        # 65 = 32 * 2 (bottleneck branch) + 1 (shortcut)
        self.conv3 = ConvBNSiLU(input_size=65 * output_size, output_size=input_size, kernel_size=1, stride=1, padding=0)

        # P5 stage: consumes conv3's output, hence input_size channels in
        self.P5 = ConvBNSiLU(input_size, output_size=2 * input_size, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        branch = self.conv1(x)
        shortcut = self.conv2(x)
        for bottle in (self.bottle1, self.bottle2, self.bottle3,
                       self.bottle4, self.bottle5, self.bottle6):
            branch = bottle(branch)
        merged = self.conv3(torch.cat((branch, shortcut), dim=1))
        downsampled = self.P5(merged)
        return merged, downsampled

# Test code

input_tensor = torch.randn(1, 512, 40, 40)

c31=C31(512,256,1,1,0)
# Forward pass
output1,output_tensor2 = c31(input_tensor)
print(f'final output: {output_tensor2.shape}')

# Expected: the output shape matches the diagram

In [11]:
# C41 module (the original header mentions a P3 stage, but this class contains none)
class C41(nn.Module):
    """C3-style block with three BottleNeck1 modules.

    The input is projected to ``output_size`` channels twice (branch and
    shortcut). The branch runs through three BottleNeck1 modules, each of which
    doubles the channel count (``output_size`` -> ``8 * output_size``). Branch
    and shortcut are concatenated (``9 * output_size`` channels) and projected
    back to ``input_size`` channels. With the defaults this reproduces the
    original fixed widths (512/4608/1024).

    Args:
        input_size: Number of input (and output) channels.
        output_size: Width of the branch and shortcut projections.
        kernel_size, stride, padding: Settings of the two input projections
            (also handed to BottleNeck1, which does not use them).
    """

    def __init__(self, input_size=1024, output_size=512, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.conv1 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)
        # bottle1 used a literal 512; it now follows output_size like its siblings
        self.bottle1 = BottleNeck1(output_size, output_size, kernel_size, stride, padding)
        self.bottle2 = BottleNeck1(2 * output_size, 2 * output_size, kernel_size, stride, padding)
        self.bottle3 = BottleNeck1(4 * output_size, 4 * output_size, kernel_size, stride, padding)

        self.conv2 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)
        # 9 = 4 * 2 (bottleneck branch) + 1 (shortcut)
        self.conv3 = ConvBNSiLU(9 * output_size, output_size=input_size, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        branch = self.conv1(x)
        shortcut = self.conv2(x)
        for bottle in (self.bottle1, self.bottle2, self.bottle3):
            branch = bottle(branch)
        merged = torch.cat((branch, shortcut), dim=1)
        return self.conv3(merged)

# Test code

input_tensor = torch.randn(1, 1024, 20, 20)

c41=C41(1024,512,1,1,0)
# Forward pass
output_tensor = c41(input_tensor)
print(output_tensor.shape)

# Expected: the output shape matches the diagram

torch.Size([1, 1024, 20, 20])


In [12]:
# C12 module
class C12(nn.Module):
    """C3-style block with three BottleNeck2 modules; outputs ``output_size`` channels.

    The input is projected to ``output_size`` channels twice (branch and
    shortcut). The branch runs through three BottleNeck2 modules, which keep
    the channel count. Branch and shortcut are concatenated
    (``2 * output_size`` channels) and projected to ``output_size`` channels.
    With the defaults this reproduces the original fixed output width (256).

    Args:
        input_size: Number of input channels.
        output_size: Width of the projections and of the block output.
        kernel_size, stride, padding: Settings of the two input projections
            (also handed to BottleNeck2, which does not use them).
    """

    def __init__(self, input_size=512, output_size=256, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.conv1 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)

        self.bottle1 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)
        self.bottle2 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)
        self.bottle3 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)

        self.conv2 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)
        # The output width was a literal 256; it now follows output_size
        self.conv3 = ConvBNSiLU(2 * output_size, output_size=output_size, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        branch = self.conv1(x)
        shortcut = self.conv2(x)
        for bottle in (self.bottle1, self.bottle2, self.bottle3):
            branch = bottle(branch)
        merged = torch.cat((branch, shortcut), dim=1)
        return self.conv3(merged)

# Test code

input_tensor = torch.randn(1, 512, 80, 80)

c12=C12(512,256,1,1,0)
# Forward pass
output_tensor = c12(input_tensor)
print(output_tensor.shape)

# Expected: the output shape matches the diagram

torch.Size([1, 256, 80, 80])


In [13]:
# C22 module
class C22(nn.Module):
    """C3-style block with three BottleNeck2 modules; outputs ``output_size`` channels.

    The input is projected to ``output_size`` channels twice (branch and
    shortcut). The branch runs through three BottleNeck2 modules, which keep
    the channel count. Branch and shortcut are concatenated
    (``2 * output_size`` channels) and projected to ``output_size`` channels.
    With the defaults this reproduces the original fixed output width (512).

    Args:
        input_size: Number of input channels.
        output_size: Width of the projections and of the block output.
        kernel_size, stride, padding: Settings of the two input projections
            (also handed to BottleNeck2, which does not use them).
    """

    def __init__(self, input_size=1024, output_size=512, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.conv1 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)

        self.bottle1 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)
        self.bottle2 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)
        self.bottle3 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)

        self.conv2 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)
        # The output width was a literal 512; it now follows output_size
        self.conv3 = ConvBNSiLU(2 * output_size, output_size=output_size, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        branch = self.conv1(x)
        shortcut = self.conv2(x)
        for bottle in (self.bottle1, self.bottle2, self.bottle3):
            branch = bottle(branch)
        merged = torch.cat((branch, shortcut), dim=1)
        return self.conv3(merged)

# Test code

input_tensor = torch.randn(1, 1024, 40, 40)

c22=C22(1024,512,1,1,0)
# Forward pass
output_tensor = c22(input_tensor)
print(output_tensor.shape)

# Expected: the output shape matches the diagram

torch.Size([1, 512, 40, 40])


In [14]:
# C13 module
class C13(nn.Module):
    """C3-style block with three BottleNeck2 modules; outputs ``2 * output_size`` channels.

    The input is projected to ``output_size`` channels twice (branch and
    shortcut). The branch runs through three BottleNeck2 modules, which keep
    the channel count. Branch and shortcut are concatenated
    (``2 * output_size`` channels) and passed through a 1x1 ConvBNSiLU that
    keeps that width. With the defaults this reproduces the original fixed
    widths (512 -> 512).

    Args:
        input_size: Number of input channels.
        output_size: Width of the branch and shortcut projections.
        kernel_size, stride, padding: Settings of the two input projections
            (also handed to BottleNeck2, which does not use them).
    """

    def __init__(self, input_size=512, output_size=256, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.conv1 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)

        self.bottle1 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)
        self.bottle2 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)
        self.bottle3 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)

        self.conv2 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)
        # Widths were literal 512s; they now follow the concatenation width
        merged_size = 2 * output_size
        self.conv3 = ConvBNSiLU(input_size=merged_size, output_size=merged_size, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        branch = self.conv1(x)
        shortcut = self.conv2(x)
        for bottle in (self.bottle1, self.bottle2, self.bottle3):
            branch = bottle(branch)
        merged = torch.cat((branch, shortcut), dim=1)
        return self.conv3(merged)

# Test code

input_tensor = torch.randn(1, 512, 40, 40)

c13=C13(512,256,1,1,0)
# Forward pass
output_tensor = c13(input_tensor)
print(output_tensor.shape)

# Expected: the output shape matches the diagram

torch.Size([1, 512, 40, 40])


In [15]:
# C23 module
class C23(nn.Module):
    """C3-style block with three BottleNeck2 modules; outputs ``2 * output_size`` channels.

    The input is projected to ``output_size`` channels twice (branch and
    shortcut). The branch runs through three BottleNeck2 modules, which keep
    the channel count. Branch and shortcut are concatenated
    (``2 * output_size`` channels) and passed through a 1x1 ConvBNSiLU that
    keeps that width. With the defaults this reproduces the original fixed
    widths (1024 -> 1024).

    Args:
        input_size: Number of input channels.
        output_size: Width of the branch and shortcut projections.
        kernel_size, stride, padding: Settings of the two input projections
            (also handed to BottleNeck2, which does not use them).
    """

    def __init__(self, input_size=1024, output_size=512, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.conv1 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)

        self.bottle1 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)
        self.bottle2 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)
        self.bottle3 = BottleNeck2(output_size, output_size, kernel_size, stride, padding)

        self.conv2 = ConvBNSiLU(input_size, output_size, kernel_size, stride, padding)
        # Widths were literal 1024s; they now follow the concatenation width
        merged_size = 2 * output_size
        self.conv3 = ConvBNSiLU(input_size=merged_size, output_size=merged_size, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        branch = self.conv1(x)
        shortcut = self.conv2(x)
        for bottle in (self.bottle1, self.bottle2, self.bottle3):
            branch = bottle(branch)
        merged = torch.cat((branch, shortcut), dim=1)
        return self.conv3(merged)

# Test code

input_tensor = torch.randn(1, 1024, 20, 20)

c23=C23(1024,512,1,1,0)
# Forward pass
output_tensor = c23(input_tensor)
print(output_tensor.shape)

# Expected: the output shape matches the diagram

torch.Size([1, 1024, 20, 20])


In [ ]:
import torch
import torch.nn as nn


class YOLOv5Config:
    """Fixed settings read by the detection head: class count, anchors, strides."""

    def __init__(self):
        # Number of object classes
        self.num_classes = 80

        # Anchor box sizes, three (width, height) pairs per scale.
        # NOTE(review): the anchor groups are listed small-to-large while the
        # strides are listed 32, 16, 8 - confirm that index i of one list is
        # meant to pair with index i of the other.
        first_scale = [(10, 13), (16, 30), (33, 23)]
        second_scale = [(30, 61), (62, 45), (59, 119)]
        third_scale = [(116, 90), (156, 198), (373, 326)]
        self.anchor_sizes = [first_scale, second_scale, third_scale]

        # Strides of the feature maps at the different scales
        self.strides = [32, 16, 8]


# Convolution block: convolution, batch normalisation and LeakyReLU activation
class ConvBlock(nn.Module):
    """Bias-free Conv2d followed by BatchNorm2d and LeakyReLU(0.1).

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.activation(self.bn(self.conv(x)))


# Prediction module of the head network
class YOLOv5Head(nn.Module):
    """Per-scale prediction layers applied to three feature maps.

    Each scale uses a 3x3 ConvBlock that doubles the channels, followed by a
    1x1 ConvBlock producing ``num_anchors * (5 + num_classes)`` channels. The
    scales expect 1024, 512 and 256 input channels respectively.

    Args:
        config: Object exposing ``anchor_sizes`` (the length of its first entry
            is the number of anchors per scale) and ``num_classes``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_anchors_per_scale = len(config.anchor_sizes[0])
        self.num_classes = config.num_classes

        # Output channels per scale: anchors * (4 box values + objectness + classes)
        out_channels = self.num_anchors_per_scale * (5 + self.num_classes)

        # Prediction layers for the three scales
        self.conv2d1 = ConvBlock(1024, 1024 * 2, 3, 1, 1)
        self.conv2d2 = ConvBlock(1024 * 2, out_channels, 1)
        self.conv2d3 = ConvBlock(512, 512 * 2, 3, 1, 1)
        self.conv2d4 = ConvBlock(512 * 2, out_channels, 1)
        self.conv2d5 = ConvBlock(256, 256 * 2, 3, 1, 1)
        self.conv2d6 = ConvBlock(256 * 2, out_channels, 1)

    def forward(self, features):
        """
        :param features: sequence of feature maps; the first three entries are
            used, with 1024, 512 and 256 channels in that order
        :return: list of three predictions, one per scale, each shaped
            [batch_size, num_anchors * (5 + num_classes), height, width]
        """
        stages = (
            (self.conv2d1, self.conv2d2),
            (self.conv2d3, self.conv2d4),
            (self.conv2d5, self.conv2d6),
        )
        # The convolution output already has the documented layout, so the old
        # view(batch, channels, grid, grid) was a no-op for square maps and an
        # error for non-square ones; it is dropped.
        return [
            project(expand(features[index]))
            for index, (expand, project) in enumerate(stages)
        ]


# Usage example
if __name__ == "__main__":
    # Initialise the configuration
    config = YOLOv5Config()
    # Random tensors standing in for the three feature-map scales
    features = [torch.randn(1, 1024, 20, 20), torch.randn(1, 512, 40, 40), torch.randn(1, 256, 80, 80)]
    model = YOLOv5Head(config)
    predictions = model(features)
    for i, prediction in enumerate(predictions):
        print(f"Scale {i + 1} prediction shape: {prediction.shape}")

In [17]:
# Head network: plain convolution layer
class Conv(nn.Module):
    """Thin wrapper around a single ``nn.Conv2d`` (no normalisation, no activation).

    Args:
        input_size: Number of input channels.
        output_size: Number of output channels.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
    """

    def __init__(self, input_size, output_size, kernel_size, stride, padding):
        super().__init__()
        self.conv = nn.Conv2d(input_size, output_size, kernel_size, stride, padding)

    def forward(self, x):
        projected = self.conv(x)
        return projected

In [ ]:
#次にモデルの構築を開始

class YOLOv5(nn.Module):
    """Full network assembled from the C* blocks, SPPF, upsampling and output convs.

    The constructor arguments ``input_size``, ``output_size``, ``kernel_size``,
    ``stride`` and ``padding`` are accepted but not used; every sub-module is
    created with fixed sizes.

    ``forward`` returns three tensors with 255 channels each (the source
    comment gives the formula c = (5 + n_cls) x 3 with n_cls = 80), produced
    from the C12, C13 and C23 outputs respectively.
    """

    def __init__(self, input_size, output_size, kernel_size, stride, padding):
        super().__init__()
        # First column of the diagram
        self.c11 = C11(input_size=3, output_size=64, kernel_size=6, stride=2, padding=2)
        self.c21 = C21(input_size=256, output_size=128, kernel_size=1, stride=1, padding=0)
        self.c31 = C31(input_size=512, output_size=256, kernel_size=1, stride=1, padding=0)
        self.c41 = C41(input_size=1024, output_size=512, kernel_size=1, stride=1, padding=0)
        self.SPPF = SPPF(input_size=1024, output_size=1024, kernel_size=5, stride=1, padding=2)

        # Second column of the diagram, listed bottom to top; the ConvBNSiLU
        # modules are numbered from 1 in that same order
        self.conv1 = ConvBNSiLU(input_size=1024, output_size=512, kernel_size=1, stride=1, padding=0)
        self.up1 = Upsample(input_size=512, output_size=512, scale_factor=2)
        self.c22 = C22(input_size=1024, output_size=512, kernel_size=1, stride=1, padding=0)
        self.conv2 = ConvBNSiLU(input_size=512, output_size=256, kernel_size=1, stride=1, padding=0)
        self.up2 = Upsample(input_size=256, output_size=256, scale_factor=2)
        self.c12 = C12(input_size=512, output_size=256, kernel_size=1, stride=1, padding=0)

        # Third column of the diagram
        self.conv3 = ConvBNSiLU(input_size=256, output_size=256, kernel_size=3, stride=2, padding=1)
        self.c13 = C13(input_size=512, output_size=256, kernel_size=1, stride=1, padding=0)
        self.conv4 = ConvBNSiLU(input_size=512, output_size=512, kernel_size=3, stride=2, padding=1)
        self.c23 = C23(input_size=1024, output_size=512, kernel_size=1, stride=1, padding=0)

        # Final convolutions, numbered 1-3 from top to bottom of the diagram.
        # Output channels: (5 + n_cls) * 3 = (5 + 80) * 3 = 255
        self.Conv1 = Conv(input_size=256, output_size=255, kernel_size=1, stride=1, padding=0)
        self.Conv2 = Conv(input_size=512, output_size=255, kernel_size=1, stride=1, padding=0)
        self.Conv3 = Conv(input_size=1024, output_size=255, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        # First column: keep the C21 and C31 block outputs for later concatenation
        x = self.c11(x)
        c21_skip, x = self.c21(x)
        c31_skip, x = self.c31(x)
        x = self.c41(x)
        x = self.SPPF(x)

        # Second column: upsample and merge with the saved block outputs
        lateral_low = self.conv1(x)  # reused by the third column
        x = self.up1(lateral_low)
        x = torch.cat((x, c31_skip), dim=1)
        x = self.c22(x)
        lateral_mid = self.conv2(x)  # reused by the third column
        x = self.up2(lateral_mid)
        x = torch.cat((x, c21_skip), dim=1)
        feat_top = self.c12(x)

        # Third column: downsample and merge with the second-column laterals
        x = self.conv3(feat_top)
        x = torch.cat((x, lateral_mid), dim=1)
        feat_mid = self.c13(x)

        x = self.conv4(feat_mid)
        x = torch.cat((x, lateral_low), dim=1)
        feat_low = self.c23(x)

        pred_top = self.Conv1(feat_top)
        pred_mid = self.Conv2(feat_mid)
        pred_low = self.Conv3(feat_low)
        return pred_top, pred_mid, pred_low


# Test code
input_tensor = torch.randn(1, 3, 640, 640)
# Forward pass (the constructor arguments are not used by YOLOv5)
yolov5=YOLOv5(3,64,6,2,2)
output_1,output_2,output_3 = yolov5(input_tensor)
print(output_1.shape)
print(output_2.shape)
print(output_3.shape)
# Expected: the output shapes match the diagram
# The calls below print the full tensors, not just their shapes
print(output_1)
print(output_2)
print(output_3)

In [19]:
import torch
import torch.nn as nn

class FeaturePyramidFusion(nn.Module):
    """Fuse a list of feature maps top-down into a common channel width.

    Every input map is first projected to ``out_channels`` with its own 1x1
    convolution.  Fusion then walks the list from the LAST entry to the first:
    the last entry is only smoothed by its 3x3 convolution, and every earlier
    entry is added to the 2x nearest-neighbour upsampled result of the entry
    after it before being smoothed.  Because of the fixed 2x upsample, entry
    ``i`` must have exactly twice the height and width of entry ``i + 1``
    (i.e. the list is ordered from finest to coarsest resolution).
    """

    def __init__(self, in_channels_list, out_channels):
        """Create the projection, upsampling and smoothing layers.

        Args:
            in_channels_list (list): Channel count of each input feature map;
                its length fixes how many maps ``forward`` accepts.
            out_channels (int): Channel count shared by all output maps.
        """
        super(FeaturePyramidFusion, self).__init__()
        self.num_features = len(in_channels_list)  # number of input maps
        # One 1x1 convolution per input map to bring it to out_channels
        self.conv1x1_modules = nn.ModuleList([
            nn.Conv2d(in_channels, out_channels, kernel_size=1)
            for in_channels in in_channels_list
        ])
        # Nearest-neighbour 2x upsampling used between neighbouring levels
        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
        # One 3x3 convolution per level, applied after the (optional) addition
        self.conv3x3_modules = nn.ModuleList([
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
            for _ in range(self.num_features)
        ])

    def forward(self, feature_maps):
        """Fuse the given feature maps.

        Args:
            feature_maps (list): Tensors in the same order as the
                ``in_channels_list`` given at construction time.

        Returns:
            list: Fused tensors, same length and order as ``feature_maps``.

        Raises:
            ValueError: If the number of maps, or the channel count of any
                map, differs from what the module was built for.
        """
        if len(feature_maps) != self.num_features:
            raise ValueError(f"Expected {self.num_features} feature maps, but got {len(feature_maps)}")
        for i, (feature_map, projection) in enumerate(zip(feature_maps, self.conv1x1_modules)):
            # Conv2d records its expected input width; no need to inspect the weights
            expected_channels = projection.in_channels
            if feature_map.shape[1] != expected_channels:
                raise ValueError(f"Expected input feature map {i} to have {expected_channels} channels, but got {feature_map.shape[1]}")

        # Project every map to the shared channel width
        adjusted_feature_maps = [
            projection(feature_map)
            for projection, feature_map in zip(self.conv1x1_modules, feature_maps)
        ]

        # Walk from the last (coarsest) level to the first, writing each
        # result straight into its final slot so no reversal is needed.
        fused_feature_maps = [None] * self.num_features
        coarser_result = None
        for i in reversed(range(self.num_features)):
            x = adjusted_feature_maps[i]
            if coarser_result is not None:
                # Element-wise sum with the upsampled result of level i + 1
                x = x + self.upsample(coarser_result)
            coarser_result = self.conv3x3_modules[i](x)
            fused_feature_maps[i] = coarser_result

        return fused_feature_maps

# Example input: each entry of in_channels_list must equal the channel count
# of the matching tensor (output_1, output_2, output_3) in feature_maps

feature_maps = [output_1, output_2, output_3]
in_channels_list = [255] * 3
out_channels = 255

# Build the feature pyramid fusion module and run the fusion
fpn = FeaturePyramidFusion(in_channels_list, out_channels)
fused_features = fpn(feature_maps)

# Print the shape and the values of every fused feature map
for i, feature in enumerate(fused_features):
    print(
        f"Fused feature map {i + 1} shape: {feature.shape}",
        f"Fused feature map {i + 1} tensor: {feature}",
        sep="\n",
    )

Fused feature map 1 shape: torch.Size([1, 255, 80, 80])
Fused feature map 1 tensor: tensor([[[[-0.0309, -0.0648, -0.0771,  ..., -0.0580, -0.1567, -0.1000],
          [-0.1346, -0.1231, -0.2336,  ..., -0.0989,  0.0141, -0.1184],
          [-0.0729, -0.0732, -0.0919,  ...,  0.0997, -0.0691, -0.0659],
          ...,
          [-0.0431, -0.1052, -0.0916,  ..., -0.0715, -0.0924, -0.0861],
          [-0.0068, -0.1175, -0.0900,  ..., -0.0375, -0.1245,  0.0005],
          [-0.0206, -0.0479, -0.0649,  ..., -0.0866, -0.1584, -0.1003]],

         [[ 0.0088, -0.0038,  0.0220,  ...,  0.1175, -0.0582,  0.0062],
          [ 0.0516, -0.0150,  0.0134,  ..., -0.0050,  0.1025, -0.0260],
          [-0.0165,  0.0262,  0.0409,  ..., -0.0104, -0.1004, -0.0141],
          ...,
          [ 0.0305,  0.0322, -0.0217,  ...,  0.0755, -0.0186, -0.0485],
          [ 0.0181, -0.0721, -0.0223,  ..., -0.0173, -0.0760,  0.0109],
          [-0.0059, -0.0110,  0.0185,  ..., -0.0223, -0.0538, -0.0657]],

         [[ 0.0265

In [20]:
# Build a fresh model with the same constructor arguments as the smoke test
# and print its module hierarchy
model=YOLOv5(3,64,6,2,2)
print(model)

YOLOv5(
  (c11): C11(
    (P1): ConvBNSiLU(
      (conv): Conv2d(3, 64, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (P2): ConvBNSiLU(
      (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (conv1): ConvBNSiLU(
      (conv): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1))
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (bottle1): BottleNeck1(
      (conv1): ConvBNSiLU(
        (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (conv2): ConvBNSiLU(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride

In [ ]:
# 以下は未完成です、readmeドキュメントを参照してください
import torch
import torch.nn as nn
import torch.nn.functional as F

# YOLOv5の損失関数クラスを定義
class YOLOv5Loss(nn.Module):
    """Sum-reduced detection loss over three prediction scales.

    Every prediction and target tensor must have the layout
    ``(batch, num_anchors * (5 + num_classes), height, width)`` with the
    channel axis ordered anchor-major, so that it can be viewed as
    ``(batch, num_anchors, 5 + num_classes, height, width)``.  Within one
    anchor, channels 0-3 are the box, channel 4 is the objectness and
    channels 5.. are the per-class scores.

    Per scale the loss is the sum over all anchors and cells of::

        has_obj * (obj_bce + class_ce + lambda_coord * box_mse)
        + lambda_noobj * no_obj * obj_bce

    where ``has_obj`` / ``no_obj`` mark cells whose target objectness is
    ``> 0`` / ``== 0``.  The terms are added (YOLOv1-style) rather than
    multiplied, so box and class gradients do not vanish when the objectness
    prediction is already correct.
    """

    def __init__(self, num_classes, lambda_coord=5, lambda_noobj=0.5):
        """Store the loss weights.

        Args:
            num_classes: Number of object classes per anchor.
            lambda_coord: Weight of the box regression term.
            lambda_noobj: Weight of the objectness term on empty cells.
        """
        super(YOLOv5Loss, self).__init__()
        self.num_classes = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        self.num_anchors = 3  # anchors predicted at every grid cell

    def _per_anchor(self, tensor, name):
        """View a (B, A*(5+C), H, W) tensor as (B, A, 5+C, H, W)."""
        fields = 5 + self.num_classes
        expected_channels = self.num_anchors * fields
        if tensor.dim() != 4 or tensor.shape[1] != expected_channels:
            raise ValueError(
                f"{name} must have shape (batch, {expected_channels}, H, W), "
                f"but got {tuple(tensor.shape)}"
            )
        batch, _, height, width = tensor.shape
        return tensor.reshape(batch, self.num_anchors, fields, height, width)

    def _scale_loss(self, output, target):
        """Return the scalar loss of one prediction scale."""
        pred = self._per_anchor(output, "output")
        true = self._per_anchor(target, "target").to(pred.dtype)
        if pred.shape != true.shape:
            raise ValueError(
                f"output and target shapes differ: {tuple(output.shape)} vs {tuple(target.shape)}"
            )

        obj_true = true[:, :, 4]
        has_obj = (obj_true > 0).to(pred.dtype)
        no_obj = (obj_true == 0).to(pred.dtype)

        # Objectness: binary cross entropy on the raw logits, per anchor/cell
        obj_loss = F.binary_cross_entropy_with_logits(pred[:, :, 4], obj_true, reduction='none')
        # Box: squared error summed over the four coordinates
        box_loss = F.mse_loss(pred[:, :, :4], true[:, :, :4], reduction='none').sum(dim=2)
        # Class: cross entropy against the (soft / one-hot) target distribution
        class_loss = -(true[:, :, 5:] * F.log_softmax(pred[:, :, 5:], dim=2)).sum(dim=2)

        per_cell = (has_obj * (obj_loss + class_loss + self.lambda_coord * box_loss)
                    + self.lambda_noobj * no_obj * obj_loss)
        return per_cell.sum()

    def forward(self, output1, output2, output3, targets1, targets2, targets3):
        """Return the summed loss of the three scales as a scalar tensor.

        Args:
            output1, output2, output3: Raw predictions of the three scales.
            targets1, targets2, targets3: Targets with the same shapes as the
                matching predictions.

        Raises:
            ValueError: If a tensor does not have ``num_anchors * (5 +
                num_classes)`` channels or a prediction/target pair differs
                in shape.
        """
        return (self._scale_loss(output1, targets1)
                + self._scale_loss(output2, targets2)
                + self._scale_loss(output3, targets3))

In [ ]:
import torch
import torch.nn as nn
import torch.optim as optim


def _encode_targets(label, grid_h, grid_w, num_anchors, num_classes):
    """Rasterise one image's label rows onto a ``grid_h x grid_w`` target grid.

    ``label`` holds one row ``[class_id, x_center, y_center, width, height]``
    per object (as produced by ``YOLODataset``); an empty tensor means "no
    objects".  Coordinates are assumed to be normalised to [0, 1] (YOLO txt
    format) -- TODO confirm for other datasets.  The cell containing the box
    centre is marked positive for every anchor (there are no anchor priors to
    choose between), its box channels receive the four label coordinates and
    its class channels a one-hot vector.  If several objects fall into the
    same cell the last one wins.

    Returns a CPU tensor of shape (1, num_anchors * (5 + num_classes), grid_h, grid_w).
    """
    target = torch.zeros((1, num_anchors, num_classes + 5, grid_h, grid_w))
    if label.numel() > 0:
        if label.dim() != 2 or label.shape[1] != 5:
            raise ValueError(f"Expected label of shape (N, 5), but got {tuple(label.shape)}")
        for class_id, x_center, y_center, width, height in label.tolist():
            class_id = int(class_id)
            if not 0 <= class_id < num_classes:
                raise ValueError(f"Class id {class_id} is outside [0, {num_classes})")
            # Clamp so a centre lying exactly on the right/bottom edge stays inside
            col = min(max(int(x_center * grid_w), 0), grid_w - 1)
            row = min(max(int(y_center * grid_h), 0), grid_h - 1)
            cell = target[0, :, :, row, col]  # view of shape (anchors, 5 + classes)
            cell.zero_()
            cell[:, :4] = torch.tensor([x_center, y_center, width, height])
            cell[:, 4] = 1.0
            cell[:, 5 + class_id] = 1.0
    return target.view((1, num_anchors * (num_classes + 5), grid_h, grid_w))


def train(model, train_loader, loss_fn, optimizer, device, save_path='yolov5_model.pt', num_epochs=8):
    """Train ``model`` and save its ``state_dict`` to ``save_path``.

    Args:
        model: Module returning three prediction tensors of shape
            (batch, anchors * (5 + classes), H, W) for a batch of images.
        train_loader: Iterable of ``(images, labels)`` where ``labels`` is a
            sequence with one label tensor per image.
        loss_fn: Callable ``loss_fn(o1, o2, o3, t1, t2, t3)`` exposing
            ``num_classes`` (and optionally ``num_anchors``, default 3).
        optimizer: Optimizer over the model parameters.
        device: Device the images, predictions and targets live on.
        save_path: File the final ``state_dict`` is written to.
        num_epochs: Number of passes over ``train_loader``.

    Raises:
        ValueError: If a label tensor is not (N, 5) or holds a class id
            outside ``[0, num_classes)``.
    """
    model.train()  # enable training behaviour (dropout, batch-norm updates)
    num_classes = loss_fn.num_classes
    num_anchors = getattr(loss_fn, 'num_anchors', 3)
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for batch_idx, (images, labels) in enumerate(train_loader):
            if len(labels) == 0:
                # Nothing to learn from; avoids calling backward() on the int 0
                continue
            images = images.to(device)

            # Forward pass
            output1, output2, output3 = model(images)
            outputs = (output1, output2, output3)

            batch_loss = 0
            for i, label in enumerate(labels):
                # Grid sizes come from the predictions themselves, so the
                # targets always match whatever resolution the model emits.
                sample_outputs = [out[i].unsqueeze(0) for out in outputs]
                sample_targets = [
                    _encode_targets(label, out.shape[-2], out.shape[-1],
                                    num_anchors, num_classes).to(device)
                    for out in outputs
                ]
                batch_loss = batch_loss + loss_fn(*sample_outputs, *sample_targets)

            optimizer.zero_grad()  # clear gradients of the previous step
            batch_loss.backward()  # back-propagate
            optimizer.step()  # update the parameters

            total_loss += batch_loss.item()
            num_batches += 1

            # Report and reset the running average every 10 batches
            if (batch_idx + 1) % 10 == 0:
                avg_loss = total_loss / num_batches
                print(f"Epoch {epoch + 1}, Batch {batch_idx + 1}: Average Loss = {avg_loss}")
                total_loss = 0.0
                num_batches = 0

    # Persist the trained weights
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

def main():
    """Build the model, loss, optimizer and training loader, then train.

    Reads ``train_image_dir``, ``train_label_dir`` and ``classes`` from the
    enclosing (notebook) scope.
    NOTE(review): none of these three names is defined in this function;
    confirm an earlier cell sets them before this one runs.
    """
    num_classes = 80
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Model, moved to the selected device
    model = YOLOv5(3, 64, 6, 2, 2)
    model.to(device)

    # Loss and optimizer
    loss_fn = YOLOv5Loss(num_classes=num_classes, lambda_coord=5, lambda_noobj=0.5)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Dataset and loader; custom_collate_fn is passed so per-image label
    # tensors of different lengths can be batched
    train_dataset = YOLODataset(
        train_image_dir,
        train_label_dir,
        img_size=(640, 640),
        classes=classes,
    )
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=4,
        shuffle=True,
        collate_fn=custom_collate_fn,
    )

    # Run training with the default save path and epoch count
    train(model, train_dataloader, loss_fn, optimizer, device)


# Run training when this cell/script is executed directly
if __name__ == "__main__":
    main()

In [23]:
# import torch
# import torch.nn as nn
# import torch.optim as optim


# def test(model, test_loader, loss_fn, device):
#     model.eval()  # 模型设置为评估模式
#     total_loss = 0.0
#     num_batches = 0
#     num_classes = 80
#     with torch.no_grad():  # 不计算梯度
#         for batch_idx, (images, labels) in enumerate(test_loader):
#             images = images.to(device)

#             # 模型前向传播
#             output1, output2, output3 = model(images)

#             batch_loss = 0
#             for i in range(len(labels)):
#                 label = labels[i].to(device)
#                 num_objects = label.size(0)
#                 num_anchors = 3
#                 num_classes = loss_fn.num_classes

#                 # 调整标签形状以匹配损失函数的期望输入
#                 target1 = torch.zeros((1, num_anchors, num_classes + 5, 80, 80)).to(device)
#                 target1[..., 5:(5 + num_classes), :num_objects, :num_objects] = torch.ones((num_classes, num_objects, num_objects))

#                 target2 = torch.zeros((1, num_anchors, num_classes + 5, 40, 40)).to(device)
#                 target2[..., 5:(5 + num_classes), :num_objects, :num_objects] = torch.ones((num_classes, num_objects, num_objects))

#                 target3 = torch.zeros((1, num_anchors, num_classes + 5, 20, 20)).to(device)
#                 target3[..., 5:(5 + num_classes), :num_objects, :num_objects] = torch.ones((num_classes, num_objects, num_objects))

#                 # 计算损失
#                 loss = loss_fn(output1[i].unsqueeze(0), output2[i].unsqueeze(0), output3[i].unsqueeze(0),
#                              target1, target2, target3)
#                 batch_loss += loss

#             total_loss += batch_loss.item()
#             num_batches += 1

#             if (batch_idx + 1) % 10 == 0:
#                 avg_loss = total_loss / num_batches
#                 print(f"Test Batch {batch_idx + 1}: Average Loss = {avg_loss}")
#                 total_loss = 0.0
#                 num_batches = 0


# def main():
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     num_classes = 80
#     # 初始化模型
#     model = YOLOv5(3, 64, 6, 2, 2)
#     model.to(device)
#     # 定义损失函数
#     loss_fn = YOLOv5Loss(num_classes=80)

#     # 加载保存的模型
#     model.load_state_dict(torch.load('yolov5_model.pt', map_location=device))
#     model.eval()

#     # 定义优化器
#     optimizer = optim.Adam(model.parameters(), lr=1e-3)

#     # 调用测试函数
#     test(model, test_dataloader, loss_fn, device)


# if __name__ == "__main__":
#     main()