In [1]:
import json
from PIL import Image
import torch
from torch.utils.data import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CustomCocoDataset(Dataset):
    """Object-detection dataset backed by a COCO-style JSON annotation file.

    The annotation file is expected to contain:
      * ``images``: entries with at least ``id`` and ``file_name``;
      * ``annotations``: entries with ``image_id``, ``bbox`` and ``category_id``,
        and optionally ``area`` and ``iscrowd``.

    Each sample is produced by calling ``processor`` on the RGB image and its
    annotations. Tensors are returned exactly as the processor produced them:
    no device transfer happens here, so the dataset does not depend on any
    global ``device`` variable. The training loop (for example the Hugging Face
    ``Trainer``) is expected to move each batch to the model's device.
    """

    def __init__(self, images_dir, annotations_file, processor):
        """Load the annotation file and index it by image id.

        Args:
            images_dir: Directory that contains the image files.
            annotations_file: Path to the COCO-style JSON file.
            processor: Callable invoked as
                ``processor(images=..., annotations=..., return_tensors="pt")``.
        """
        self.images_dir = images_dir
        self.processor = processor

        # Read explicitly as UTF-8 so parsing does not depend on the platform's
        # locale encoding (e.g. non-ASCII file names on Windows).
        with open(annotations_file, 'r', encoding='utf-8') as f:
            coco = json.load(f)

        # image_id -> image record
        self.image_id_to_info = {img['id']: img for img in coco['images']}

        # image_id -> list of annotation records for that image
        self.annotations_per_image = {}
        for ann in coco['annotations']:
            self.annotations_per_image.setdefault(ann['image_id'], []).append(ann)

        # Fixed ordering of image ids used for integer indexing
        self.image_ids = list(self.image_id_to_info.keys())

    def __len__(self):
        """Return the number of images listed in the annotation file."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return the processed sample at position ``idx``.

        Returns:
            dict with
              * ``"pixel_values"``: the processor's ``pixel_values`` with the
                leading dimension squeezed out;
              * ``"labels"``: a plain ``dict`` copy of the first entry of the
                processor's ``labels``.
        """
        image_id = self.image_ids[idx]
        img_info = self.image_id_to_info[image_id]

        # Load the image and force three RGB channels
        img_path = f"{self.images_dir}/{img_info['file_name']}"
        image = Image.open(img_path).convert("RGB")

        # Images without annotations yield an empty list.
        annotations = [
            {
                "bbox": ann["bbox"],  # [x, y, width, height]
                "category_id": ann["category_id"],
                # Fall back to width * height when 'area' is absent
                "area": ann.get("area", ann["bbox"][2] * ann["bbox"][3]),
                "iscrowd": ann.get("iscrowd", 0),
            }
            for ann in self.annotations_per_image.get(image_id, [])
        ]

        annotation_dict = {
            "image_id": image_id,
            "annotations": annotations
        }

        encoding = self.processor(images=image, annotations=annotation_dict, return_tensors="pt")

        # Drop the leading batch dimension added by the processor.
        pixel_values = encoding["pixel_values"].squeeze(0)

        # Copy into a plain dict; tensors are left on the device the processor
        # produced them on so they can be batched, pinned, or built in workers.
        labels = dict(encoding["labels"][0].items())

        return {
            "pixel_values": pixel_values,
            "labels": labels,
        }

In [2]:
# --- 2. collate_fn for DataLoader and Trainer ---

def collate_fn(batch):
    """Combine a list of dataset samples into one batch dictionary.

    Args:
        batch: Sequence of dicts, each with a ``"pixel_values"`` tensor and a
            ``"labels"`` entry.

    Returns:
        dict with ``"pixel_values"`` stacked along a new leading dimension and
        ``"labels"`` as a list holding each sample's labels unchanged (they
        are not stacked because the number of objects differs per image).
    """
    images, targets = [], []
    for sample in batch:
        images.append(sample["pixel_values"])
        targets.append(sample["labels"])
    return {"pixel_values": torch.stack(images), "labels": targets}

In [3]:
from transformers import Trainer, TrainingArguments




First run: the original training configuration (8 epochs, checkpointing and evaluation every 10,000 steps).

In [4]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- Output locations ---
    output_dir="C:/Users/odara/Downloads/data/rtdetr_fine_tuning",
    logging_dir="C:/Users/odara/Downloads/data/rtdetr_fine_tuning/logs",

    # --- Optimisation ---
    num_train_epochs=8,
    learning_rate=2e-5,
    weight_decay=0.01,  # regularisation
    gradient_accumulation_steps=1,

    # --- Batching and data loading ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    dataloader_num_workers=0,
    dataloader_pin_memory=False,

    # --- Logging ---
    logging_strategy="steps",
    logging_steps=100,
    disable_tqdm=False,

    # --- Checkpointing: every 10000 steps, keeping a single checkpoint ---
    save_strategy="steps",
    save_steps=10000,
    save_total_limit=1,

    # --- Evaluation on the same 10000-step cadence; lowest loss is "best" ---
    eval_strategy="steps",
    eval_steps=10000,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,

    # Mixed precision is intentionally not enabled (fp16 is left unset).
)

Continue fine-tuning for 2 more epochs (10 in total), now saving and evaluating once per epoch.

In [4]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- Output locations ---
    output_dir="C:/Users/odara/Downloads/data/rtdetr_fine_tuning",
    logging_dir="C:/Users/odara/Downloads/data/rtdetr_fine_tuning/logs",

    # --- Optimisation ---
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,  # regularisation
    gradient_accumulation_steps=1,

    # --- Batching and data loading ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    dataloader_num_workers=0,
    dataloader_pin_memory=False,

    # --- Logging ---
    logging_strategy="steps",
    logging_steps=100,
    disable_tqdm=False,

    # --- Checkpointing: once per epoch, keeping a single checkpoint ---
    save_strategy="epoch",
    save_total_limit=1,

    # --- Evaluation once per epoch; lowest loss is "best" ---
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,

    # Mixed precision is intentionally not enabled (fp16 is left unset).
)

In [5]:
from transformers import RTDetrImageProcessor, RTDetrForObjectDetection

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Image processor from the pretrained checkpoint, requesting a 512x512 size.
processor = RTDetrImageProcessor.from_pretrained(
    "PekingU/rtdetr_r101vd_coco_o365",
    size=dict(height=512, width=512),
)

# NOTE(review): the model is loaded with the checkpoint's own configuration;
# confirm its label set matches the category_id values in the custom annotations.
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r101vd_coco_o365")

# Left as the last expression so the notebook displays the module structure.
model.to(device)

RTDetrForObjectDetection(
  (model): RTDetrModel(
    (backbone): RTDetrConvEncoder(
      (model): RTDetrResNetBackbone(
        (embedder): RTDetrResNetEmbeddings(
          (embedder): Sequential(
            (0): RTDetrResNetConvLayer(
              (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
              (normalization): RTDetrFrozenBatchNorm2d()
              (activation): ReLU()
            )
            (1): RTDetrResNetConvLayer(
              (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (normalization): RTDetrFrozenBatchNorm2d()
              (activation): ReLU()
            )
            (2): RTDetrResNetConvLayer(
              (convolution): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (normalization): RTDetrFrozenBatchNorm2d()
              (activation): ReLU()
            )
          )
          (pooler): MaxPool2d(

In [6]:
# Build the training and validation datasets (in that order) with the same processor.
# Each pair is (image directory, COCO annotation file).
train_dataset, val_dataset = (
    CustomCocoDataset(images_dir=img_dir, annotations_file=ann_file, processor=processor)
    for img_dir, ann_file in (
        (
            r"C:\Users\odara\Downloads\data\train_new\images",
            r"C:\Users\odara\Downloads\data\rtdetr_labels\instances_train.json",
        ),
        (
            r"C:\Users\odara\Downloads\data\val\images",
            r"C:\Users\odara\Downloads\data\rtdetr_labels\instances_val.json",
        ),
    )
)

In [7]:
from transformers import Trainer

# Wire the model, configuration, and data together.
trainer = Trainer(
    # What is trained and how
    model=model,
    args=training_args,
    # How samples are batched and which processor accompanies the model
    data_collator=collate_fn,
    processing_class=processor,
    # Data splits
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Initial training run: no checkpoint is passed, so nothing is resumed.
trainer.train()

Step,Training Loss,Validation Loss
10000,13.2184,11.252387
20000,12.9641,10.821891
30000,12.4773,10.735963
40000,12.4756,10.736838
50000,12.3814,10.687516
60000,12.0242,10.510774
70000,12.0136,10.707314


In [None]:
# Resume the interrupted run from the checkpoint directory `checkpoint-70000`.
trainer.train(resume_from_checkpoint="C:/Users/odara/Downloads/data/rtdetr_fine_tuning/checkpoint-70000")

There were missing keys in the checkpoint model loaded: ['class_embed.0.weight', 'class_embed.0.bias', 'class_embed.1.weight', 'class_embed.1.bias', 'class_embed.2.weight', 'class_embed.2.bias', 'class_embed.3.weight', 'class_embed.3.bias', 'class_embed.4.weight', 'class_embed.4.bias', 'class_embed.5.weight', 'class_embed.5.bias', 'bbox_embed.0.layers.0.weight', 'bbox_embed.0.layers.0.bias', 'bbox_embed.0.layers.1.weight', 'bbox_embed.0.layers.1.bias', 'bbox_embed.0.layers.2.weight', 'bbox_embed.0.layers.2.bias', 'bbox_embed.1.layers.0.weight', 'bbox_embed.1.layers.0.bias', 'bbox_embed.1.layers.1.weight', 'bbox_embed.1.layers.1.bias', 'bbox_embed.1.layers.2.weight', 'bbox_embed.1.layers.2.bias', 'bbox_embed.2.layers.0.weight', 'bbox_embed.2.layers.0.bias', 'bbox_embed.2.layers.1.weight', 'bbox_embed.2.layers.1.bias', 'bbox_embed.2.layers.2.weight', 'bbox_embed.2.layers.2.bias', 'bbox_embed.3.layers.0.weight', 'bbox_embed.3.layers.0.bias', 'bbox_embed.3.layers.1.weight', 'bbox_embed.3.l

Step,Training Loss,Validation Loss
80000,11.6397,10.533495
90000,11.6942,10.366463
100000,11.7139,10.330491
110000,11.7388,10.34814
120000,11.4264,10.360243
130000,11.2234,10.289572


In [8]:
# Resume the interrupted run from the checkpoint directory `checkpoint-130000`.
trainer.train(resume_from_checkpoint="C:/Users/odara/Downloads/data/rtdetr_fine_tuning/checkpoint-130000")

There were missing keys in the checkpoint model loaded: ['class_embed.0.weight', 'class_embed.0.bias', 'class_embed.1.weight', 'class_embed.1.bias', 'class_embed.2.weight', 'class_embed.2.bias', 'class_embed.3.weight', 'class_embed.3.bias', 'class_embed.4.weight', 'class_embed.4.bias', 'class_embed.5.weight', 'class_embed.5.bias', 'bbox_embed.0.layers.0.weight', 'bbox_embed.0.layers.0.bias', 'bbox_embed.0.layers.1.weight', 'bbox_embed.0.layers.1.bias', 'bbox_embed.0.layers.2.weight', 'bbox_embed.0.layers.2.bias', 'bbox_embed.1.layers.0.weight', 'bbox_embed.1.layers.0.bias', 'bbox_embed.1.layers.1.weight', 'bbox_embed.1.layers.1.bias', 'bbox_embed.1.layers.2.weight', 'bbox_embed.1.layers.2.bias', 'bbox_embed.2.layers.0.weight', 'bbox_embed.2.layers.0.bias', 'bbox_embed.2.layers.1.weight', 'bbox_embed.2.layers.1.bias', 'bbox_embed.2.layers.2.weight', 'bbox_embed.2.layers.2.bias', 'bbox_embed.3.layers.0.weight', 'bbox_embed.3.layers.0.bias', 'bbox_embed.3.layers.1.weight', 'bbox_embed.3.l

Step,Training Loss,Validation Loss
140000,11.3084,10.220606
150000,11.2536,10.339017
160000,11.2221,10.220385
170000,11.2854,10.128933
180000,10.9458,10.190219
190000,10.8249,10.263694
200000,10.8831,10.214276
210000,11.145,10.118624


There were missing keys in the checkpoint model loaded: ['class_embed.0.weight', 'class_embed.0.bias', 'class_embed.1.weight', 'class_embed.1.bias', 'class_embed.2.weight', 'class_embed.2.bias', 'class_embed.3.weight', 'class_embed.3.bias', 'class_embed.4.weight', 'class_embed.4.bias', 'class_embed.5.weight', 'class_embed.5.bias', 'bbox_embed.0.layers.0.weight', 'bbox_embed.0.layers.0.bias', 'bbox_embed.0.layers.1.weight', 'bbox_embed.0.layers.1.bias', 'bbox_embed.0.layers.2.weight', 'bbox_embed.0.layers.2.bias', 'bbox_embed.1.layers.0.weight', 'bbox_embed.1.layers.0.bias', 'bbox_embed.1.layers.1.weight', 'bbox_embed.1.layers.1.bias', 'bbox_embed.1.layers.2.weight', 'bbox_embed.1.layers.2.bias', 'bbox_embed.2.layers.0.weight', 'bbox_embed.2.layers.0.bias', 'bbox_embed.2.layers.1.weight', 'bbox_embed.2.layers.1.bias', 'bbox_embed.2.layers.2.weight', 'bbox_embed.2.layers.2.bias', 'bbox_embed.3.layers.0.weight', 'bbox_embed.3.layers.0.bias', 'bbox_embed.3.layers.1.weight', 'bbox_embed.3.l

TrainOutput(global_step=210944, training_loss=4.2538691924034975, metrics={'train_runtime': 101197.4419, 'train_samples_per_second': 4.169, 'train_steps_per_second': 2.084, 'total_flos': 1.5236061080832914e+20, 'train_loss': 4.2538691924034975, 'epoch': 8.0})

In [8]:
# Continue training from the checkpoint directory `checkpoint-210000`.
# NOTE(review): presumably run with the 10-epoch `training_args` defined above — confirm
# the Trainer was rebuilt with those arguments before this cell was executed.
trainer.train(resume_from_checkpoint="C:/Users/odara/Downloads/data/rtdetr_fine_tuning/checkpoint-210000")

There were missing keys in the checkpoint model loaded: ['class_embed.0.weight', 'class_embed.0.bias', 'class_embed.1.weight', 'class_embed.1.bias', 'class_embed.2.weight', 'class_embed.2.bias', 'class_embed.3.weight', 'class_embed.3.bias', 'class_embed.4.weight', 'class_embed.4.bias', 'class_embed.5.weight', 'class_embed.5.bias', 'bbox_embed.0.layers.0.weight', 'bbox_embed.0.layers.0.bias', 'bbox_embed.0.layers.1.weight', 'bbox_embed.0.layers.1.bias', 'bbox_embed.0.layers.2.weight', 'bbox_embed.0.layers.2.bias', 'bbox_embed.1.layers.0.weight', 'bbox_embed.1.layers.0.bias', 'bbox_embed.1.layers.1.weight', 'bbox_embed.1.layers.1.bias', 'bbox_embed.1.layers.2.weight', 'bbox_embed.1.layers.2.bias', 'bbox_embed.2.layers.0.weight', 'bbox_embed.2.layers.0.bias', 'bbox_embed.2.layers.1.weight', 'bbox_embed.2.layers.1.bias', 'bbox_embed.2.layers.2.weight', 'bbox_embed.2.layers.2.bias', 'bbox_embed.3.layers.0.weight', 'bbox_embed.3.layers.0.bias', 'bbox_embed.3.layers.1.weight', 'bbox_embed.3.l

Epoch,Training Loss,Validation Loss
8,10.7609,10.198097
9,10.849,10.234483
10,10.6697,10.179273


There were missing keys in the checkpoint model loaded: ['class_embed.0.weight', 'class_embed.0.bias', 'class_embed.1.weight', 'class_embed.1.bias', 'class_embed.2.weight', 'class_embed.2.bias', 'class_embed.3.weight', 'class_embed.3.bias', 'class_embed.4.weight', 'class_embed.4.bias', 'class_embed.5.weight', 'class_embed.5.bias', 'bbox_embed.0.layers.0.weight', 'bbox_embed.0.layers.0.bias', 'bbox_embed.0.layers.1.weight', 'bbox_embed.0.layers.1.bias', 'bbox_embed.0.layers.2.weight', 'bbox_embed.0.layers.2.bias', 'bbox_embed.1.layers.0.weight', 'bbox_embed.1.layers.0.bias', 'bbox_embed.1.layers.1.weight', 'bbox_embed.1.layers.1.bias', 'bbox_embed.1.layers.2.weight', 'bbox_embed.1.layers.2.bias', 'bbox_embed.2.layers.0.weight', 'bbox_embed.2.layers.0.bias', 'bbox_embed.2.layers.1.weight', 'bbox_embed.2.layers.1.bias', 'bbox_embed.2.layers.2.weight', 'bbox_embed.2.layers.2.bias', 'bbox_embed.3.layers.0.weight', 'bbox_embed.3.layers.0.bias', 'bbox_embed.3.layers.1.weight', 'bbox_embed.3.l

TrainOutput(global_step=263680, training_loss=2.2046716252576957, metrics={'train_runtime': 62786.4567, 'train_samples_per_second': 8.399, 'train_steps_per_second': 4.2, 'total_flos': 1.9045076351041143e+20, 'train_loss': 2.2046716252576957, 'epoch': 10.0})

In [9]:
# Save the trainer's current model to a dedicated directory, separate from the checkpoints.
output_dir = "C:/Users/odara/Downloads/data/rtdetr_fine_tuning/final_model"
trainer.save_model(output_dir)