In [1]:
import timm
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision import transforms
from PIL import Image
from pycocotools.coco import COCO
import os

def collate_fn(batch):
    """Transpose a batch of per-sample tuples into a tuple of per-field tuples.

    A batch such as ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``. Nothing is stacked, so samples may have
    differing sizes. An empty batch gives an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)


class CloverDataset(Dataset):
    """Object-detection dataset backed by a COCO-format annotation file.

    Each item is an ``(image, target)`` pair. ``target`` is a dict with:

    * ``boxes``    -- float32 tensor of shape ``[N, 4]`` in ``x1, y1, x2, y2`` order
    * ``labels``   -- int64 tensor of shape ``[N]`` holding each ``category_id``
    * ``image_id`` -- one-element tensor holding the COCO image id

    Args:
        root_dir: Directory onto which each image's ``file_name`` is joined.
        annotation_file: Path passed to ``COCO(...)``.
        transform: Optional callable applied to the RGB PIL image. If it changes
            the image size, boxes are rescaled by the same per-axis factors so
            they stay aligned with the returned image. Only a plain resize is
            accounted for; crops, flips and other geometric transforms are not.
    """

    def __init__(self, root_dir, annotation_file, transform=None):
        self.root_dir = root_dir
        self.coco = COCO(annotation_file)
        self.ids = list(self.coco.imgs.keys())
        self.transform = transform

    def __len__(self):
        """Number of images listed in the annotation file."""
        return len(self.ids)

    @staticmethod
    def _scale_factors(original_size, transformed):
        """Return ``(scale_x, scale_y)`` mapping original pixel coords onto ``transformed``.

        ``original_size`` is ``(width, height)`` of the image before the transform.
        Tensors are read as ``[..., H, W]``; anything else is expected to expose a
        PIL-style ``size == (width, height)``. Unknown objects get ``(1.0, 1.0)``.
        """
        orig_w, orig_h = original_size
        if torch.is_tensor(transformed) and transformed.dim() >= 2:
            new_h, new_w = transformed.shape[-2:]
        else:
            size = getattr(transformed, 'size', None)
            if not (isinstance(size, (tuple, list)) and len(size) == 2):
                return 1.0, 1.0
            new_w, new_h = size
        if orig_w <= 0 or orig_h <= 0:
            return 1.0, 1.0
        return new_w / orig_w, new_h / orig_h

    def __getitem__(self, idx):
        """Load image ``idx`` and build its detection target."""
        img_id = self.ids[idx]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.root_dir, img_info['file_name'])
        image = Image.open(img_path).convert('RGB')
        # Remember the pre-transform size: annotations are in these pixel coordinates.
        original_size = image.size

        if self.transform:
            image = self.transform(image)

        # A resizing transform moves every pixel, so boxes must move with it.
        scale_x, scale_y = self._scale_factors(original_size, image)

        boxes = []
        labels = []
        for ann in anns:
            # COCO stores boxes as [x, y, width, height]; convert to corner format.
            x, y, w, h = ann['bbox']
            boxes.append([x * scale_x, y * scale_y, (x + w) * scale_x, (y + h) * scale_y])
            labels.append(ann['category_id'])

        # reshape(-1, 4) keeps the [N, 4] layout even when an image has no annotations
        # (a bare empty list would otherwise become a 1-D tensor of shape [0]).
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        target = {'boxes': boxes, 'labels': labels, 'image_id': torch.tensor([img_id])}

        return image, target

# Preprocessing applied to every image: resize to a fixed 256x256, then convert to a tensor.
transform = transforms.Compose(
    [transforms.Resize(size=(256, 256)), transforms.ToTensor()]
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths to the directories and annotation files.
# The dataset root can be overridden with the FLC_ROOT environment variable so the
# notebook runs on other machines; the original local path remains the default.
FLC_ROOT = os.environ.get('FLC_ROOT', '/Users/mattsloan/Downloads/FLC2019/')

train_images_dir = os.path.join(FLC_ROOT, 'trainval/JPEGImages')
train_annotation_file = os.path.join(FLC_ROOT, 'trainval/coco_annotations/instances_trainval_pos.json')
test_images_dir = os.path.join(FLC_ROOT, 'test/JPEGImages')
test_annotation_file = os.path.join(FLC_ROOT, 'test/coco_annotations/instances_test_pos.json')

# Build the train and test datasets; both use the same preprocessing pipeline
train_dataset = CloverDataset(
    root_dir=train_images_dir,
    annotation_file=train_annotation_file,
    transform=transform,
)
test_dataset = CloverDataset(
    root_dir=test_images_dir,
    annotation_file=test_annotation_file,
    transform=transform,
)

# Wrap them in loaders; the custom collate function keeps samples as tuples instead of stacking
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn,
)


loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [3]:
class ConvNeXTBackbone(torch.nn.Module):
    """Wrap a timm feature-extraction model so it returns a single feature map.

    The wrapped model is created with ``pretrained=True``, ``features_only=True``
    and ``out_indices=[3]``. ``forward`` returns the first element of whatever
    the wrapped model produces.

    Args:
        model_name: Name passed to ``timm.create_model``.
    """

    def __init__(self, model_name='convnext_base'):
        super().__init__()
        self.backbone = timm.create_model(
            model_name,
            pretrained=True,
            features_only=True,
            out_indices=[3],
        )

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its first output."""
        feature_maps = self.backbone(x)
        return feature_maps[0]

backbone = ConvNeXTBackbone()
# FasterRCNN needs `backbone.out_channels`. Read it from timm's feature metadata for the
# selected output stage instead of hard-coding it, so it stays right if the model changes
# (1024 for the default convnext_base).
backbone.out_channels = backbone.backbone.feature_info.channels()[-1]

# RPN anchor generator.
# `sizes` and `aspect_ratios` take one tuple per feature map. The backbone yields a single
# feature map, so each gets exactly one tuple: 5 sizes x 3 ratios = 15 anchors per location.
rpn_anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),),
)

# RoI pooling over the single feature map, which is keyed '0'
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'], output_size=7, sampling_ratio=2)

# Assemble the Faster R-CNN detector: 2 classes (background and four leaf clover)
model = FasterRCNN(
    backbone,
    num_classes=2,
    rpn_anchor_generator=rpn_anchor_generator,
    box_roi_pool=roi_pooler,
)

# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): ConvNeXTBackbone(
    (backbone): FeatureListNet(
      (stem_0): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      (stem_1): LayerNorm2d((128,), eps=1e-06, elementwise_affine=True)
      (stages_0): ConvNeXtStage(
        (downsample): Identity()
        (blocks): Sequential(
          (0): ConvNeXtBlock(
            (conv_dw): Conv2d(128, 128, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=128)
            (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
            (mlp): Mlp(
              (fc1): Linear(in_features=128, out_features=512, bias=True)
              (act): GELU()
              (drop1): Dropout(p=0.0, inplace=False)
              (norm): Identity()
              (fc2): Linear(in_features=512, out_features=128, bias=True)
              (drop

In [4]:
# Build the optimizer over trainable parameters only (anything with requires_grad=False is skipped)
params = list(filter(lambda param: param.requires_grad, model.parameters()))
optimizer = torch.optim.Adam(params=params, lr=0.001, weight_decay=0.0005)

# Training function
def train_one_epoch(model, optimizer, data_loader, device, epoch):
    """Run one optimisation pass over ``data_loader`` and report the mean loss.

    Args:
        model: Called as ``model(images, targets)`` in training mode; must return
            a dict of loss tensors.
        optimizer: Optimizer whose ``zero_grad`` / ``step`` are invoked per batch.
        data_loader: Iterable yielding ``(images, targets)``, where ``images`` is
            a sequence of tensors and ``targets`` a sequence of dicts of tensors.
        device: Device that every image and target tensor is moved to.
        epoch: Epoch index, used only in the printed log line.

    Returns:
        The mean of the per-batch total loss as a float, or ``None`` if the
        loader produced no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for images, targets in data_loader:
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        # Total loss is the sum of all individual loss terms returned by the model.
        losses = sum(loss_dict.values())
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        # Track a running total so the log reflects the whole epoch, not just the last batch.
        total_loss += losses.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing was trained; avoid referencing a loss that was never computed.
        print(f"Epoch: {epoch}, Loss: n/a (data loader produced no batches)")
        return None

    mean_loss = total_loss / num_batches
    print(f"Epoch: {epoch}, Loss: {mean_loss}")
    return mean_loss

In [5]:
# Train for a fixed number of epochs, writing a checkpoint of the weights after each one
num_epochs = 10
for epoch in range(num_epochs):
    train_one_epoch(
        model=model,
        optimizer=optimizer,
        data_loader=train_loader,
        device=device,
        epoch=epoch,
    )
    torch.save(model.state_dict(), f"faster_rcnn_convnext_{epoch}.pth")


In [None]:
# Evaluation function
def evaluate(model, data_loader, device):
    """Run inference over ``data_loader`` and collect the predictions.

    Args:
        model: Called as ``model(images)`` in eval mode; expected to return one
            dict per image.
        data_loader: Iterable yielding ``(images, targets)``. Targets are only
            consulted for an optional ``'image_id'`` entry.
        device: Device that the images are moved to before inference.

    Returns:
        A list with one dict per image: the model's output dict with every
        tensor moved to the CPU, plus an ``'image_id'`` key. The id is a Python
        scalar when the target holds a one-element tensor, the raw value when it
        holds something else, and ``None`` when the target has no id. Metric
        computation (e.g. mAP) is left to the caller.
    """
    model.eval()
    results = []
    with torch.no_grad():
        for images, targets in data_loader:
            images = [image.to(device) for image in images]
            outputs = model(images)
            for target, output in zip(targets, outputs):
                image_id = target.get('image_id') if isinstance(target, dict) else None
                if torch.is_tensor(image_id) and image_id.numel() == 1:
                    image_id = image_id.item()
                # Move predictions off the accelerator so they can be accumulated cheaply.
                record = {k: (v.cpu() if torch.is_tensor(v) else v) for k, v in output.items()}
                record['image_id'] = image_id
                results.append(record)
    return results


In [None]:
# Evaluate the trained model on the held-out test loader
evaluate(model=model, data_loader=test_loader, device=device)