In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install piexif
!pip install -U keras-cv
!pip install datasets transformers
!pip install evaluate

Collecting piexif
  Downloading piexif-1.1.3-py2.py3-none-any.whl.metadata (3.7 kB)
Downloading piexif-1.1.3-py2.py3-none-any.whl (20 kB)
Installing collected packages: piexif
Successfully installed piexif-1.1.3
Collecting keras-cv
  Downloading keras_cv-0.9.0-py3-none-any.whl.metadata (12 kB)
Collecting keras-core (from keras-cv)
  Downloading keras_core-0.1.7-py3-none-any.whl.metadata (4.3 kB)
Downloading keras_cv-0.9.0-py3-none-any.whl (650 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m650.7/650.7 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras-core, keras-cv
Successfully installed keras-core-0.1.7 keras-cv-0.9.0
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  D

In [3]:
folder = "/content/drive/MyDrive/eagleeye/eagle_eyes_dataset"

In [4]:
import os
import cv2
import numpy as np
import piexif
import json
from PIL import Image
from typing import Any, Mapping, Tuple, Optional, TypedDict, Dict, List
from datetime import datetime, tzinfo, timezone
from piexif import GPSIFD
from dataclasses import dataclass, field
from piexif.helper import UserComment
import traceback
import os
import random
import torch
import evaluate
from torch.utils.data import Dataset
from transformers import AutoImageProcessor, TrainingArguments, Trainer, AutoModelForImageClassification
from transformers import DefaultDataCollator
import torchvision.transforms as T
from torch.utils.data import DataLoader, random_split
from torchvision.ops import box_iou
import matplotlib.pyplot as plt
import albumentations
from sklearn.metrics import precision_recall_fscore_support, average_precision_score

In [5]:
@dataclass
class GPSInfo:
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    altitude: Optional[float] = None

@dataclass
class TiffImageMetadata:
    """ Some standardized metadata that can be stored in a TIFF file
    Note that because of constraints on the structure of EXIF data,
    the precise values of date_time and gps_info may change slightly
    when saved and loaded back from a TIFF file.

    The data in jsonable_metadata will be identical though.
    """
    date_time: Optional[datetime] = None  # Date time (with timezone) of the image
    gps_info: Optional[GPSInfo] = None
    jsonable_metadata: Optional[Mapping[str, Any]] = field(default=None)


def numdem_to_float(num_dem: Tuple[int, int]) -> float:
    return num_dem[0] / num_dem[1]


def decimal_degree_to_dms_num_dem(value: float, loc: str) -> Tuple[Tuple[Tuple[int, int], Tuple[int, int], Tuple[int, int]], str]:
    """Convert decimal degrees to degrees, minutes, seconds tuple in EXIF format."""
    if value < 0:
        loc_value = loc[1]
    else:
        loc_value = loc[0]
    abs_value = abs(value)
    deg = int(abs_value)
    min = int((abs_value - deg) * 60)
    sec = (abs_value - deg - min / 60) * 3600 * 100

    # Format for EXIF
    deg = (deg, 1)
    min = (min, 1)
    sec = (int(sec), 100)
    return (deg, min, sec), loc_value


def dms_num_dem_to_decimal_degree(dms: Tuple[Tuple[int, int], Tuple[int, int], Tuple[int, int]], loc: str) -> float:
    """Convert degrees, minutes, seconds tuple in EXIF format to decimal degrees."""
    # deg, min, sec = dms
    # value = deg[0] + min[0]/60 + sec[0]/3600
    # (deg_num, deg_denom), (min_num, min_denom), (sec_num, sec_denom) = dms
    deg_numdem, min_numdem, sec_numdem = dms
    value = numdem_to_float(deg_numdem) + numdem_to_float(min_numdem) / 60 + numdem_to_float(sec_numdem) / 3600
    if loc in ['S', 'W']:
        value = -value
    else:
        assert loc in ['N', 'E'], f"Invalid loc: {loc}"
    return value


def metadata_to_exif_dict(metadata: TiffImageMetadata) -> Mapping[str, Any]:
    """ Turn a metadata object into exif bytes in the standard format """
    exif_dict = {"GPS": {}, "Exif": {}}

    # Serialize jsonable metadata to a JSON string and include it in EXIF
    if metadata.jsonable_metadata:
        json_metadata = json.dumps(metadata.jsonable_metadata)
        exif_dict['Exif'][piexif.ExifIFD.UserComment] = UserComment.dump(json_metadata)

    # DateTime
    date_time = metadata.date_time
    if metadata.date_time:
        # Store utc time in 0th and local time in Exif
        dt_in_utc = date_time.astimezone(timezone.utc)
        exif_dict['0th'] = {piexif.ImageIFD.DateTime: dt_in_utc.strftime("%Y:%m:%d %H:%M:%S")}
        exif_dict['Exif'][piexif.ExifIFD.DateTimeOriginal] = date_time.strftime("%Y:%m:%d %H:%M:%S")

        #
        # metadata_dict['0th'] = {piexif.ImageIFD.DateTime: metadata.date_time.strftime("%Y:%m:%d %H:%M:%S") if metadata.date_time else ''}
        # metadata_dict['Exif'][piexif.ExifIFD.DateTimeOriginal] = metadata.date_time.strftime("%Y:%m:%d %H:%M:%S") if metadata.date_time else ''

    # GPS Information
    if metadata.gps_info:
        if metadata.gps_info.latitude is not None and metadata.gps_info.longitude is not None:
            exif_dict['GPS'][piexif.GPSIFD.GPSLatitude], exif_dict['GPS'][piexif.GPSIFD.GPSLatitudeRef] \
                = decimal_degree_to_dms_num_dem(metadata.gps_info.latitude, "NS")
            exif_dict['GPS'][piexif.GPSIFD.GPSLongitude], exif_dict['GPS'][piexif.GPSIFD.GPSLongitudeRef] \
                = decimal_degree_to_dms_num_dem(metadata.gps_info.longitude, "EW")
            # metadata_dict['GPS'][piexif.GPSIFD.GPSLongitudeRef] = 'E' if metadata.gps_info.longitude >= 0 else 'W'
            # metadata_dict['GPS'][piexif.GPSIFD.GPSLongitude] = decimal_degree_to_dms_num_dem(metadata.gps_info.longitude, "EW")

        if metadata.gps_info.altitude is not None:
            exif_dict['GPS'][piexif.GPSIFD.GPSAltitudeRef] = 0 if metadata.gps_info.altitude >= 0 else 1
            exif_dict['GPS'][piexif.GPSIFD.GPSAltitude] = (abs(int(metadata.gps_info.altitude * 100)), 100)

    return exif_dict

In [6]:
def load_tiff_metadata(path: str) -> TiffImageMetadata:
    """Load JSON serialized metadata from a TIFF file."""
    # Extract EXIF data
    exif_data = piexif.load(path)

    # Deserialize JSON metadata from custom EXIF tag if exists
    if 'Exif' in exif_data and piexif.ExifIFD.UserComment in exif_data['Exif']:
        metadata_int_tuple = exif_data['Exif'][piexif.ExifIFD.UserComment]
        json_metadata_str = piexif.helper.UserComment.load(bytes(metadata_int_tuple))
        json_metadata = json.loads(json_metadata_str)
    else:
        json_metadata = None

    # Convert EXIF data back to ExifDataDict format
    dt_entry = exif_data.get('0th', {}).get(piexif.ImageIFD.DateTime, None)
    if dt_entry:
        datetime_local = datetime.strptime(dt_entry.decode('utf-8'), "%Y:%m:%d %H:%M:%S")
        datetime_utc = datetime.strptime(exif_data.get('0th', {}).get(piexif.ImageIFD.DateTime, '').decode('utf-8'), "%Y:%m:%d %H:%M:%S")
        tz_offset = datetime_local - datetime_utc
        datetime_localized = datetime_local.replace(tzinfo=timezone(tz_offset))
    else:
        datetime_localized = None

    gps_entry = exif_data.get('GPS', {})
    if gps_entry:
        try:
            gps_info = GPSInfo(
                latitude=dms_num_dem_to_decimal_degree(exif_data.get('GPS', {}).get(GPSIFD.GPSLatitude, ((0, 1), (0, 1), (0, 1))),
                                                       exif_data.get('GPS', {}).get(GPSIFD.GPSLatitudeRef, '').decode('utf-8')),
                longitude=dms_num_dem_to_decimal_degree(exif_data.get('GPS', {}).get(GPSIFD.GPSLongitude, ((0, 1), (0, 1), (0, 1))),
                                                        exif_data.get('GPS', {}).get(GPSIFD.GPSLongitudeRef, '').decode('utf-8')),
                altitude=numdem_to_float(exif_data.get('GPS', {}).get(GPSIFD.GPSAltitude, (0, 1)))

            )
        except Exception as err:
            print(f"Error when attempting to read GPS data from {path}.  GPS info will be missing")
            print(traceback.format_exc())
            gps_info = None

    else:
        gps_info = None

    metadata = TiffImageMetadata(
        date_time=datetime_localized,
        gps_info=gps_info,
        jsonable_metadata=json_metadata,

    )

    return metadata

In [7]:
img_paths = []
ann = []

for filename in os.listdir(folder):
    if filename.lower().endswith(('.tiff')):
        img_path = os.path.join(folder, filename)
        img_paths.append(img_path)
        boxes = load_tiff_metadata(img_path).jsonable_metadata['annotations']
        label = []

        for box in boxes:
            label.append(box['ijhw_box'])
        ann.append(label)

In [8]:
checkpoint = "WinKawaks/vit-small-patch16-224"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [9]:

class TiledAnomalyDataset(Dataset):
    def __init__(self, img_paths, ann_list, tile_size=224, transform=None):
        self.tile_size = tile_size
        self.transform = transform
        self.tiles = []
        self.labels = []
        self.image_ids = []

        global_tile_id = 0

        for img_id, (img_path, full_boxes) in enumerate(zip(img_paths, ann_list)):
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            h, w = img.shape[:2]

            tile_id = 0
            for y in range(0, h, tile_size):
                for x in range(0, w, tile_size):
                    tile = img[y:y+tile_size, x:x+tile_size]
                    th, tw = tile.shape[:2]

                    # Pad
                    if th < tile_size or tw < tile_size:
                        pad_tile = np.ones((tile_size, tile_size, 3), dtype=np.uint8) * 255
                        pad_tile[:th, :tw] = tile
                        tile = pad_tile

                    # Box matching (without normalizatio/resize/rescale)
                    matching_boxes = []
                    for box in full_boxes:
                        bx, by, bw, bh = box[1] - box[3]/2, box[0] - box[2]/2, box[3], box[2]
                        if (bx + bw > x and bx < x + tile_size and
                            by + bh > y and by < y + tile_size):
                            rel_x = (bx - x) / tile_size
                            rel_y = (by - y) / tile_size
                            rel_w = bw / tile_size
                            rel_h = bh / tile_size
                            matching_boxes.append([rel_x, rel_y, rel_w, rel_h])

                    label = matching_boxes[0] if matching_boxes else [-1, -1, -1, -1]

                    self.tiles.append(tile)
                    self.labels.append(label)
                    self.image_ids.append(img_id * 1000 + tile_id)

                    tile_id += 1
                    global_tile_id += 1

        combined = list(zip(self.tiles, self.labels, self.image_ids))
        random.shuffle(combined)
        self.tiles, self.labels, self.image_ids = zip(*combined)

    def __len__(self):
        return len(self.tiles)

    def __getitem__(self, idx):
        tile = self.tiles[idx]
        label = np.array(self.labels[idx], dtype=np.float32)
        image_id = int(self.image_ids[idx])

        # Convert to tensor
        if self.transform:
            tile = self.transform(tile)
        else:
            tile = torch.from_numpy(tile.transpose(2, 0, 1)).float() / 255.0

        # if no box then class = 0 (no object) else 1 (has object)
        class_labels = 0 if label[0] == -1 else  1

        normalize = T.Normalize(
          mean=[0.485, 0.456, 0.406],
          std=[0.229, 0.224, 0.225]
        )

        tile = normalize(tile)

        return {
            "pixel_values":tile,
            "label": class_labels
        }

In [10]:
dataset = TiledAnomalyDataset(img_paths, ann)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=2)

In [None]:
print(len(dataset))

14220


In [11]:
total_size = len(dataset)
train_size = int(0.7 * total_size)
test_size = total_size - train_size

# Split dataset
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [12]:
from torch.utils.data import Subset

test_size = len(test_dataset)
split_index = int(0.5 * test_size)  # 50% for eval, 50% for final test

eval_dataset = Subset(test_dataset, range(0, split_index))
final_test_dataset = Subset(test_dataset, range(split_index, test_size))

In [None]:
print(len(train_dataset))
print(len(test_dataset))

9954
4266


In [13]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [14]:
from sklearn.metrics import accuracy_score, recall_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "recall": recall_score(labels, preds, average="macro"),  # or "binary" if binary classification
    }

In [15]:
from transformers import AutoModelForImageClassification
import torch.nn as nn
import torch

class WeightedLossModel(AutoModelForImageClassification):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Adjust weights: [neg_weight, pos_weight]
        weight = torch.tensor([0.2, 0.8]).to(logits.device)
        loss_fn = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss


In [16]:
label2id = {"anomaly": 1, "blank": 0}
id2label = {1: "anomaly", 0: "blank"}
model = WeightedLossModel.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

model.safetensors:   0%|          | 0.00/88.2M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 384]) in the checkpoint and torch.Size([2, 384]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
data_collator = DefaultDataCollator()

In [18]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/eagleeye/finetuned",
    remove_unused_columns=False,
    eval_strategy="no",
    save_strategy="steps",
    fp16=True,
    save_steps=300,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    warmup_ratio=0,
    logging_strategy="steps",
    logging_steps=30,
    report_to="none",
    metric_for_best_model="recall",
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss
30,0.3469
60,0.0957
90,0.0913
120,0.1112
150,0.0871
180,0.0891
210,0.087
240,0.1237
270,0.0826
300,0.069


TrainOutput(global_step=624, training_loss=0.08003585537274678, metrics={'train_runtime': 159.0852, 'train_samples_per_second': 125.14, 'train_steps_per_second': 3.922, 'total_flos': 3.895683017108521e+17, 'train_loss': 0.08003585537274678, 'epoch': 2.0})

In [19]:
results = trainer.evaluate(eval_dataset=eval_dataset)
print(results)

{'eval_loss': 0.058742932975292206, 'eval_accuracy': 0.9873417721518988, 'eval_recall': 0.7376789677531659, 'eval_runtime': 7.9563, 'eval_samples_per_second': 268.088, 'eval_steps_per_second': 16.842, 'epoch': 2.0}


In [20]:
trainer.save_model("/content/drive/MyDrive/eagleeye/finetuned")

In [None]:
model_path = "/content/drive/MyDrive/eagleeye/finetuned/checkpoint-624"

In [None]:
model = AutoModelForImageClassification.from_pretrained(model_path)
model.eval()

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [None]:
training_args = TrainingArguments(
    output_dir="./temp-eval",  # can be any temp folder
    per_device_eval_batch_size=16,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # if you used one
    # processor=image_processor  # optional, used during training
)

In [None]:
results = trainer.evaluate()
print("Accuracy:", results["eval_accuracy"])
print("Recall:", results["eval_recall"])