In [1]:
model_names=[
    "microsoft/swinv2-tiny-patch4-window16-256",
    "facebook/dinov2-base",
    "nvidia/MambaVision-B-1K",
    "microsoft/beit-base-patch16-224",
    "microsoft/swinv2-base-patch4-window16-256",
    "google/vit-base-patch16-384",
]

In [2]:
DEVICE="cuda"

In [69]:
import sklearn.metrics

In [49]:
import numpy as np

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
print(torch.cuda.is_available())

class ImageMultiRegressionModel(nn.Module):
    def __init__(self, model, loss=nn.MSELoss(), output_size=1):
        super().__init__()
        self.model = model
        self.classifier = nn.Linear(self.model.config.hidden_size, output_size)
        self.loss=loss
    
    def forward(self, pixel_values, labels=None):
        outputs = self.model(pixel_values=pixel_values)
        cls_output = outputs.last_hidden_state[:, 0, :]  # image embedding
        values = self.classifier(cls_output)
        loss = None
        if labels is not None:
            loss = self.loss(values.view(-1), labels.view(-1))
        return (loss, values) if loss is not None else values
    


True


In [4]:
from transformers import AutoModel, AutoImageProcessor

In [5]:
from PIL import Image

In [6]:
raw_model=AutoModel.from_pretrained(model_names[0])
model=ImageMultiRegressionModel(raw_model, output_size=3).to(DEVICE)

In [7]:
image_processor=AutoImageProcessor.from_pretrained(model_names[0])

In [10]:
inputs=image_processor(Image.open("./images/2024/image_2.jpg"), return_tensors="pt")["pixel_values"].to(DEVICE)

In [12]:
result=model(inputs, labels=torch.tensor([[0,0,0]]).to(DEVICE))

In [13]:
result

(tensor(0.0049, device='cuda:0', grad_fn=<MseLossBackward0>),
 tensor([[ 0.0620,  0.0718, -0.0760]], device='cuda:0',
        grad_fn=<AddmmBackward0>))

In [20]:
from datasets import load_from_disk
dataset=load_from_disk("./data/dataset/")
dataset["train"].set_format("torch")
dataset["test"].set_format("torch")

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import torchvision.transforms.v2 as transforms
import torch

_train_transform = transforms.Compose([
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),  # Normalize expects float input
    transforms.RandomRotation(degrees=20),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(size=(256,256), scale=(.6,1.0), antialias=True),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def train_transform(ex):
    ex["pixel_values"]=[_train_transform(image) for image in ex["image"]]
    return ex
    
_test_transform = transforms.Compose([
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),  # Normalize expects float input
    transforms.Resize(size=(256,256)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def test_transform(ex):
    ex["pixel_values"]=[_test_transform(image) for image in ex["image"]]
    return ex

#_columns=["pixel_values", "light_level", "fume_strength", "explosion_strength"]
dataset["train"].set_transform(train_transform)
dataset["test"].set_transform(test_transform)

In [21]:
from transformers import TrainingArguments, Trainer

In [22]:
import os
os.environ["NCCL_P2P_DISABLE"]="1"
os.environ["NCCL_IB_DISABLE"]="1"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [23]:
trainer_args = TrainingArguments(
    f"~/tmp/model-training/modelname",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    #learning_rate=5e-5,
    fp16=True,
    per_device_train_batch_size=12,
    #gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=25,
    warmup_ratio=0.1,
    logging_steps=25,
    load_best_model_at_end=True,
    remove_unused_columns=False,
    #metric_for_best_model="accuracy",
    push_to_hub=False,
    report_to=[]
)


In [30]:
_columns=["light_level","fume_strength","explosion_strength"]
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])#.to(DEVICE)
    labels =  torch.stack([torch.tensor([example[c] for c in _columns]) for example in examples])#.to(DEVICE)
    return {"pixel_values": pixel_values, "labels": labels, }

In [81]:
def compute_metrics(pred):
    x,y=pred.predictions, pred.label_ids
    return {"MSE":sklearn.metrics.mean_squared_error(x, y),
           "MAE":sklearn.metrics.mean_absolute_error(x,y),
           "R2":sklearn.metrics.r2_score(x,y)}

In [82]:
trainer = Trainer(
    model,
    trainer_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn
)

In [83]:
trainer.evaluate()

{'eval_loss': 0.8092214465141296,
 'eval_model_preparation_time': 0.0124,
 'eval_MSE': 0.8092216849327087,
 'eval_MAE': 0.7503647804260254,
 'eval_R2': -3.5388848781585693,
 'eval_runtime': 17.8937,
 'eval_samples_per_second': 105.68,
 'eval_steps_per_second': 3.353}