In [1]:
model_names=[
    "microsoft/swinv2-tiny-patch4-window16-256",
    "microsoft/swinv2-base-patch4-window16-256",
    "facebook/dinov2-base",
    "nvidia/MambaVision-B-1K",
    "microsoft/beit-base-patch16-224",
    "google/vit-base-patch16-384",
]

In [2]:
DEVICE="cuda"

In [3]:
import sklearn.metrics

In [4]:
import numpy as np

In [5]:
from transformers import AutoModel, AutoImageProcessor

In [6]:
from PIL import Image

In [7]:
import torch
from torch import nn
import torch.nn.functional as F
print(torch.cuda.is_available())

True


In [9]:
from transformers import PreTrainedModel, PretrainedConfig, AutoConfig
class ImageMultiRegressionConfig(PretrainedConfig):
    def __init__(
        self,
        output_size=3,
        init_checkpoint=None,
        **kwargs,
    ):
        self.output_size=output_size
        self.init_checkpoint=init_checkpoint
        super().__init__(**kwargs)


class ImageMultiRegressionModel(PreTrainedModel):
    config_class=ImageMultiRegressionConfig
    def __init__(self, config, loss=nn.MSELoss()):
        super().__init__(config)
        
        self.inner_model = AutoModel.from_pretrained(config.init_checkpoint)
        self.classifier = nn.Linear(self.inner_model.config.hidden_size, config.output_size)
        self.loss=loss
    
    def forward(self, pixel_values, labels=None):
        outputs = self.inner_model(pixel_values=pixel_values)
        cls_output = outputs.last_hidden_state[:, 0, :]  # image embedding
        values = self.classifier(cls_output)
        loss = None
        if labels is not None:
            loss = self.loss(values.view(-1), labels.view(-1))
        return (loss, values) if loss is not None else values

In [10]:
from datasets import load_from_disk
dataset=load_from_disk("./data/dataset/")
dataset["train"].set_format("torch")
dataset["test"].set_format("torch")

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import torchvision.transforms.v2 as transforms
import torch

_train_transform = transforms.Compose([
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),  # Normalize expects float input
    transforms.RandomRotation(degrees=20),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(size=(256,256), scale=(.6,1.0), antialias=True),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def train_transform(ex):
    if "image" in ex:
        ex["pixel_values"]=[_train_transform(image) for image in ex["image"]]
    return ex
    
_test_transform = transforms.Compose([
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),  # Normalize expects float input
    transforms.Resize(size=(256,256)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def test_transform(ex):
    if "image" in ex:
        ex["pixel_values"]=[_test_transform(image) for image in ex["image"]]
    return ex

#_columns=["pixel_values", "light_level", "fume_strength", "explosion_strength"]
dataset["train"].set_transform(train_transform)
dataset["test"].set_transform(test_transform)

In [12]:
t1=torch.tensor(dataset["train"]["light_level"])
t2=torch.tensor(dataset["train"]["fume_strength"])
t3=torch.tensor(dataset["train"]["explosion_strength"])

In [18]:
y1=torch.tensor(dataset["test"]["light_level"])
y2=torch.tensor(dataset["test"]["fume_strength"])
y3=torch.tensor(dataset["test"]["explosion_strength"])

In [22]:
def compute_metrics(x,y):
    return {"MSE":sklearn.metrics.mean_squared_error(x, y),
           "MAE":sklearn.metrics.mean_absolute_error(x,y),
           "R2_test":sklearn.metrics.r2_score(x,y)}

In [42]:
# https://stats.stackexchange.com/questions/228540/how-to-calculate-out-of-sample-r-squared/492581#492581
# https://arxiv.org/pdf/2302.05131
def oosR(MST, MSE): 
    return 1-MSE/MST

In [13]:
T=torch.stack([t1,t2,t3], dim=1)

In [19]:
Y=torch.stack([y1,y2,y3], dim=1)

In [11]:
# baseline: always predict mean

In [14]:
T_mean=T.mean(dim=0)

In [20]:
T_mean_only = T_mean.repeat(Y.shape[0],1)

In [21]:
T_mean_only.shape, Y.shape

(torch.Size([1891, 3]), torch.Size([1891, 3]))

In [16]:
T_mean_only[1]

tensor([0.9055, 0.7536, 0.3736])

In [24]:
mean_metrics = compute_metrics(T_mean_only, Y); mean_metrics

{'MSE': 0.1443669, 'MAE': 0.31176418, 'R2_test': 0.0}

In [25]:
# baseline: always predict median

In [29]:
T_median=T.median(dim=0).values

In [31]:
T_median_only = T_median.repeat(Y.shape[0],1)

In [32]:
T_median_only.shape, Y.shape

(torch.Size([1891, 3]), torch.Size([1891, 3]))

In [33]:
T_median_only[1]

tensor([1., 1., 0.])

In [35]:
median_metrics = compute_metrics(T_median_only, Y); median_metrics

{'MSE': 0.20900238, 'MAE': 0.23368602, 'R2_test': 0.0}

In [43]:
oosR(mean_metrics['MSE'], mean_metrics['MSE'])

0.0

In [44]:
oosR(mean_metrics['MSE'], median_metrics['MSE'])

-0.44771671295166016

In [47]:
oosR(mean_metrics['MSE'], 0.052876099944114685) #swinv2-base

0.6337380793057688

In [183]:
oosR(mean_metrics['MSE'], 0.07436350733041763) #dinov2

0.4848992067647788

In [184]:
oosR(mean_metrics['MSE'], 0.051811158657073975) #beit

0.6411147096100152

In [180]:
def reg_to_class(x): #[b, 3]
    t=np.full(x.shape[0], "INA")
    t_f=np.full(x.shape[0], "FUM")
    t_e=np.full(x.shape[0], "EXP")
    t_ef=np.full(x.shape[0], "EXP+FUM")
    t=np.where((x[:,1]>.75) & (x[:,2]>.1),t_ef,t)
    t=np.where((x[:,1]>.75) & (x[:,2]<=.1),t_f,t)
    t=np.where((x[:,1]<=.75) & (x[:,2]>.1),t_e,t)
    
    return t

In [181]:
class_Y=reg_to_class(T)

In [182]:
np.unique(class_Y, return_counts=True)

(array(['EXP', 'EXP+FUM', 'FUM', 'INA'], dtype='<U7'),
 array([ 460, 6358, 5821, 4378]))