In [1]:



model_names=[
    "microsoft/swinv2-tiny-patch4-window16-256",
    "facebook/dinov2-base",
    "nvidia/MambaVision-B-1K",
    "microsoft/beit-base-patch16-224",
    "microsoft/swinv2-base-patch4-window16-256",
    "google/vit-base-patch16-384",
]

In [2]:
import torch
from torch import nn
import torch.nn.functional as F

class ImageMultiRegressionModel(nn.Module):
    def __init__(self, model, loss=nn.MSELoss(), output_size=1):
        super().__init__()
        self.model = model
        self.classifier = nn.Linear(self.model.config.hidden_size, output_size)
        self.loss=loss
    
    def forward(self, pixel_values, labels=None):
        outputs = self.model(pixel_values=pixel_values)
        cls_output = outputs.last_hidden_state[:, 0, :]  # image embedding
        values = self.classifier(cls_output)
        loss = None
        if labels is not None:
            loss = self.loss(values.view(-1), labels.view(-1))
        return (loss, values) if loss is not None else values
    


In [3]:
from transformers import AutoModel, AutoImageProcessor

In [4]:
from PIL import Image

In [5]:
raw_model=AutoModel.from_pretrained(model_names[0])
wrapped_model=ImageMultiRegressionModel(raw_model, output_size=3)

In [6]:
image_processor=AutoImageProcessor.from_pretrained(model_names[0])

In [7]:
inputs=image_processor(Image.open("./images/2024/image_2.jpg"), return_tensors="pt")["pixel_values"]

In [8]:
result=wrapped_model(inputs, labels=torch.tensor([[0,0,0]]))

In [9]:
result

(tensor(0.0099, grad_fn=<MseLossBackward0>),
 tensor([[ 0.1490, -0.0094,  0.0852]], grad_fn=<AddmmBackward0>))

In [10]:
from transformers import TrainingArguments, Trainer

In [11]:
import os
os.environ["NCCL_P2P_DISABLE"]="1"
os.environ["NCCL_IB_DISABLE"]="1"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [12]:
trainer_args = TrainingArguments(
    f"~/tmp/model-training/modelname",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    #learning_rate=5e-5,
    fp16=True,
    per_device_train_batch_size=12,
    #gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=25,
    warmup_ratio=0.1,
    logging_steps=25,
    load_best_model_at_end=True,
    #metric_for_best_model="accuracy",
    push_to_hub=False,
    report_to=[]
)


In [13]:
from datasets import load_from_disk
dataset=load_from_disk("./data/dataset/")

In [14]:
dataset["train"] = dataset["train"].shuffle(seed=42).select(range(1000))
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(1000))

Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import torchvision.transforms as transforms
transform = transforms.Compose([transforms.PILToTensor()])

def totensor(ex):
    ex["pixel_values"]=transform(ex["pixel_values"])
    return ex
dataset["train"]=dataset["train"].map(totensor, num_proc=1)
dataset["test"]=dataset["test"].map(totensor)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ff420505510>>
Traceback (most recent call last):
  File "/home/miriam/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ff420505510>>
Traceback (most recent call last):
  File "/home/miriam/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [None]:
dataset["train"][0]["pixel_values"]

In [None]:
trainer = Trainer(
    wrapped_model,
    trainer_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    #compute_metrics=compute_metrics,
    #data_collator=collate_fn
)

In [None]:
trainer.evaluate()