In [4]:
from transformers import ViTMAEConfig, ViTMAEModel, ViTMAEForPreTraining

# Initializing a ViT MAE vit-mae-base style configuration
configuration = ViTMAEConfig(
    image_size=100,
    num_channels=1,
    hidden_size=480,
    intermediate_size=1024,
    decoder_intermediate_size=1024,
    patch_size=10
)

# Initializing a model (with random weights) from the vit-mae-base style configuration
model = ViTMAEForPreTraining(configuration)

# Accessing the model configuration
configuration = model.config

print('number of parameters: ', sum(p.numel() for p in model.parameters()))


number of parameters:  40191300


In [5]:
from dataloader import BATCH_SIZE, square_xrd_dataloader

import torch
from torch import nn, optim
mse_loss = nn.MSELoss()

def train_model(num_epochs=100):
    outputs = []
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    for epoch in range(num_epochs):
        for idx, data in enumerate(square_xrd_dataloader):
            # ===================forward=====================
            output = model(data)
            loss = mse_loss(output.logits, data.squeeze())
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


            if idx % 5 == 0:
                print(f"Finished batch {idx} in epoch {epoch + 1}. Loss: {loss.item():.4f}")

        print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, loss.item()))
        outputs.append((epoch, data, output))



# Train the model

model.train(True)
train_model(num_epochs=1)
model.train(False)

Finished batch 0 in epoch 1. Loss: 15.2311
Finished batch 5 in epoch 1. Loss: 14.5613
Finished batch 10 in epoch 1. Loss: 12.5540
Finished batch 15 in epoch 1. Loss: 12.3391
Finished batch 20 in epoch 1. Loss: 14.0319
Finished batch 25 in epoch 1. Loss: 13.6332
Finished batch 30 in epoch 1. Loss: 13.4403
Finished batch 35 in epoch 1. Loss: 13.4084
Finished batch 40 in epoch 1. Loss: 14.4012
Finished batch 45 in epoch 1. Loss: 14.2402
Finished batch 50 in epoch 1. Loss: 16.8424


KeyboardInterrupt: 

In [13]:
from transformers import AutoFeatureExtractor, ViTMAEForPreTraining
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/vit-mae-base")
model_pretrained = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")

inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model_pretrained(**inputs)
loss = outputs.loss
mask = outputs.mask
ids_restore = outputs.ids_restore

In [11]:
inputs

{'pixel_values': tensor([[[[ 0.3138,  0.4337,  0.4679,  ..., -0.3541, -0.3369, -0.3369],
          [ 0.3652,  0.4337,  0.4679,  ..., -0.3541, -0.3541, -0.3883],
          [ 0.3138,  0.3994,  0.4166,  ..., -0.4397, -0.4226, -0.4054],
          ...,
          [ 1.8893,  1.7865,  1.6667,  ...,  1.5982,  1.4783,  1.4098],
          [ 1.8722,  1.8037,  1.7523,  ...,  1.3413,  1.0844,  0.9303],
          [ 1.8550,  1.7180,  1.7180,  ...,  0.2282, -0.0458, -0.3541]],

         [[-1.5980, -1.6155, -1.6155,  ..., -1.7906, -1.7906, -1.8081],
          [-1.5630, -1.5630, -1.5630,  ..., -1.7556, -1.7556, -1.7731],
          [-1.6155, -1.5980, -1.5630,  ..., -1.7906, -1.7906, -1.7906],
          ...,
          [-0.4076, -0.5126, -0.6176,  ..., -0.7577, -0.8277, -0.8803],
          [-0.4076, -0.4601, -0.5651,  ..., -0.8803, -1.0203, -1.0903],
          [-0.4251, -0.5651, -0.5826,  ..., -1.4405, -1.5455, -1.6681]],

         [[-0.7936, -0.6193, -0.6541,  ..., -1.2293, -1.1247, -1.1770],
          [-0

In [14]:
inputs['pixel_values'].type()

'torch.FloatTensor'