# configs/vit/model_config/vit_base_p16.yaml

# ViT image-classification model settings.
# Booleans use canonical lowercase `true`/`false` and floats carry an explicit
# fractional digit so YAML 1.1 and 1.2 loaders resolve them to the same types.
model:
  arch:
    type: ViTForImageClassification

  model_config:
    type: ViTConfig

    # Input and patch embedding
    image_size: 224  # input image size
    patch_size: 16  # patch size
    num_channels: 3  # number of channels in the input images
    initializer_range: 0.02  # initial std for cls_tokens, pos_embed and linear weights

    # Transformer encoder
    hidden_size: 768  # embedding dimension
    num_hidden_layers: 12  # number of transformer blocks
    num_attention_heads: 12  # number of attention heads
    intermediate_size: 3072  # ffn_hidden_size of the encoder
    qkv_bias: true  # whether the qkv dense layer has a bias
    hidden_act: gelu  # activation used in the MLP
    post_layernorm_residual: false  # whether to use post-layernorm residual
    layer_norm_eps: 0.000001  # eps of layer_norm

    # Regularization
    attention_probs_dropout_prob: 0.0  # drop rate of Attention
    hidden_dropout_prob: 0.0  # drop rate of MLP
    drop_path_rate: 0.1  # drop path rate of Block

    # Classification head and loss
    use_mean_pooling: true  # whether to use global mean pooling
    num_labels: 1000  # number of classes
    loss_type: SoftTargetCrossEntropy  # loss type

    # Checkpoint name or path
    checkpoint_name_or_path: 'vit_base_p16'

# Processor settings; `image_processor` holds the image processor options.
processor:
  type: ViTProcessor
  image_processor:
    type: ViTImageProcessor
    # Input image size (same value as model.model_config.image_size).
    size: 224