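# wd-v1-4-swinv2-tagger

Convert the Keras weights of `SmilingWolf/wd-v1-4-swinv2-tagger-v2` into a Hugging Face `Swinv2ForImageClassification` checkpoint saved under `./tagger-hf`.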


In [1]:
# clone the swinv2 model
!git clone https://huggingface.co/SmilingWolf/wd-v1-4-swinv2-tagger-v2 ./tagger

fatal: destination path './tagger' already exists and is not an empty directory.


## imports


In [None]:
import os

os.environ["KERAS_BACKEND"] = "torch"

import pandas as pd

import numpy as np
import torch
import torch.nn as nn
from keras.models import load_model, Model
from transformers import Swinv2Config, Swinv2ForImageClassification

In [2]:
# Load the base Swin V2 configuration from the local ./swinv2-config directory.
config = Swinv2Config.from_pretrained("./swinv2-config")

## Load and set labels


In [3]:
# Read the tag table from the cloned tagger repo; the row order of this table
# is what later defines the label ids.
df = pd.read_csv("./tagger/selected_tags.csv")
df.head()

Unnamed: 0,tag_id,name,category,count
0,9999999,general,9,807858
1,9999998,sensitive,9,3771700
2,9999997,questionable,9,769899
3,9999996,explicit,9,560281
4,470575,1girl,0,4225150


In [10]:
def convert_tag_name(tag: str, category: int) -> str:
    """Turn a raw tag into the label string stored in the model config.

    Args:
        tag: Tag name as it appears in the `name` column of the tag table.
        category: Value of the tag's `category` column.

    Returns:
        `tag` unchanged for category 0, `"character:<tag>"` for category 4,
        and `"rating:<tag>"` for category 9.

    Raises:
        ValueError: If `category` is none of 0, 4 or 9. Previously such a tag
            silently became `None`, which would corrupt the label mappings.
    """
    if category == 0:
        return tag
    if category == 4:
        return f"character:{tag}"
    if category == 9:
        return f"rating:{tag}"
    raise ValueError(f"unsupported tag category {category!r} for tag {tag!r}")

In [11]:
# Map each row position of the tag table to its label string.
# Name and category are paired row by row, in table order.
id2label = {
    index: convert_tag_name(name, category)
    for index, (name, category) in enumerate(zip(df["name"], df["category"]))
}
id2label

{0: 'rating:general',
 1: 'rating:sensitive',
 2: 'rating:questionable',
 3: 'rating:explicit',
 4: '1girl',
 5: 'solo',
 6: 'long_hair',
 7: 'breasts',
 8: 'looking_at_viewer',
 9: 'blush',
 10: 'smile',
 11: 'short_hair',
 12: 'open_mouth',
 13: 'bangs',
 14: 'blue_eyes',
 15: 'multiple_girls',
 16: 'skirt',
 17: 'blonde_hair',
 18: 'large_breasts',
 19: 'simple_background',
 20: 'brown_hair',
 21: 'shirt',
 22: 'black_hair',
 23: 'hair_ornament',
 24: 'red_eyes',
 25: 'thighhighs',
 26: 'gloves',
 27: 'long_sleeves',
 28: '1boy',
 29: 'hat',
 30: 'white_background',
 31: 'dress',
 32: 'bow',
 33: 'ribbon',
 34: 'navel',
 35: 'holding',
 36: '2girls',
 37: 'animal_ears',
 38: 'cleavage',
 39: 'hair_between_eyes',
 40: 'bare_shoulders',
 41: 'twintails',
 42: 'brown_eyes',
 43: 'jewelry',
 44: 'medium_breasts',
 45: 'sitting',
 46: 'very_long_hair',
 47: 'closed_mouth',
 48: 'underwear',
 49: 'nipples',
 50: 'school_uniform',
 51: 'green_eyes',
 52: 'blue_hair',
 53: 'standing',
 54: 

In [12]:
# Store both directions of the label mapping on the config.
config.id2label = id2label
config.label2id = {label: index for index, label in id2label.items()}

In [13]:
# Write the label-augmented config to the tagger-hf directory (the same
# directory the converted model is saved to at the end of the notebook).
config.save_pretrained("tagger-hf")

## Create the Hugging Face SwinV2 model


In [7]:
# Instantiate the HF model from the config; its parameters are overwritten
# with the Keras weights in the cells that follow.
# NOTE(review): `_from_config` is underscore-prefixed (private API) — consider the
# public constructor `Swinv2ForImageClassification(config)`; confirm it is
# equivalent for the installed transformers version.
swinv2 = Swinv2ForImageClassification._from_config(config)

## Load keras model


In [None]:
# Load the original Keras tagger from the ./tagger directory cloned in the first cell.
model: Model = load_model("tagger")

In [9]:
# List every Keras layer with its input shape; the layer names printed here
# are the ones looked up with `model.get_layer(...)` during the conversion.
for layer in model.layers:
    print(layer.name, layer.input_shape)

input_1 [(None, 448, 448, 3)]
tf.cast (None, 448, 448, 3)
tf.math.subtract (None, 448, 448, 3)
tf.math.multiply (None, 448, 448, 3)
conv2d (None, 448, 448, 3)
reshape (None, 112, 112, 128)
layer_normalization (None, 12544, 128)
dropout (None, 12544, 128)
swin_transformer_block (None, 12544, 128)
swin_transformer_block_1 (None, 12544, 128)
patch_merging (None, 12544, 128)
swin_transformer_block_2 (None, 3136, 256)
swin_transformer_block_3 (None, 3136, 256)
patch_merging_1 (None, 3136, 256)
swin_transformer_block_4 (None, 784, 512)
swin_transformer_block_5 (None, 784, 512)
swin_transformer_block_6 (None, 784, 512)
swin_transformer_block_7 (None, 784, 512)
swin_transformer_block_8 (None, 784, 512)
swin_transformer_block_9 (None, 784, 512)
swin_transformer_block_10 (None, 784, 512)
swin_transformer_block_11 (None, 784, 512)
swin_transformer_block_12 (None, 784, 512)
swin_transformer_block_13 (None, 784, 512)
swin_transformer_block_14 (None, 784, 512)
swin_transformer_block_15 (None, 784, 5

References:

- https://github.com/SmilingWolf/SW-CV-ModelZoo/blob/main/Models/SwinV2.py
- https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/swinv2/modeling_swinv2.py


### utils


In [10]:
def convert_conv2d(array: np.ndarray):
    """Convert a 4-D convolution kernel into a PyTorch parameter.

    The axes are reordered with the permutation (3, 2, 0, 1), i.e. output
    element `[a, b, c, d]` equals `array[c, d, b, a]`.

    The data is always copied, so the returned parameter never shares memory
    with `array` (the previous implementation could alias the input whenever
    the permuted view was already contiguous, e.g. for size-1 axes).

    Args:
        array: 4-D NumPy array holding the kernel.

    Returns:
        A contiguous `nn.Parameter` with the permuted layout.

    Raises:
        ValueError: If `array` is not 4-dimensional.
    """
    if array.ndim != 4:
        raise ValueError(
            f"conv2d kernel must be 4-D, got shape {tuple(array.shape)}"
        )
    # order="C" + copy=True always yields a fresh, C-contiguous buffer.
    permuted = np.array(np.transpose(array, (3, 2, 0, 1)), order="C", copy=True)
    return nn.Parameter(torch.from_numpy(permuted))


def convert_dense(array: np.ndarray):
    """Convert a 2-D dense kernel into a PyTorch parameter by transposing it.

    The data is always copied, so the returned parameter never shares memory
    with `array` (the previous implementation could alias the input when the
    transposed view was already contiguous, e.g. for a `(1, n)` kernel).

    Args:
        array: 2-D NumPy array of shape `(rows, cols)`.

    Returns:
        A contiguous `nn.Parameter` of shape `(cols, rows)`.

    Raises:
        ValueError: If `array` is not 2-dimensional.
    """
    if array.ndim != 2:
        raise ValueError(
            f"dense kernel must be 2-D, got shape {tuple(array.shape)}"
        )
    # order="C" + copy=True always yields a fresh, C-contiguous buffer.
    transposed = np.array(array.T, order="C", copy=True)
    return nn.Parameter(torch.from_numpy(transposed))


def convert_tensor(array: np.ndarray):
    """Wrap a NumPy array in a PyTorch parameter without changing its layout.

    The data is copied first, so later changes to `array` (or to whatever
    buffer backs it) cannot leak into the returned parameter. The previous
    implementation shared memory with `array` whenever it was contiguous.

    Args:
        array: NumPy array of any shape.

    Returns:
        A contiguous `nn.Parameter` with the same shape, dtype and values.
    """
    # order="C" + copy=True always yields a fresh, C-contiguous buffer.
    owned = np.array(array, order="C", copy=True)
    return nn.Parameter(torch.from_numpy(owned))

In [11]:
def get_weight(name: str, index: int = 0):
    """Fetch one weight of a Keras layer from the loaded `model`.

    Args:
        name: Name of the Keras layer, as accepted by `model.get_layer`.
        index: Position in the layer's `weights` list (defaults to the first).

    Returns:
        The selected weight converted with its `.numpy()` method.
    """
    keras_layer = model.get_layer(name)
    selected = keras_layer.weights[index]
    return selected.numpy()

## Swinv2Embeddings


In [12]:
# Patch embedding: copy the Keras "conv2d" layer into the HF patch projection.
patch_projection = swinv2.swinv2.embeddings.patch_embeddings.projection
patch_projection.weight = convert_conv2d(get_weight("conv2d", 0))  # index 0: weight
patch_projection.bias = convert_tensor(get_weight("conv2d", 1))  # index 1: bias

In [13]:
# Embedding norm: copy the Keras "layer_normalization" layer into the HF embeddings norm.
embedding_norm = swinv2.swinv2.embeddings.norm
embedding_norm.weight = convert_tensor(get_weight("layer_normalization", 0))
embedding_norm.bias = convert_tensor(get_weight("layer_normalization", 1))

## Swinv2Encoder


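Each Keras `swin_transformer_block` layer has the following structure (entries in backticks are the ones whose weights are copied below):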

- `norm1`
- `attn`
  - `cpb_mlp`
    - 0: `cpb_mlp/dense_0`
    - 1: cpb_mlp/relu
    - 2: `cpb_mlp_dense_1`
  - `qkv`
  - attn_drop
  - `proj`
  - proj_drop
  - softmax
- drop_path
- `norm2`
- `mlp`
  - `fc1`
  - act: gelu
  - `fc2`
  - drop


In [14]:
def block_num(x):
    """Return the name suffix of the x-th Keras layer of a kind.

    The layer listing shows the first layer unsuffixed ("swin_transformer_block")
    and later ones as "swin_transformer_block_1", "swin_transformer_block_2", ...
    """
    return f"_{x}" if x > 0 else ""


# Running index over every transformer block across all stages; the Keras model
# numbers its blocks globally while the HF model nests them per stage.
layer_num = 0

for i_layer, dim in enumerate(config.depths):
    for i_dim in range(dim):
        block_name = f"swin_transformer_block{block_num(layer_num)}"

        # Resolve both sides once per block instead of once per assignment.
        keras_block = model.get_layer(block_name)
        keras_attn = keras_block.attn
        swinv2_block = swinv2.swinv2.encoder.layers[i_layer].blocks[i_dim]
        hf_self_attn = swinv2_block.attention.self

        # norm1
        swinv2_block.layernorm_before.weight = convert_tensor(
            keras_block.norm1.weights[0].numpy()
        )
        swinv2_block.layernorm_before.bias = convert_tensor(
            keras_block.norm1.weights[1].numpy()
        )

        # attn/cpb_mlp: positions 0 and 2 are the dense layers, position 1 is the relu
        for i, layer in enumerate(keras_attn.cpb_mlp.layers):
            if i not in (0, 2):
                continue
            hf_self_attn.continuous_position_bias_mlp[i].weight = convert_dense(
                layer.weights[0].numpy()
            )
            if i == 0:
                # the second dense doesn't have bias
                hf_self_attn.continuous_position_bias_mlp[i].bias = convert_tensor(
                    layer.weights[1].numpy()
                )

        # attn/qkv: the fused Keras kernel is split into three equal parts along
        # axis 1, assigned in order to query, key and value
        q_kernel, k_kernel, v_kernel = np.split(
            keras_attn.qkv.weights[0].numpy(), 3, axis=1
        )
        hf_self_attn.query.weight = convert_dense(q_kernel)
        hf_self_attn.key.weight = convert_dense(k_kernel)
        hf_self_attn.value.weight = convert_dense(v_kernel)

        # only q_bias and v_bias are read from Keras; no key bias is assigned
        hf_self_attn.query.bias = convert_tensor(keras_attn.q_bias.numpy())
        hf_self_attn.value.bias = convert_tensor(keras_attn.v_bias.numpy())

        # NOTE(review): the HF Swin V2 self-attention module also has a learnable
        # `logit_scale` parameter that this cell never copies — confirm whether the
        # Keras attention layer exposes an equivalent weight that must be transferred.

        # proj (output.dense)
        swinv2_block.attention.output.dense.weight = convert_dense(
            keras_attn.proj.weights[0].numpy()
        )
        swinv2_block.attention.output.dense.bias = convert_tensor(
            keras_attn.proj.weights[1].numpy()
        )

        # norm2
        swinv2_block.layernorm_after.weight = convert_tensor(
            keras_block.norm2.weights[0].numpy()
        )
        swinv2_block.layernorm_after.bias = convert_tensor(
            keras_block.norm2.weights[1].numpy()
        )

        # mlp
        # -> Swinv2Intermediate and Swinv2Output (dense, activation, dense, dropout)
        swinv2_block.intermediate.dense.weight = convert_dense(
            keras_block.mlp.fc1.weights[0].numpy()
        )
        swinv2_block.intermediate.dense.bias = convert_tensor(
            keras_block.mlp.fc1.weights[1].numpy()
        )

        swinv2_block.output.dense.weight = convert_dense(
            keras_block.mlp.fc2.weights[0].numpy()
        )
        swinv2_block.output.dense.bias = convert_tensor(
            keras_block.mlp.fc2.weights[1].numpy()
        )

        layer_num += 1

### Patch merging


In [15]:
# Every stage except the last one has a patch-merging (downsample) step to convert.
# The stage index doubles as the Keras layer index, so no separate counter is needed.
for i_layer in range(len(config.depths) - 1):
    # The layer listing shows "patch_merging" first, then "patch_merging_1", ...
    block_name = f"patch_merging_{i_layer}" if i_layer > 0 else "patch_merging"

    # Resolve both sides once per stage.
    keras_merging = model.get_layer(block_name)
    hf_downsample = swinv2.swinv2.encoder.layers[i_layer].downsample

    # reduction — weights[0] of the Keras layer is taken to be the reduction
    # kernel (TODO confirm the weight order of the Keras patch-merging layer)
    hf_downsample.reduction.weight = convert_dense(keras_merging.weights[0].numpy())

    # norm
    hf_downsample.norm.weight = convert_tensor(keras_merging.norm.weights[0].numpy())
    hf_downsample.norm.bias = convert_tensor(keras_merging.norm.weights[1].numpy())

## LayerNorm


In [16]:
# Final norm: copy the Keras "predictions_norm" layer into the HF backbone layernorm.
final_norm = swinv2.swinv2.layernorm
final_norm.weight = convert_tensor(get_weight("predictions_norm", 0))
final_norm.bias = convert_tensor(get_weight("predictions_norm", 1))

## classifier


In [17]:
# Classification head: copy the Keras "predictions_dense" layer into the HF classifier.
classifier_head = swinv2.classifier
classifier_head.weight = convert_dense(get_weight("predictions_dense", 0))
classifier_head.bias = convert_tensor(get_weight("predictions_dense", 1))

## Save model


In [18]:
# Save the converted model to ./tagger-hf, next to the config written earlier.
swinv2.save_pretrained("./tagger-hf")