In [1]:
import json

In [4]:
def load_tags(path: str):
    with open(path, "r", encoding="utf-8") as file:
        tags_txt = file.read()
    tags = tags_txt.split("\n")
    return tags


general_general = load_tags("./popular-tags/general-general.txt")
general_sensitive = load_tags("./popular-tags/general-sensitive.txt")
general_questionable = load_tags("./popular-tags/general-questionable.txt")
general_explicit = load_tags("./popular-tags/general-explicit.txt")

In [5]:
character = load_tags("./popular-tags/character.txt")
copyright = load_tags("./popular-tags/copyright.txt")

In [6]:
special_tokens = [
    "<|bos|>",
    "<|eos|>",
    "<|pad|>",
    "<|unknown|>",
    "<rating>",
    "</rating>",
    "<copyright>",
    "</copyright>",
    "<character>",
    "</character>",
    "<general>",
    "</general>",
]

reserved_tokens = [f"<|reserved_{i}|>" for i in range(32)]

In [7]:
rating_tags = [
    "rating:general",
    "rating:sensitive",
    "rating:questionable",
    "rating:explicit",
    "rating:sfw",
    "rating:nsfw",
]

In [8]:
all_tags = (
    special_tokens
    + reserved_tokens
    + rating_tags
    + copyright
    + character
    + general_general
    + general_sensitive
    + general_questionable
    + general_explicit
)

In [9]:
len(all_tags)

67998

In [10]:
len(set(all_tags))  # ダブってる！！(収集中にカテゴリが変更された)

67996

In [11]:
unique_all_tags = set(all_tags)
for tag in all_tags:
    if tag in unique_all_tags:
        unique_all_tags.remove(tag)
    elif tag not in unique_all_tags:
        print(tag)

angel (evangelion)
coca-cola


In [12]:
print("coca-cola" in general_general)
print("angel (evangelion)" in character)

True
True


In [13]:
general_general.remove("coca-cola")
character.remove("angel (evangelion)")

In [14]:
all_tags = (
    special_tokens
    + reserved_tokens
    + rating_tags
    + copyright
    + character
    + general_general
    + general_sensitive
    + general_questionable
    + general_explicit
)

In [15]:
len(all_tags)

67996

In [16]:
all_tags[:100]

['<|bos|>',
 '<|eos|>',
 '<|pad|>',
 '<|unknown|>',
 '<rating>',
 '</rating>',
 '<copyright>',
 '</copyright>',
 '<character>',
 '</character>',
 '<general>',
 '</general>',
 '<|reserved_0|>',
 '<|reserved_1|>',
 '<|reserved_2|>',
 '<|reserved_3|>',
 '<|reserved_4|>',
 '<|reserved_5|>',
 '<|reserved_6|>',
 '<|reserved_7|>',
 '<|reserved_8|>',
 '<|reserved_9|>',
 '<|reserved_10|>',
 '<|reserved_11|>',
 '<|reserved_12|>',
 '<|reserved_13|>',
 '<|reserved_14|>',
 '<|reserved_15|>',
 '<|reserved_16|>',
 '<|reserved_17|>',
 '<|reserved_18|>',
 '<|reserved_19|>',
 '<|reserved_20|>',
 '<|reserved_21|>',
 '<|reserved_22|>',
 '<|reserved_23|>',
 '<|reserved_24|>',
 '<|reserved_25|>',
 '<|reserved_26|>',
 '<|reserved_27|>',
 '<|reserved_28|>',
 '<|reserved_29|>',
 '<|reserved_30|>',
 '<|reserved_31|>',
 'rating:general',
 'rating:sensitive',
 'rating:questionable',
 'rating:explicit',
 'rating:sfw',
 'rating:nsfw',
 'kamen rider dcd',
 'idolmaster 2',
 'immaterial and missing power',
 'chikanoko

In [2]:
from tokenizers import Tokenizer, AddedToken, Regex
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Split
from tokenizers.normalizers import Lowercase
from tokenizers.processors import TemplateProcessing

In [17]:
tokenizer = Tokenizer(
    WordLevel(vocab={tag: i for i, tag in enumerate(all_tags)}, unk_token="<|unknown|>")
)

In [18]:
tokenizer.normalizer = Lowercase()

In [19]:
# "1girl, black hair,        cat ears,cowboy shot"
# ↓
# ["1girl", "black hair", "cat ears", "cowboy shot"]
tokenizer.pre_tokenizer = Split(
    pattern=Regex(r",(?:\s)*"), behavior="removed", invert=False
)

In [20]:
tokenizer.add_special_tokens(
    [
        AddedToken(
            content=tag,
        )
        for tag in special_tokens + reserved_tokens
    ]
)

44

In [21]:
PAD_TOKEN = "<|pad|>"
tokenizer.enable_padding(pad_token=PAD_TOKEN)
tokenizer.padding

{'length': None,
 'pad_to_multiple_of': None,
 'pad_id': 0,
 'pad_token': '<|pad|>',
 'pad_type_id': 0,
 'direction': 'right'}

In [22]:
tokenizer.encode(
    "1girl, 2girls, aaa, long hair, very long hair, honkai: star rail, arknights, (arknights)"
).tokens

['1girl',
 '2girls',
 '<|unknown|>',
 'long hair',
 'very long hair',
 'honkai: star rail',
 'arknights',
 '<|unknown|>']

In [26]:
tokenizer.save("tokenizer.json")

## Load in transformers


In [23]:
from transformers import PreTrainedTokenizerFast

In [27]:
pretrained_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")

In [28]:
pretrained_tokenizer.tokenize(
    "<|bos|>1GIRL, 1girl, 2girls, aaa, long hair, very long hair, honkai: star rail, arknights, (arknights), <|eos|><|pad|><|pad|><|pad|>"
)

['<|bos|>',
 '1girl',
 '1girl',
 '2girls',
 '<|unknown|>',
 'long hair',
 'very long hair',
 'honkai: star rail',
 'arknights',
 '<|unknown|>',
 '<|eos|>',
 '<|pad|>',
 '<|pad|>',
 '<|pad|>']

In [29]:
pretrained_tokenizer.bos_token = "<|bos|>"
pretrained_tokenizer.eos_token = "<|eos|>"
pretrained_tokenizer.unk_token = "<|unknown|>"

In [30]:
pretrained_tokenizer.tokenize(
    """ 1girl,
solo,    
looking at viewer
""".strip()
)

['1girl', 'solo', 'looking at viewer']

In [31]:
pretrained_tokenizer.encode_plus("<|bos|>1girl, solo")

{'input_ids': [0, 60268, 40044], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [32]:
# pretrained_tokenizer.push_to_hub("p1atdev/tokenizer_test_1")

In [33]:
pretrained_tokenizer.save_pretrained("./dart-tokenizer-20240219")

('./dart-tokenizer-20240219\\tokenizer_config.json',
 './dart-tokenizer-20240219\\special_tokens_map.json',
 './dart-tokenizer-20240219\\tokenizer.json')

## Create tag category map


In [51]:
general_general_ids = [tokenizer.token_to_id(tag) for tag in general_general]
assert min(general_general_ids) == general_general_ids[0]

general_sensitive_ids = [tokenizer.token_to_id(tag) for tag in general_sensitive]
assert min(general_sensitive_ids) == general_sensitive_ids[0]

general_questionable_ids = [tokenizer.token_to_id(tag) for tag in general_questionable]
assert min(general_questionable_ids) == general_questionable_ids[0]

general_explicit_ids = [tokenizer.token_to_id(tag) for tag in general_explicit]
assert min(general_explicit_ids) == general_explicit_ids[0]

In [52]:
character_ids = [tokenizer.token_to_id(tag) for tag in character]
assert min(character_ids) == character_ids[0]

copyright_ids = [tokenizer.token_to_id(tag) for tag in copyright]
assert min(copyright_ids) == copyright_ids[0]

In [53]:
rating_ids = [tokenizer.token_to_id(tag) for tag in rating_tags]

In [49]:
def load_tag_category():
    with open("dart-config/tag_category.jsonc", "r", encoding="utf-8") as file:
        tag_category = json.loads(file.read())
    return tag_category


def save_tag_category(data):
    with open("dart-config/tag_category.json", "w", encoding="utf-8") as file:
        file.write(json.dumps(data, ensure_ascii=False))


tag_category = load_tag_category()

In [55]:
tag_category["category_to_token_ids"]["1"] = rating_ids
tag_category["category_to_token_ids"]["2"] = copyright_ids
tag_category["category_to_token_ids"]["3"] = character_ids
tag_category["category_to_token_ids"]["4"] = (
    general_general_ids
    + general_sensitive_ids
    + general_questionable_ids
    + general_explicit_ids
)

In [56]:
save_tag_category(tag_category)