## Load dataset


In [1]:
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import numpy as np

In [2]:
CACHE_DIR = "/huggingface/cache"


DATASET_NAME = "isek-ai/danbooru-tags-2023"

In [3]:
ds = load_dataset(DATASET_NAME, name="all", split="train", cache_dir=CACHE_DIR)
ds

Dataset({
    features: ['id', 'copyright', 'character', 'artist', 'general', 'meta', 'rating', 'score', 'created_at'],
    num_rows: 6574149
})

## Simple filtering


In [4]:
ds = ds.filter(lambda x: x["general"] is not None, batched=False)
ds

Dataset({
    features: ['id', 'copyright', 'character', 'artist', 'general', 'meta', 'rating', 'score', 'created_at'],
    num_rows: 6574005
})

## Create general tag lists by rating


In [5]:
general_general_tags = set()
general_sensitive_tags = set()
general_questionable_tags = set()
general_explicit_tags = set()

general_general_tag_counts = {}
general_sensitive_tag_counts = {}
general_questionable_tag_counts = {}
general_explicit_tag_counts = {}


def load_general_tags(examples):
    for i, general in enumerate(examples["general"]):
        rating = examples["rating"][i]
        for tag in general.split(", "):
            if rating == "g":
                general_general_tags.add(tag)
                if tag not in general_general_tag_counts:
                    general_general_tag_counts[tag] = 1
                else:
                    general_general_tag_counts[tag] += 1
            elif rating == "s":
                general_sensitive_tags.add(tag)
                if tag not in general_sensitive_tag_counts:
                    general_sensitive_tag_counts[tag] = 1
                else:
                    general_sensitive_tag_counts[tag] += 1
            elif rating == "q":
                general_questionable_tags.add(tag)
                if tag not in general_questionable_tag_counts:
                    general_questionable_tag_counts[tag] = 1
                else:
                    general_questionable_tag_counts[tag] += 1
            elif rating == "e":
                general_explicit_tags.add(tag)
                if tag not in general_explicit_tag_counts:
                    general_explicit_tag_counts[tag] = 1
                else:
                    general_explicit_tag_counts[tag] += 1


ds.map(load_general_tags, batched=True)

Map:   0%|          | 0/6574005 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'copyright', 'character', 'artist', 'general', 'meta', 'rating', 'score', 'created_at'],
    num_rows: 6574005
})

In [6]:
general_sensitive_tag_counts

{'1girl': 2740350,
 ':d': 264919,
 'artist name': 130379,
 'barefoot': 164808,
 'black hair': 629488,
 'black skirt': 127145,
 'brown eyes': 400564,
 'butterfly sitting': 540,
 'classroom': 6144,
 'copyright notice': 3932,
 'desk': 15274,
 'feet': 76312,
 'hairband': 220247,
 'indoors': 113642,
 'knees apart feet together': 735,
 'long hair': 1996089,
 'looking at viewer': 1502503,
 'not for sale': 73,
 'on desk': 2499,
 'open mouth': 952020,
 'pleated skirt': 237969,
 'school desk': 6454,
 'school uniform': 358545,
 'serafuku': 151347,
 'short sleeves': 267053,
 'sitting': 452256,
 'sitting on desk': 1705,
 'skirt': 768453,
 'smile': 1328998,
 'solo': 2408602,
 'window': 53248,
 'black leotard': 37189,
 'blue eyes': 802114,
 'breasts': 1499060,
 'bustier': 3250,
 'cleavage': 628136,
 'corset': 21700,
 'demon girl': 33470,
 'gothic': 2313,
 'green hair': 180497,
 'head wings': 27840,
 'huge breasts': 51766,
 'jewelry': 467775,
 'lace': 18503,
 'leotard': 113475,
 'lingerie': 21044,
 'n

In [7]:
# remove too few tags (less than 100)
general_general_tag_counts = {
    k: v for k, v in general_general_tag_counts.items() if v >= 100
}
general_sensitive_tag_counts = {
    k: v for k, v in general_sensitive_tag_counts.items() if v >= 100
}
general_questionable_tag_counts = {
    k: v for k, v in general_questionable_tag_counts.items() if v >= 100
}
general_explicit_tag_counts = {
    k: v for k, v in general_explicit_tag_counts.items() if v >= 100
}

In [8]:
general_sensitive_tag_counts

{'1girl': 2740350,
 ':d': 264919,
 'artist name': 130379,
 'barefoot': 164808,
 'black hair': 629488,
 'black skirt': 127145,
 'brown eyes': 400564,
 'butterfly sitting': 540,
 'classroom': 6144,
 'copyright notice': 3932,
 'desk': 15274,
 'feet': 76312,
 'hairband': 220247,
 'indoors': 113642,
 'knees apart feet together': 735,
 'long hair': 1996089,
 'looking at viewer': 1502503,
 'on desk': 2499,
 'open mouth': 952020,
 'pleated skirt': 237969,
 'school desk': 6454,
 'school uniform': 358545,
 'serafuku': 151347,
 'short sleeves': 267053,
 'sitting': 452256,
 'sitting on desk': 1705,
 'skirt': 768453,
 'smile': 1328998,
 'solo': 2408602,
 'window': 53248,
 'black leotard': 37189,
 'blue eyes': 802114,
 'breasts': 1499060,
 'bustier': 3250,
 'cleavage': 628136,
 'corset': 21700,
 'demon girl': 33470,
 'gothic': 2313,
 'green hair': 180497,
 'head wings': 27840,
 'huge breasts': 51766,
 'jewelry': 467775,
 'lace': 18503,
 'leotard': 113475,
 'lingerie': 21044,
 'necklace': 117862,
 'u

In [8]:
for s_tag in general_sensitive_tags.copy():
    if s_tag not in general_sensitive_tag_counts:
        general_sensitive_tags.remove(s_tag)
    elif s_tag in general_general_tags:
        general_sensitive_tags.remove(s_tag)

general_sensitive_tags

{'adhesive bra',
 'adjusting bra',
 'adjusting leotard',
 'adjusting panties',
 'adjusting swimsuit',
 'alternate color school swimsuit',
 'aqua bra',
 'aqua panties',
 'aqua sarong',
 'arena (company)',
 'areolae',
 'armpit focus',
 'artificial vagina',
 'artoria pendragon (lancer alter) (royal icing) (fate) (cosplay)',
 'ass grab',
 'ass support',
 'backboob',
 'bdsm',
 'bikini bottom pull',
 'bikini bridge',
 'bikini day',
 'bikini in mouth',
 'bikini pull',
 'bikini tan',
 'bikini top lift',
 'bikini tug',
 'bikini under shorts',
 'black bikini bottom',
 'black fundoshi',
 'bondage',
 'bow swimsuit',
 'bra lift',
 'bra pull',
 'breast poke',
 'bridal lingerie',
 'bridgeless bra',
 'bulge',
 'bursting ass',
 'buruma pull',
 'bust cup',
 'c-string',
 'cameltoe',
 'camisole lift',
 'cammy white (cosplay)',
 'camouflage bikini',
 'card between breasts',
 'cat ear panties',
 'cinderella bust',
 'cleavage reach',
 'condom',
 'condom in mouth',
 'condom packet strip',
 'condom wrapper',
 

In [9]:
for q_tag in general_questionable_tags.copy():
    if q_tag not in general_questionable_tag_counts:
        general_questionable_tags.remove(q_tag)
    elif q_tag in general_general_tags:
        general_questionable_tags.remove(q_tag)
    elif q_tag in general_sensitive_tags:
        general_questionable_tags.remove(q_tag)

general_questionable_tags

{'69',
 'anal beads',
 'anus peek',
 'applying sunscreen',
 'ass on glass',
 'ass tattoo',
 'backless panties',
 'backwards virgin killer sweater',
 'bandaid on pussy',
 'between labia',
 'bike shorts pull',
 'bikini around one leg',
 'bikini bottom aside',
 'black pasties',
 'black pubic hair',
 'blonde pubic hair',
 'bloomers pull',
 'blue nipples',
 'blue pubic hair',
 'bound thighs',
 'box tie',
 'breast milk',
 'breast slip',
 'brown pubic hair',
 'bulge lift',
 'bulge press',
 'bulge to ass',
 'bulges touching',
 'butt plug',
 'censored nipples',
 'chastity belt',
 'cleave gag',
 'cleft of venus',
 'clitoris piercing',
 'clitoris slip',
 'clothed female nude female',
 'clothed masturbation',
 'clothed sex',
 'colored nipples',
 'colored pubic hair',
 'condom in clothes',
 'consensual tentacles',
 'convenient head',
 'covered anus',
 'covered clitoris',
 'covered penis',
 'covered piercing',
 "covering another's breasts",
 'covering nipples',
 'covering one breast',
 'cross pastie

In [10]:
for e_tag in general_explicit_tags.copy():
    if e_tag not in general_explicit_tag_counts:
        general_explicit_tags.remove(e_tag)
    elif e_tag in general_general_tags:
        general_explicit_tags.remove(e_tag)
    elif e_tag in general_sensitive_tags:
        general_explicit_tags.remove(e_tag)
    elif e_tag in general_questionable_tags:
        general_explicit_tags.remove(e_tag)

general_explicit_tags

{'after anal',
 'after cunnilingus',
 'after ejaculation',
 'after fingering',
 'after footjob',
 'after handjob',
 'after insertion',
 'after masturbation',
 'after paizuri',
 'after rape',
 'after vaginal',
 'amazon position',
 'anal ball wear',
 'anal fingering',
 'anal fisting',
 'anal fluid',
 'anal hair',
 'anal hook',
 'anilingus',
 'animal penis',
 'anus cutout',
 'armbinder',
 'armpit sex',
 'ass freckles',
 'ass hair',
 'ass-to-ass penetration',
 'assisted masturbation',
 'autoarousal',
 'autofacial',
 'autofellatio',
 'autopaizuri',
 'bad vulva',
 'ball bra',
 'ball busting',
 'bikini top aside',
 'blue plate special (sex)',
 'bootjob',
 'bouncing ass',
 'bouncing penis',
 'breast pump',
 'broken condom',
 'bukkake',
 'buruma around one leg',
 'buruma aside',
 'buttjob',
 'buttjob over clothes',
 'caressing testicles',
 'catheter',
 'cbt',
 'censored with cum',
 'cervical penetration',
 'cervix',
 'chastity cage',
 'clitoral hood',
 'clitoral stimulation',
 'clitoral stimula

In [11]:
print(len(general_general_tags))
print(len(general_sensitive_tag_counts))
print(len(general_questionable_tags))
print(len(general_explicit_tags))

51457
12114
249
434


In [12]:
(
    len(general_general_tags)
    + len(general_sensitive_tag_counts)
    + len(general_questionable_tags)
    + len(general_explicit_tags)
)

64254

## Create copyright/character tag list


In [13]:
copyright_tags = set()
character_tags = set()

copyright_tag_counts = {}
character_tag_counts = {}


def load_copyright_and_character_tags(examples):
    for i, copyright in enumerate(examples["copyright"]):
        if copyright is None:
            copyright = "original"
        for tag in copyright.split(", "):
            copyright_tags.add(tag)
            if tag not in copyright_tag_counts:
                copyright_tag_counts[tag] = 1
            else:
                copyright_tag_counts[tag] += 1
    for i, character in enumerate(examples["character"]):
        if character is None:
            continue
        for tag in character.split(", "):
            character_tags.add(tag)
            if tag not in character_tag_counts:
                character_tag_counts[tag] = 1
            else:
                character_tag_counts[tag] += 1


ds.map(load_copyright_and_character_tags, batched=True)

Map:   0%|          | 0/6574005 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'copyright', 'character', 'artist', 'general', 'meta', 'rating', 'score', 'created_at'],
    num_rows: 6574005
})

In [15]:
copyright_tag_counts

{'sora no iro mizu no iro': 130,
 'original': 1007957,
 'bastard!!': 192,
 'dragon quest': 10925,
 'dragon quest v': 1624,
 'slayers': 1126,
 'milk junkies': 70,
 'milk junkies 2': 21,
 'nhk (broadcaster)': 96,
 'shinobi hanafuda': 9,
 'yotsubato!': 1298,
 'kamichu!': 260,
 'fate/stay night': 34771,
 'fate (series)': 277240,
 'love hina': 614,
 'yurine ~onee-sama ga oshiete kureta~': 19,
 'ragnarok online': 10064,
 'vampire (game)': 7941,
 'maria-sama ga miteru': 2809,
 'kinnikuman': 287,
 'street fighter': 20016,
 'gunslinger girl': 456,
 'idolmaster': 191811,
 'idolmaster (classic)': 33476,
 'idolmaster 1': 3646,
 'apple inc.': 767,
 'bleach': 8660,
 'fate/hollow ataraxia': 3542,
 'kanon': 2387,
 'shichinin no online gamers': 55,
 'betterman': 59,
 'future gpx cyber formula': 72,
 'yuusha ou gaogaigar': 404,
 'yuusha ou gaogaigar final': 151,
 'yuusha series': 664,
 'yuusha tokkyuu might gaine': 51,
 'disgaea': 4785,
 'makai senki disgaea': 1177,
 'gagraphic': 208,
 'futabu': 83,
 'o

In [16]:
# remove too few tags (less than 100)
copyright_tag_counts = {k: v for k, v in copyright_tag_counts.items() if v >= 100}
character_tag_counts = {k: v for k, v in character_tag_counts.items() if v >= 100}

In [17]:
copyright_tag_counts

{'sora no iro mizu no iro': 130,
 'original': 1007957,
 'bastard!!': 192,
 'dragon quest': 10925,
 'dragon quest v': 1624,
 'slayers': 1126,
 'yotsubato!': 1298,
 'kamichu!': 260,
 'fate/stay night': 34771,
 'fate (series)': 277240,
 'love hina': 614,
 'ragnarok online': 10064,
 'vampire (game)': 7941,
 'maria-sama ga miteru': 2809,
 'kinnikuman': 287,
 'street fighter': 20016,
 'gunslinger girl': 456,
 'idolmaster': 191811,
 'idolmaster (classic)': 33476,
 'idolmaster 1': 3646,
 'apple inc.': 767,
 'bleach': 8660,
 'fate/hollow ataraxia': 3542,
 'kanon': 2387,
 'yuusha ou gaogaigar': 404,
 'yuusha ou gaogaigar final': 151,
 'yuusha series': 664,
 'disgaea': 4785,
 'makai senki disgaea': 1177,
 'gagraphic': 208,
 'onegai teacher': 399,
 'real life': 6453,
 'makai tenshi djibril': 283,
 'giant robo': 181,
 'final fantasy': 56363,
 'final fantasy x': 1831,
 'final fantasy x-2': 483,
 'tsukuyomi moonphase': 309,
 'yami to boushi to hon no tabibito': 340,
 'top wo nerae!': 436,
 'xenogears

In [18]:
copyright_tags = [tag for tag in copyright_tags if tag in copyright_tag_counts]
character_tags = [tag for tag in character_tags if tag in character_tag_counts]

In [19]:
len(copyright_tags)

3439

In [20]:
len(character_tags)

12171

## Save tags


In [21]:
with open("./general-general.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(general_general_tags))
with open("./general-sensitive.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(general_sensitive_tags))
with open("./general-questionable.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(general_questionable_tags))
with open("./general-explicit.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(general_explicit_tags))
with open("./copyright.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(copyright_tags))
with open("./character.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(character_tags))