# Tagging dataset

First, you need to run the images through the Automatic1111 interrogation process.

In [1]:
from zoo.images.tasks import Automatic1111
from pathlib import Path
from yo_fluq_ds import *
from PIL import Image, UnidentifiedImageError
from zoo.images.tools import ConvertImage

# Root folder of the sample data; the other folders are derived from it.
basepath = Path('samples')
# Folder that is passed to the interrogation and tag-reading steps.
extraction_folder = basepath.joinpath('extraction')

def interrogate(folder):
    """Interrogate every image under ``folder`` and store the results as JSON.

    Calls ``Automatic1111.run_if_absent()`` first (presumably this starts the
    service when it is not already running -- confirm against its
    implementation), then iterates over all files matching ``**/*.*`` under
    ``folder``. Each file that PIL can open is converted to base64 and sent to
    the ``/sdapi/v1/interrogate`` endpoint once per model (``clip`` and
    ``deepdanbooru``). Each response is written next to the image as
    ``<image_file>.<model>.json``.

    Files that PIL cannot identify as images raise ``UnidentifiedImageError``
    and are skipped silently.

    Args:
        folder: Folder that is searched recursively for image files.
    """
    Automatic1111.run_if_absent()

    for image_file in Query.folder(folder,'**/*.*').feed(list, Query.en, fluq.with_progress_bar()):
        try:
            image = Image.open(image_file)
        except UnidentifiedImageError:
            # Not an image PIL recognises: nothing to interrogate.
            continue
        # Image.open keeps the underlying file open; the context manager closes
        # it once the base64 payload is built, so handles do not pile up while
        # looping over a large folder.
        with image:
            b64 = ConvertImage(image).to_base64()
        for model in ['clip','deepdanbooru']:
            result = Automatic1111.run_request_sync('/sdapi/v1/interrogate', payload=dict(image=b64,model=model))
            FileIO.write_json(result, f'{image_file}.{model}.json')
            
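# Uncomment to run the interrogation; it writes one JSON file per image and model next to the image.
#interrogate(extraction_folder)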

Then, we assemble all the tags from all the interrogations.

In [2]:
from zoo.images.tools.annotator import ImageTagAnnotator

# Read the tags found under the extraction folder into a single table.
# NOTE(review): judging by the preview output, this yields one row per
# (word, image) pair -- confirm against ImageTagAnnotator.read_tags.
df = ImageTagAnnotator.read_tags(extraction_folder)
# Persist the table to 'annotation.parquet', relative to the current working directory.
df.to_parquet('annotation.parquet')
# Preview the first three rows.
df.head(3)

Unnamed: 0,word,interrogation_path,image_path,category,annotation_path,scores
0,1girl,dv\1.png.deepdanbooru.json,dv\1.png,dv,dv\1.png.annotation.json,1
1,belt,dv\1.png.deepdanbooru.json,dv\1.png,dv,dv\1.png.annotation.json,1
2,blush,dv\1.png.deepdanbooru.json,dv\1.png,dv,dv\1.png.annotation.json,1


Remember, your job is to prune all the tags that describe your character.

If you look at the tag statistics, you may get a good idea of what to prune:

In [3]:
# Count how many rows mention each tag, then show the 20 most frequent tags.
tag_counts = df.groupby('word').size()
tag_counts.sort_values(ascending=False).head(20)

word
1girl           11
orange_hair     11
solo            11
orange_eyes     11
tree            10
sky              9
outdoors         9
belt             8
cloud            8
lake             8
night            7
star_\(sky\)     7
navel            7
twintails        7
starry_sky       7
breasts          6
water            6
river            6
moon             6
bush             6
dtype: int64

In [4]:
# Tags to ban; this list is handed to the annotation panel further down.
ban_list = [
    'orange_hair',
    'orange_eyes',
    'twintails',
    'short_hair',
    'medium_breasts',
    'cleavage',
]

You can do more sophisticated analysis here: `ImageTagAnnotator.read_tags` also works in the case of multiple characters. Looking at TF/IDF, for instance, might be useful to find features that only one of the characters has.

Now we are ready to run the annotation panel. We have already applied some annotations, but you may review them.

In [5]:
import os 

# Folder passed to the annotator as its annotation folder; create it if it is missing.
annotation_folder = basepath.joinpath('annotations')
annotation_folder.mkdir(parents=True, exist_ok=True)

# Build the annotation panel and render it as the cell output.
annotator = ImageTagAnnotator(df, extraction_folder, annotation_folder, ban_list, True)
annotator.render()

VBox(children=(HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\x00\x00\x00\x02\x0…