In [None]:
import glob
import os
import re
import sys
from pathlib import Path

import pandas as pd
import torch
from google.colab import drive
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from tqdm import tqdm

In [None]:
drive.mount("/content/drive")


%pip install colorgram.py webcolors

%cp /content/drive/MyDrive/VKR_Dataset/logos.zip /content
%cp /content/drive/MyDrive/VKR_Dataset/self_assignments.xlsx /content
%cp /content/drive/MyDrive/VKR_Dataset/*.tsv /content

%unzip logos.zip
%rm -rf /content/__MACOSX

%sudo apt-get install swig3.0
%sudo pip install jamspell
%wget https://github.com/bakwc/JamSpell-models/raw/master/ru.tar.gz
%tar -xvf ru.tar.gz
%pip install deep_translator

### [LEGACY] Генерация описания с использованием BLIP

In [None]:
# installing BLIP requirements
# A failed install is reported instead of being silently ignored; execution continues because
# an already-installed compatible version may still satisfy the import below.
if os.system("pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4") != 0:
    print("warning: installing the BLIP requirements failed", file=sys.stderr)

# Clone and enter the BLIP checkout only once, so the cell can be re-run without trying to
# chdir into a non-existent nested "BLIP/BLIP" directory.
if not (os.path.basename(os.getcwd()) == "BLIP" and os.path.isdir("models")):
    if not os.path.isdir("BLIP"):
        if os.system("git clone https://github.com/salesforce/BLIP") != 0:
            raise RuntimeError("git clone of https://github.com/salesforce/BLIP failed")
    os.chdir("BLIP")

# NOTE: the working directory stays inside BLIP from here on, so relative paths used by later
# cells (e.g. "ru_small.bin", "self_assignments.xlsx") are resolved against it.
# The import below relies on the checkout being the current directory.
from models.blip import blip_decoder

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


def load_image(image_url, image_size, device):
    """Load an image file and convert it into a normalized single-image batch.

    Args:
        image_url: Path (or file object) handed to ``PIL.Image.open``.
        image_size: Side length in pixels of the square the image is resized to.
        device: Target passed to ``Tensor.to`` for the resulting batch.

    Returns:
        The transformed image tensor with a leading batch axis of size 1
        (added by ``unsqueeze(0)``), placed on ``device``.
    """
    # The context manager closes the underlying file; convert() yields an in-memory image
    # that stays usable after the file is closed.
    with Image.open(image_url) as source_image:
        raw_image = source_image.convert("RGB")

    transform = transforms.Compose(
        [
            # The aspect ratio is not preserved: the image is forced into a square.
            transforms.Resize(
                (image_size, image_size), interpolation=InterpolationMode.BICUBIC
            ),
            transforms.ToTensor(),
            # Per-channel mean / std.
            # NOTE(review): these look like the CLIP preprocessing statistics - confirm.
            transforms.Normalize(
                (0.48145466, 0.4578275, 0.40821073),
                (0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )
    return transform(raw_image).unsqueeze(0).to(device)

Загрузка модели BLIP:

In [None]:
# Accumulator filled by the captioning loop: relative image paths and their captions.
image_dict = dict(image=[], text=[])

# Side length of the square input passed both to the model constructor and to load_image.
image_size = 120
model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth"

# Build the captioning decoder from the pretrained checkpoint, move it to the selected
# device and switch it to evaluation mode.
model = blip_decoder(pretrained=model_url, image_size=image_size, vit="large").to(device)
model.eval()

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

  0%|          | 0.00/1.66G [00:00<?, ?B/s]

reshape position embedding from 576 to 49
load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth


In [None]:
# Count the samples in the dataset.
# The same one-level pattern as in the processing loops is used (instead of a recursive
# search) so that the progress-bar total always matches the number of iterated files.
total_samples = len(glob.glob("/content/logos/*/*.png"))

Генерация описаний к датасету при помощи модели BLIP large:

In [None]:
# Caption every logo; gradient tracking is switched off for the whole loop.
with torch.no_grad(), tqdm(total=total_samples) as progress:
    for logo_path in glob.glob("/content/logos/*/*.png"):
        pixels = load_image(image_url=logo_path, image_size=image_size, device=device)
        # beam search
        captions = model.generate(
            pixels, sample=False, num_beams=3, max_length=40, min_length=10
        )
        # Store the image as "<folder>/<file>" (the last two path components).
        relative_name = "/".join(logo_path.split("/")[-2:])
        image_dict["image"].append(relative_name)
        image_dict["text"].append(captions[0])
        progress.update(1)

100%|██████████| 805/805 [04:53<00:00,  2.75it/s]


In [None]:
# Prefix every generated caption with the industry, i.e. the folder part of the image path.
cap = pd.DataFrame(image_dict)
sphere = cap["image"].map(lambda rel_path: rel_path.split("/")[0])
cap = cap.assign(text="a logo of " + sphere + " with " + cap["text"])
cap

Unnamed: 0,image,text
0,cafe restaurant bar/105.png,a logo of cafe restaurant bar with a set of nu...
1,cafe restaurant bar/118.png,a logo of cafe restaurant bar with a poster wi...
2,cafe restaurant bar/68.png,a logo of cafe restaurant bar with a set of or...
3,cafe restaurant bar/114.png,a logo of cafe restaurant bar with a logo for ...
4,cafe restaurant bar/100.png,a logo of cafe restaurant bar with a logo for ...
...,...,...
800,printing house/557.png,a logo of printing house with a set of letters...
801,printing house/562.png,a logo of printing house with a stack of four ...
802,printing house/559.png,a logo of printing house with a poster with a ...
803,printing house/566.png,a logo of printing house with a black and whit...


### Добавление отрасли компании:

In [None]:
# Build the "a logo of <industry>, " caption prefix for every logo, keyed by bare file name.
# A dedicated record list is used instead of re-binding `image_dict`, so the caption
# accumulator read by the BLIP caption table cell is not clobbered when this cell runs.
industry_records = []
for img_url in glob.glob("/content/logos/*/*.png"):
    # The parent folder name is the industry; the last component is the image file name.
    sphere, file_name = img_url.split("/")[-2:]
    industry_records.append({"image": file_name, "text": "a logo of " + sphere + ", "})
# Explicit columns keep the frame mergeable on "image" even when no files were found.
industry = pd.DataFrame(industry_records, columns=["image", "text"])
industry

Unnamed: 0,image,text
0,717.png,"a logo of food delivery,"
1,723.png,"a logo of food delivery,"
2,726.png,"a logo of food delivery,"
3,729.png,"a logo of food delivery,"
4,716.png,"a logo of food delivery,"
...,...,...
800,414.png,"a logo of library book store,"
801,422.png,"a logo of library book store,"
802,428.png,"a logo of library book store,"
803,430.png,"a logo of library book store,"


### Обработка разметки из Toloka:

In [None]:
import jamspell
from deep_translator import GoogleTranslator

Объявление переводчика и орфографического корректора:

In [None]:
# Spell corrector for the Russian annotations.
jsp = jamspell.TSpellCorrector()
# The load result is checked explicitly: wrapping the call in `assert` would drop the
# load itself when Python runs with assertions disabled (-O).
if not jsp.LoadLangModel("ru_small.bin"):
    raise RuntimeError('failed to load the JamSpell language model "ru_small.bin"')

# Russian -> English translator applied after spell correction.
translator = GoogleTranslator(source="ru", target="en")

In [None]:
def preprocess_toloka(toloka_df):
    """Turn a raw Toloka export into an (image, text) table with English descriptions.

    The "INPUT:image" / "OUTPUT:result" columns are renamed to "image" / "text", every
    text is spell-corrected with the module-level ``jsp`` and translated with the
    module-level ``translator``, double quotes are removed from the result, and the
    image reference is reduced to its last "/"-separated component.

    Args:
        toloka_df: DataFrame holding at least the "INPUT:image" and "OUTPUT:result" columns.

    Returns:
        A new DataFrame with the columns "image" and "text"; the argument is not modified.
    """
    renamed = toloka_df.rename(
        {"INPUT:image": "image", "OUTPUT:result": "text"}, axis=1
    )
    # .copy() detaches the two-column selection from the source frame; assigning into the
    # bare selection is what triggers pandas' SettingWithCopyWarning.
    markup = renamed[["image", "text"]].copy()
    markup["text"] = markup["text"].apply(
        lambda s: translator.translate(jsp.FixFragment(s))
    )
    markup["text"] = markup["text"].apply(lambda s: s.replace('"', ""))
    markup["image"] = markup["image"].apply(lambda s: s.split("/")[-1])
    return markup

In [None]:
# assignments - files with the annotation results exported from Toloka
assignments = list(glob.glob("/content/*.tsv"))

# Read each tab-separated export and normalize it right away.
toloka_dfs = [
    preprocess_toloka(pd.read_csv(tsv_path, sep="\t")) for tsv_path in assignments
]

# Stack all exports into a single table with a fresh 0..n-1 index.
toloka_df = pd.concat(toloka_dfs).reset_index(drop=True)
toloka_df

Unnamed: 0,image,text
0,530.png,"A cozy corner where there is an armchair, and ..."
1,531.png,A futuristic figure in gold and brown hues wit...
2,525.png,A cubic representation of an armchair with a d...
3,519.png,"A golden-colored rhombus in the center, in whi..."
4,533.png,soft stool with legs in yellow and its shades ...
...,...,...
623,487.png,"Flowers are depicted on a black background, at..."
624,486.png,An image in the form of a window is drawn on a...
625,492.png,In the center is an image of a girl with long ...
626,479.png,"In the middle there is an inscription, on the ..."


Добавление описаний с Толоки к общим описаниям:

In [None]:
# Join the industry prefix (text_x) with the Toloka description (text_y) per image and
# keep only the combined caption.
toloka_full_markup = (
    industry.merge(toloka_df, how="inner", on="image")
    .assign(text=lambda merged: merged["text_x"] + merged["text_y"])
    .loc[:, ["image", "text"]]
)
toloka_full_markup

Unnamed: 0,image,text
0,717.png,"a logo of food delivery, black crossed fork an..."
1,723.png,"a logo of food delivery, A circle is depicted,..."
2,726.png,"a logo of food delivery, blue hexagon as a sym..."
3,729.png,"a logo of food delivery, An exclamation mark w..."
4,716.png,"a logo of food delivery, triangular piece of f..."
...,...,...
623,414.png,"a logo of library book store, A wavy straight ..."
624,422.png,"a logo of library book store, Opened book. Beh..."
625,428.png,"a logo of library book store, A book is open o..."
626,430.png,"a logo of library book store, In the middle th..."


Загрузка собственной разметки и объединение с разметкой с Толоки:

In [None]:
# Load the self-made annotations and keep only the image reference and the caption.
# .copy() makes the two-column selection an independent frame, so the assignment below
# does not write into a slice of the loaded sheet (SettingWithCopyWarning).
self_full_markup = pd.read_excel("self_assignments.xlsx")[["image", "text"]].copy()
# Reduce the image reference to the bare file name (last "/"-separated component).
self_full_markup["image"] = self_full_markup["image"].apply(lambda s: s.split("/")[-1])
self_full_markup

Unnamed: 0,image,text
0,105.png,"a logo of cafe restaurant bar with spoon, knif..."
1,118.png,a logo of cafe restaurant bar with an illustra...
2,68.png,a logo of cafe restaurant bar with beautiful p...
3,114.png,a logo of cafe restaurant bar with the for and...
4,100.png,a logo of cafe restaurant bar pizzeria with a ...
...,...,...
169,569.png,a logo of media news with A square with a cont...
170,586.png,a logo of media news with A circle with the le...
171,573.png,a logo of media news with A square with a roun...
172,574.png,a logo of media news with A circle with a tria...


In [None]:
# Stack the self-made and the Toloka annotations under a fresh index and persist the result.
full_markup = pd.concat([self_full_markup, toloka_full_markup], ignore_index=True)
full_markup.to_csv("logo_markup_total.csv", index=False)

### Извлечение цвета

In [None]:
import colorgram
from scipy.spatial import KDTree
from webcolors import (
    CSS3_HEX_TO_NAMES,
    hex_to_rgb,
)

Функция для конвертации RGB представления цвета в текстовое с использованием базы цветов CSS3 и алгоритма KDTree:

In [None]:
# Lazily built (names, KDTree) pair shared by every call; None until first use.
_CSS3_COLOR_INDEX = None


def _build_css3_color_index():
    """Return the CSS3 colour names and a KDTree over their RGB triples (same order)."""
    names = []
    rgb_values = []
    for color_hex, color_name in CSS3_HEX_TO_NAMES.items():
        names.append(color_name)
        rgb_values.append(hex_to_rgb(color_hex))
    return names, KDTree(rgb_values)


def convert_rgb_to_names(rgb_tuple):
    """Return the CSS3 colour name closest to ``rgb_tuple``.

    Closeness is the nearest-neighbour result of a KDTree built over the RGB values of
    all entries in ``CSS3_HEX_TO_NAMES``.

    Args:
        rgb_tuple: Sequence of three channel values (r, g, b).

    Returns:
        The name of the nearest CSS3 colour as a string.
    """
    global _CSS3_COLOR_INDEX
    # The colour table never changes between calls, so the tree is built only once
    # instead of on every lookup.
    if _CSS3_COLOR_INDEX is None:
        _CSS3_COLOR_INDEX = _build_css3_color_index()
    names, kdt_db = _CSS3_COLOR_INDEX
    _distance, index = kdt_db.query(rgb_tuple)
    return names[index]

Для каждого изображения извлекаем три основных цвета и переводим их из RGB-представления в текстовые названия. Для фона оставляем один цвет, а для переднего плана — два:

In [None]:
colors_data = {"image": [], "background": [], "foreground": []}
for img_url in tqdm(glob.glob("/content/logos/*/*.png")):
    # Ask for three colours.
    # NOTE(review): colorgram may return fewer colours than requested for images with a
    # very small palette - confirm; the code below tolerates a shorter list.
    colors = colorgram.extract(img_url, 3)
    color_names = [
        convert_rgb_to_names((color.rgb[0], color.rgb[1], color.rgb[2]))
        for color in colors[:3]
    ]

    # The first extracted colour is treated as the background
    # (presumably the most dominant one - verify against colorgram's ordering).
    background = color_names[0]
    # Up to two further colours describe the foreground; an image that yields a single
    # colour reuses it instead of failing with an IndexError.
    foreground = color_names[1:3] or [background]

    # Append only after all values are known so the three lists always stay aligned.
    colors_data["image"].append(img_url.split("/")[-1])
    colors_data["background"].append(background)
    colors_data["foreground"].append(", ".join(foreground))

100%|██████████| 805/805 [25:58<00:00,  1.94s/it]


In [None]:
colors_df = pd.DataFrame(colors_data)

# NOTE: `full_markup` is re-bound to its own merge result, so running this cell a second
# time appends the colour and style suffixes again.
full_markup = full_markup.merge(colors_df, how="inner", on="image")

# Caption layout: "<text>, <bg> background, <fg> foreground, minimalism, modern".
full_markup["text"] = (
    full_markup["text"]
    + ", "
    + full_markup["background"]
    + " background"
    + ", "
    + full_markup["foreground"]
    + " foreground"
    + ", minimalism, modern"
)
full_markup = full_markup.loc[:, ["image", "text"]]

Создадим датасет для загрузки в HF Hub:

In [None]:
# Independent copy of the markup whose image column is renamed to "file_name".
hf_dataset = full_markup.rename(columns={"image": "file_name"})

In [None]:
# Repair rows whose "file_name" cell holds a whole fused CSV line ('<name>.png,<text>')
# and leave well-formed rows untouched. Previously every row was split unconditionally,
# which raised IndexError and doubled the ".png" suffix for well-formed rows.
doubled_quotes = re.compile(r'(["])[\s+]?(\1)+')
cleaned_names = [doubled_quotes.sub("", name) for name in hf_dataset['file_name']]
# partition() treats '.png,' literally and splits at the first occurrence only.
name_parts = [name.partition('.png,') for name in cleaned_names]

hf_dataset['text'] = [
    tail if separator else text
    for (_head, separator, tail), text in zip(name_parts, hf_dataset['text'])
]
hf_dataset['file_name'] = [
    head + ".png" if separator else head for head, separator, _tail in name_parts
]

# NOTE(review): absolute, machine-specific output path; a later cell reads the relative
# 'logos_all/metadata.csv' - confirm both refer to the same directory.
hf_dataset.to_csv('/Users/ivsidorov1/Desktop/logos_all/metadata.csv', index=False)

In [None]:
from datasets import load_dataset
from dotenv import load_dotenv


# Tokens are read from the environment (optionally populated from a .env file).
load_dotenv()
read_token = os.environ["READ_HF_TOKEN"]
write_token = os.environ["WRITE_HF_TOKEN"]

dataset = load_dataset("logos_all", split="train")
# The write token is handed straight to push_to_hub. The former
# os.system("!huggingface-cli login ...") call is dropped: "!" is notebook syntax, not a
# shell command prefix, so the login never ran, and it exposed the token on a command line.
dataset.push_to_hub("eewwann/logo-dataset", private=True, token=write_token)

Переведем метаданные в формат для дообучения с использованием алгоритма LoHa:

In [None]:
def row_to_txt(row):
    """Write the caption of one metadata row to ``loha_dataset/<image name>.txt``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys "file_name"
            (image file name) and "text" (caption string).
    """
    # splitext drops only the final extension, so dotted names such as "v1.2.png" keep
    # their full stem instead of being cut at the first dot.
    file_name = os.path.splitext(row["file_name"])[0]
    text = row["text"]
    target_path = os.path.join("loha_dataset", f"{file_name}.txt")
    # Create the output directory on demand so the first write does not fail.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    # Explicit UTF-8 keeps the output independent of the platform's default encoding.
    with open(target_path, "w", encoding="utf-8") as f:
        f.write(text)

In [None]:
# Write one caption text file per metadata row.
df = pd.read_csv('logos_all/metadata.csv')
for row_label, metadata_row in df.iterrows():
    row_to_txt(metadata_row)

### [LEGACY] BLIP с использованием Hugging Face transformers:

Попробуем BLIP large:

In [None]:
# The earlier BLIP-repository section pinned transformers==4.15.0; the Hugging Face BLIP
# classes imported in the next cell are used with this newer pin.
# NOTE(review): the runtime may need a restart for the new version to be picked up - confirm.
%pip install transformers==4.26.1

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration


# The processor prepares image + prompt inputs and decodes generated token ids.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
# Use the GPU when available and fall back to the CPU otherwise, matching the `device`
# selection made for the BLIP-repository model instead of hard-coding "cuda".
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
image_dict_new = {"image": [], "text": []}
# Prompt the caption is conditioned on; identical for every image, so defined once.
text = "a logo of"
for img_url in tqdm(glob.glob("/content/logos/*/*.png")):
    # Close the file handle right after decoding; convert() returns an in-memory image.
    with Image.open(img_url) as source_image:
        raw_image = source_image.convert("RGB")
    # Inputs follow the model to whatever device it lives on instead of a fixed "cuda".
    inputs = processor(raw_image, text, return_tensors="pt").to(model.device)
    out = model.generate(**inputs)
    # Store the image as "<folder>/<file>" (the last two path components).
    image_dict_new["image"].append("/".join(img_url.split("/")[-2:]))
    image_dict_new["text"].append(
        processor.decode(out[0], skip_special_tokens=True)
    )

100%|██████████| 805/805 [05:24<00:00,  2.48it/s]


In [None]:
def _caption_without_prompt(caption):
    """Return the part of ``caption`` after the "a logo of" prompt, whitespace-trimmed."""
    _before, prompt, remainder = caption.partition("a logo of")
    # Captions that do not contain the prompt are kept whole instead of raising IndexError.
    return (remainder if prompt else caption).strip()


cap_new = pd.DataFrame(image_dict_new)
# The industry is the folder part of the stored "<folder>/<file>" path.
sphere = cap_new["image"].apply(lambda s: s.split("/")[0])
# Trimming removes the whitespace left after the prompt, which otherwise ends up as a
# double space after "with" in the final caption.
description = cap_new["text"].apply(_caption_without_prompt)
cap_new["text"] = "a logo of " + sphere + " with " + description
cap_new

Unnamed: 0,image,text
0,cafe restaurant bar/105.png,a logo of cafe restaurant bar with a fork and...
1,cafe restaurant bar/118.png,a logo of cafe restaurant bar with a restaura...
2,cafe restaurant bar/68.png,a logo of cafe restaurant bar with a cocktail...
3,cafe restaurant bar/114.png,a logo of cafe restaurant bar with oysters on...
4,cafe restaurant bar/100.png,a logo of cafe restaurant bar with a pizza wi...
...,...,...
800,printing house/557.png,a logo of printing house with a house with a ...
801,printing house/562.png,a logo of printing house with a printer print...
802,printing house/559.png,a logo of printing house with a printer print...
803,printing house/566.png,a logo of printing house with a computer moni...
